Post by Oliver S.
I measured the context-switch from one thread to another on the same CPU
to cost about 800 clock-cycles on my uniprocessor Athlon-XP when a thread
chooses to relinquish the execution-resources by calling WFSO
(WaitForSingleObject) and there's another thread which is eligible to be
re-started immediately. It would be interesting to know what timing an
SMP-system has on this, especially because the pools of runnable threads
are managed globally and the cache-lines of the management-data of these
pools are moved between the CPUs, so that this time might vary depending
on the CPU from which a thread was put into the runnable-pool on a
NUMA-system.
As I don't own an SMP-system, I helped myself to some worst-case numbers
for cases where the cache doesn't hold any OS-specific contents by simply
polluting the caches myself in several variations. These are the numbers
for my uniprocessor Athlon-XP (1.4 GHz, SiS745-chipset):
no cache-pollution: ~750 clock cycles
clean pollution of the l1-cache: ~750 clock cycles
dirty pollution of the l1-cache: ~1330 clock cycles
clean pollution of the l2-cache: ~750 clock cycles
dirty pollution of the l2-cache: ~2760 clock cycles
As "clean pollution" I consider overwriting the cachelines of the cache
with content from higher memory-levels (l2-cache or memory) without causing
any cacheline to become "dirty", i.e. to need writing back to memory (I'm
aware that write-through-caches never become dirty). Dirty pollution simply
overwrites the cachelines so a cache-level becomes completely dirty (assuming
my XP-OS uses proper page-colouring - but without that, ***@home wouldn't
run competitively under XP *g*).
This is my current testing-code (the only problem I see with this code is
that there might always be some other threads in the run queue between
the two threads flipping each other; the best would be to hand the whole
task to the OS-initialization *g*):
#include <windows.h>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <memory>
#include "../common/xstddef.h"
#include "../common/Win32/xhandle.h"
// Size in bytes of the buffer each thread touches after every wake-up
// (64 KiB). main() allocates a buffer of this size and hands it to both threads.
std::size_t const CACHE_POLLUTION_SIZE = 64 * 1024;
// true:  the buffer is overwritten with std::memset ("dirty" pollution).
// false: the buffer is only passed to memrd (read-only, "clean" pollution).
bool const DO_DIRTY_CACHE_POLLUTION = true;
// Returns the value produced by the RDTSC instruction (defined below).
DWORDLONG __fastcall GetTSC();
// Read-only pass over `count` bytes at `dest`; nothing is written (defined below).
void __fastcall memrd( void *dest, std::size_t count );
// Thread entry point; lpvThreadParam must point to a ThreadInfo.
DWORD WINAPI FlippingThread( LPVOID lpvThreadParam );
// Per-thread parameter block handed to FlippingThread.
// main() fills one instance per thread; the three pointer members of both
// instances refer to the same variables in main(), so they are shared state.
struct ThreadInfo
{
// Buffer the thread touches after each wake-up to pollute the cache.
void *pvCacheTrashingBuffer;
// Size of that buffer in bytes.
std::size_t szCacheTrashingBuffer;
// true: overwrite the buffer (std::memset); false: only read it (memrd).
bool fDoDirtyCachePollution;
// Iterations this thread still has to run; post-decremented at the top of
// every loop pass, the thread exits when it reads 0.
LONG volatile lRemainingTests;
// Shared flag: TRUE while *pdwlPrevTick holds a timestamp that may be used.
BOOL volatile *pfPrevTickIntact;
// Shared timestamp stored by a thread after it has signalled its partner.
DWORDLONG volatile *pdwlPrevTick;
// Shared minimum of all tick differences measured so far.
DWORDLONG volatile *pdwlFastestContextSwitch;
// Auto-reset event this thread blocks on.
HANDLE hEvtWaitFor;
// Event this thread sets to wake its partner.
HANDLE hEvtWakeUpOtherThread;
};
void __cdecl main()
{
using namespace ns_OClasses;
using namespace ns_OClasses::ns_Win32;
std::size_t szCacheTrashingBuffer;
void *pvCacheTrashingBuffer;
bool fDoDirtyCachePollution;
BOOL volatile fPrevTickIntact;
DWORDLONG volatile dwlPrevTick;
DWORD dwTests;
DWORDLONG volatile dwlFastestContextSwitch;
XHANDLE ahEvtFlippingWaitFor[2];
ThreadInfo atiFlippingThreads[2];
XHANDLE ahThrFlippingThreads[2];
szCacheTrashingBuffer = ::CACHE_POLLUTION_SIZE;
pvCacheTrashingBuffer = ::VirtualAlloc( NULL, szCacheTrashingBuffer, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE );
std::memset( pvCacheTrashingBuffer, 0, szCacheTrashingBuffer );
fDoDirtyCachePollution = ::DO_DIRTY_CACHE_POLLUTION;
fPrevTickIntact = FALSE;
dwlFastestContextSwitch = (DWORDLONG)(LONGLONG)-1;
dwTests = 100000;
ahEvtFlippingWaitFor[0].h = ::CreateEvent( NULL, FALSE, FALSE, NULL );
ahEvtFlippingWaitFor[1].h = ::CreateEvent( NULL, FALSE, FALSE, NULL );
atiFlippingThreads[0].pvCacheTrashingBuffer = pvCacheTrashingBuffer;
atiFlippingThreads[0].szCacheTrashingBuffer = szCacheTrashingBuffer;
atiFlippingThreads[0].fDoDirtyCachePollution = fDoDirtyCachePollution;
atiFlippingThreads[0].lRemainingTests = dwTests - (dwTests / 2);
atiFlippingThreads[0].pfPrevTickIntact = &fPrevTickIntact;
atiFlippingThreads[0].pdwlPrevTick = &dwlPrevTick;
atiFlippingThreads[0].pdwlFastestContextSwitch = &dwlFastestContextSwitch;
atiFlippingThreads[0].hEvtWaitFor = ahEvtFlippingWaitFor[0].h;
atiFlippingThreads[0].hEvtWakeUpOtherThread = ahEvtFlippingWaitFor[1].h;
atiFlippingThreads[1].pvCacheTrashingBuffer = pvCacheTrashingBuffer;
atiFlippingThreads[1].szCacheTrashingBuffer = szCacheTrashingBuffer;
atiFlippingThreads[1].fDoDirtyCachePollution = fDoDirtyCachePollution;
atiFlippingThreads[1].lRemainingTests = dwTests / 2;
atiFlippingThreads[1].pfPrevTickIntact = &fPrevTickIntact;
atiFlippingThreads[1].pdwlPrevTick = &dwlPrevTick;
atiFlippingThreads[1].pdwlFastestContextSwitch = &dwlFastestContextSwitch;
atiFlippingThreads[1].hEvtWaitFor = ahEvtFlippingWaitFor[1].h;
atiFlippingThreads[1].hEvtWakeUpOtherThread = ahEvtFlippingWaitFor[0].h;
ahThrFlippingThreads[0].h = ::CreateThread( NULL, 0, FlippingThread, &atiFlippingThreads[0], 0, NULL );
ahThrFlippingThreads[1].h = ::CreateThread( NULL, 0, FlippingThread, &atiFlippingThreads[1], 0, NULL );
::SetThreadAffinityMask( ahThrFlippingThreads[0].h, 1 );
::SetThreadAffinityMask( ahThrFlippingThreads[1].h, 1 );
::SetEvent( ahEvtFlippingWaitFor[0].h );
::WaitForMultipleObjects( 2, (HANDLE *)ahThrFlippingThreads, TRUE, INFINITE );
}
// Thread body of one partner in the two-thread ping-pong.
//
// lpvThreadParam points to this thread's ThreadInfo. For each remaining test
// the thread waits for its event, folds the tick difference since the
// partner's stored timestamp into the shared minimum (only if that timestamp
// is flagged valid), touches the pollution buffer, wakes the partner and
// stores a new timestamp. Always returns 0.
DWORD WINAPI FlippingThread( LPVOID lpvThreadParam )
{
    ThreadInfo *const pInfo = static_cast<ThreadInfo *>( lpvThreadParam );

    // Post-decrement: the loop ends on the pass that reads 0.
    while( pInfo->lRemainingTests-- != 0 )
    {
        ::WaitForSingleObject( pInfo->hEvtWaitFor, INFINITE );

        // Only use the stored timestamp if its writer marked it complete.
        if( *pInfo->pfPrevTickIntact )
        {
            DWORDLONG const dwlElapsed = (GetTSC() - *pInfo->pdwlPrevTick) & 0xFFFFFFFFFFFFu;
            if( dwlElapsed < *pInfo->pdwlFastestContextSwitch )
                *pInfo->pdwlFastestContextSwitch = dwlElapsed;
        }

        // Pollute the cache: read-only pass or overwrite, as configured.
        if( !pInfo->fDoDirtyCachePollution )
            memrd( pInfo->pvCacheTrashingBuffer, pInfo->szCacheTrashingBuffer );
        else
            std::memset( pInfo->pvCacheTrashingBuffer, 0, pInfo->szCacheTrashingBuffer );

        ::SetEvent( pInfo->hEvtWakeUpOtherThread );

        // Publish a fresh timestamp: invalidate, store, then re-validate,
        // with a fence between the steps.
        *pInfo->pfPrevTickIntact = FALSE;
        ns_OClasses::load_store_fence();
        *pInfo->pdwlPrevTick = GetTSC();
        ns_OClasses::load_store_fence();
        *pInfo->pfPrevTickIntact = TRUE;
    }

    return 0;
}
// Returns the processor's time-stamp counter as produced by RDTSC.
//
// Declared naked, so the compiler emits no prologue/epilogue: the body is
// exactly the two instructions below. RDTSC leaves its 64-bit result in
// EDX:EAX, and the function returns with that register pair untouched as its
// DWORDLONG result (32-bit x86 MSVC inline assembly).
// NOTE(review): RDTSC is not preceded by a serializing instruction here -
// confirm that out-of-order execution is acceptable for this measurement.
__declspec(naked)
DWORDLONG __fastcall GetTSC()
{
__asm
{
rdtsc
ret
}
}
// Reads `count` bytes starting at `dest`, one byte at a time, without
// writing anything ("clean" cache pollution).
//
// Naked __fastcall function for 32-bit x86 MSVC: `dest` arrives in ECX and
// `count` in EDX, both arguments are passed in registers, so a plain RET is
// sufficient. EAX is used as a scratch register.
//
// The pointer is advanced after every load so the whole buffer is actually
// touched, and the loop runs exactly `count` times (none when count is 0).
__declspec(naked)
void __fastcall memrd( void *dest, std::size_t count )
{
    __asm
    {
        test    edx, edx        // nothing to do for an empty buffer
        jz      done
    readNext:
        mov     al, [ecx]       // touch the current byte
        add     ecx, 1          // advance to the next byte
        sub     edx, 1          // one byte fewer remaining
        jnz     readNext
    done:
        ret
    }
}