Code Search for Developers
 
 
  

GSLocalMemory.cpp from guliverkli at Krugle


Show GSLocalMemory.cpp syntax highlighted

/* 
 *	Copyright (C) 2003-2005 Gabest
 *	http://www.gabest.org
 *
 *  This Program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *   
 *  This Program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *   
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *  http://www.gnu.org/copyleft/gpl.html
 *
 *	Special Notes: 
 *
 *	Based on Page.c from GSSoft
 *	Copyright (C) 2002-2004 GSsoft Team
 *
 */
 
#include "StdAfx.h"
#include "GSLocalMemory.h"
#include "x86.h"

//

static class cdscd
{
	CString toBin(UINT64 n, int bits)
	{
		CString str;
		while(bits-- > 0) str += n&(1i64<<bits) ? '1' : '0';
		return str;
	}
public:

	cdscd()
	{
		return;
/*
		union
		{
			struct {DWORD mant:23; DWORD exp:8; DWORD sign:1;} fi;
			DWORD i;
			float f;
		} nf, nf2;

		union
		{
			struct {UINT64 mant:52; UINT64 exp:11; UINT64 sign:1;} di;
			UINT64 i;
			double d;
		} nd;

		nd.di.mant = 0;
		nd.di.exp = 1023+128;
		nd.di.sign = 0;

		nd.d = pow(2.0, 128);

		TRACE(_T("%016I64x (%I64x:%I64x:%I64x), %f\n"), nd.i, nd.di.sign, nd.di.exp, nd.di.mant, nd.d);
*/
/*
		unsigned int control_word = 0;
		int err;
		
//		nd.d = 0;
//		TRACE(_T("%016I64x (%I64x:%I64x:%I64x), %f\n"), nd.i, nd.di.sign, nd.di.exp, nd.di.mant, nd.d);

//		nf.f = nd.d;
//		TRACE(_T("%08x (%x:%x:%x), %f\n"), nf.i, nf.fi.sign, nf.fi.exp, nf.fi.mant, nf.f);

		nf2.fi.sign = 0;
		nf2.fi.exp = 0x7f;
		nf2.fi.mant = 1;

		//err = _controlfp_s(&control_word, _RC_CHOP, _MCW_RC);

		nf.f = 0;

		for(int i = 0; i < 20; i++)
		{
			nd.d = (double)nf.f + (double)nf2.f;
			TRACE(_T("%016I64x (%I64x:%03I64x:%s), %f\n"), nd.i, nd.di.sign, nd.di.exp, toBin(nd.di.mant, 52), nd.d);
			*((UINT64*)&nd.d) &= ~(1ui64<<(52-24));
			TRACE(_T("%016I64x (%I64x:%03I64x:%s), %f\n"), nd.i, nd.di.sign, nd.di.exp, toBin(nd.di.mant, 52), nd.d);
			nf.f = (float)nd.d;
			TRACE(_T("%08x         (%x:%02x :%s                             ), %f\n"), nf.i, nf.fi.sign, nf.fi.exp, toBin(nf.fi.mant, 23), nf.f);
		}
*/
/*
		GSLocalMemory lm;

		for(int i = 0; i < 1024*1024; i++)
			((DWORD*)lm.GetVM())[i] = rand()*0x12345678;


		GIFRegTEX0 TEX0;
		TEX0.TBP0 = 0;
		TEX0.TBW = 16;
		GIFRegTEXA TEXA;
		TEXA.AEM = 0; // 1

		__declspec(align(32)) static DWORD dst[1024*1024];

		for(int j = 0; j < 10; j++)
		{
			clock_t start = clock();

			for(int i = 0; i < 1000; i++)
			{
				lm.unSwizzleTexture32(CRect(0,0,1024,1024), (BYTE*)dst, 1024*4, TEX0, TEXA);
			}

			clock_t diff = clock() - start;

			CString str;
			str.Format(_T("%d"), diff);
			AfxMessageBox(str);
		}
*/
/*
		__declspec(align(16)) static vertex_t v[100000];
		for(int i = 0; i < countof(v); i++)
		{
			v[i].u = 1.0f*rand() - RAND_MAX/2;
			v[i].v = 1.0f*rand() - RAND_MAX/2;
		}

		for(int j = 0; j < 10; j++)
		{
			clock_t start = clock();

			uvmm_t uvmm;
			for(int i = 0; i < 1000; i++)
				UVMinMax_sse2(countof(v), v, &uvmm);

			clock_t diff = clock() - start;

			CString str;
			str.Format(_T("%d, u %.2f %.2f, v %.2f %.2f"), diff, uvmm.umin, uvmm.umax, uvmm.vmin, uvmm.vmax);
			AfxMessageBox(str);
		}
*/
/*
		GIFRegTEX0 TEX0;
		GIFRegTEXCLUT TEXCLUT;
		TEX0.PSM = PSM_PSMT8;
		TEX0.CLD = 1;
		TEX0.CSA = 0;
		TEX0.CPSM = PSM_PSMCT32; // PSM_PSMCT16S
		TEX0.CBP = 0;
		TEX0.CSM = 0; // 0
		TEXCLUT.CBW = 1;
		TEXCLUT.COU = 0;
		TEXCLUT.COV = 0;

		for(int j = 0; j < 10; j++)
		{
			clock_t start = clock();

			for(int i = 0; i < 10000000; i++)
				lm.WriteCLUT(TEX0, TEXCLUT);

			clock_t diff = clock() - start;

			CString str;
			str.Format(_T("%d"), diff);
			AfxMessageBox(str);
		}
*/
/*
		GIFRegTEXA TEXA;
		TEXA.AEM = 0; // 1
		TEXA.TA0 = 0x7f;
		TEXA.TA1 = 0x80;

		int w = 1024*1024;

		WORD* src = (WORD*)lm.GetVM();
		DWORD* dst = (DWORD*)_aligned_malloc(w*4, 16);

		for(int j = 0; j < 10; j++)
		{
			clock_t start = clock();

			for(int i = 0; i < 1000; i++)
			{
				//Expand16_sse2(src, dst, w, &TEXA);
				Expand16_c(src, dst, w, &TEXA);
			}

			clock_t diff = clock() - start;

			CString str;
			str.Format(_T("%d"), diff);
			AfxMessageBox(str);
		}

		_aligned_free(dst);
*/
	}
} sddscsd;

//

#define ASSERT_BLOCK(r, w, h) \
	ASSERT((r).Width() >= w && (r).Height() >= h && !((r).left&(w-1)) && !((r).top&(h-1)) && !((r).right&(w-1)) && !((r).bottom&(h-1))); \

#if defined(_M_AMD64) || _M_IX86_FP >= 2
#define BLOCK_PREFETCH(mem) \
	_mm_prefetch(&mem[16*0], _MM_HINT_T0); \
	_mm_prefetch(&mem[16*2], _MM_HINT_T0); \
	_mm_prefetch(&mem[16*4], _MM_HINT_T0); \
	_mm_prefetch(&mem[16*6], _MM_HINT_T0); \
	_mm_prefetch(&mem[16*8], _MM_HINT_T0); \
	_mm_prefetch(&mem[16*10], _MM_HINT_T0); \
	_mm_prefetch(&mem[16*12], _MM_HINT_T0); \
	_mm_prefetch(&mem[16*14], _MM_HINT_T0); \

#define BLOCK_PREFETCH_32(x, y, w) {const char* next = (const char*)&m_vm32[blockAddress32(x + (w), y, TEX0.TBP0, TEX0.TBW)]; BLOCK_PREFETCH(next);}
#define BLOCK_PREFETCH_16(x, y, w) {const char* next = (const char*)&m_vm16[blockAddress16(x + (w), y, TEX0.TBP0, TEX0.TBW)]; BLOCK_PREFETCH(next);}
#define BLOCK_PREFETCH_16S(x, y, w) {const char* next = (const char*)&m_vm16[blockAddress16S(x + (w), y, TEX0.TBP0, TEX0.TBW)]; BLOCK_PREFETCH(next);}
#define BLOCK_PREFETCH_8(x, y, w) {const char* next = (const char*)&m_vm8[blockAddress8(x + (w), y, TEX0.TBP0, TEX0.TBW)]; BLOCK_PREFETCH(next);}
#define BLOCK_PREFETCH_4(x, y, w) {const char* next = (const char*)&m_vm8[blockAddress4(x + (w), y, TEX0.TBP0, TEX0.TBW)>>1]; BLOCK_PREFETCH(next);}
#else 
#define BLOCK_PREFETCH_32(x, y, w)
#define BLOCK_PREFETCH_16(x, y, w)
#define BLOCK_PREFETCH_16S(x, y, w)
#define BLOCK_PREFETCH_8(x, y, w)
#define BLOCK_PREFETCH_4(x, y, w)
#endif

#define FOREACH_BLOCK_START(r, w, h, t) \
	for(int y = (r).top; y < (r).bottom; y += (h)) \
	{ 	ASSERT_BLOCK(r, w, h); \
		BYTE* ptr = dst + (y-(r).top)*dstpitch; \
		for(int x = (r).left; x < (r).right; x += (w)) \
		{ \
			BLOCK_PREFETCH_##t##(x + (w), y, w) \

#define FOREACH_BLOCK_END }}

//

DWORD GSLocalMemory::pageOffset32[32][32][64];
DWORD GSLocalMemory::pageOffset32Z[32][32][64];
DWORD GSLocalMemory::pageOffset16[32][64][64];
DWORD GSLocalMemory::pageOffset16S[32][64][64];
DWORD GSLocalMemory::pageOffset16Z[32][64][64];
DWORD GSLocalMemory::pageOffset16SZ[32][64][64];
DWORD GSLocalMemory::pageOffset8[32][64][128];
DWORD GSLocalMemory::pageOffset4[32][128][128];

int GSLocalMemory::rowOffset32[2048];
int GSLocalMemory::rowOffset32Z[2048];
int GSLocalMemory::rowOffset16[2048];
int GSLocalMemory::rowOffset16S[2048];
int GSLocalMemory::rowOffset16Z[2048];
int GSLocalMemory::rowOffset16SZ[2048];
int GSLocalMemory::rowOffset8[2][2048];
int GSLocalMemory::rowOffset4[2][2048];

//

DWORD GSLocalMemory::m_xtbl[1024];
DWORD GSLocalMemory::m_ytbl[1024]; 

//

GSLocalMemory::psmtbl_t GSLocalMemory::m_psmtbl[64];

//

GSLocalMemory::GSLocalMemory()
	: m_fCLUTMayBeDirty(true)
{
	int len = 1024*1024*4*2; // *2 for safety...

	m_vm8 = (BYTE*)_aligned_malloc(len, 16);
	memset(m_vm8, 0, len);

	m_pCLUT = (WORD*)_aligned_malloc(256*2*sizeof(WORD)*2, 16);
	m_pCLUT32 = (DWORD*)_aligned_malloc(256*sizeof(DWORD), 16);
	m_pCLUT64 = (UINT64*)_aligned_malloc(256*sizeof(UINT64), 16);

	for(int bp = 0; bp < 32; bp++)
	{
		for(int y = 0; y < 32; y++) for(int x = 0; x < 64; x++)
		{
			pageOffset32[bp][y][x] = pixelAddressOrg32(x, y, bp, 0);
			pageOffset32Z[bp][y][x] = pixelAddressOrg32Z(x, y, bp, 0);
		}

		for(int y = 0; y < 64; y++) for(int x = 0; x < 64; x++) 
		{
			pageOffset16[bp][y][x] = pixelAddressOrg16(x, y, bp, 0);
			pageOffset16S[bp][y][x] = pixelAddressOrg16S(x, y, bp, 0);
			pageOffset16Z[bp][y][x] = pixelAddressOrg16Z(x, y, bp, 0);
			pageOffset16SZ[bp][y][x] = pixelAddressOrg16SZ(x, y, bp, 0);
		}

		for(int y = 0; y < 64; y++) for(int x = 0; x < 128; x++)
		{
			pageOffset8[bp][y][x] = pixelAddressOrg8(x, y, bp, 0);
		}

		for(int y = 0; y < 128; y++) for(int x = 0; x < 128; x++)
		{
			pageOffset4[bp][y][x] = pixelAddressOrg4(x, y, bp, 0);
		}
	}

	{
		for(int x = 0; x < countof(rowOffset32); x++)
			rowOffset32[x] = (int)pixelAddress32(x, 0, 0, 32) - (int)pixelAddress32(0, 0, 0, 32);

		for(int x = 0; x < countof(rowOffset32Z); x++)
			rowOffset32Z[x] = (int)pixelAddress32Z(x, 0, 0, 32) - (int)pixelAddress32Z(0, 0, 0, 32);

		for(int x = 0; x < countof(rowOffset16); x++)
			rowOffset16[x] = (int)pixelAddress16(x, 0, 0, 32) - (int)pixelAddress16(0, 0, 0, 32);

		for(int x = 0; x < countof(rowOffset16S); x++)
			rowOffset16S[x] = (int)pixelAddress16S(x, 0, 0, 32) - (int)pixelAddress16S(0, 0, 0, 32);

		for(int x = 0; x < countof(rowOffset16Z); x++)
			rowOffset16Z[x] = (int)pixelAddress16Z(x, 0, 0, 32) - (int)pixelAddress16Z(0, 0, 0, 32);

		for(int x = 0; x < countof(rowOffset16SZ); x++)
			rowOffset16SZ[x] = (int)pixelAddress16SZ(x, 0, 0, 32) - (int)pixelAddress16SZ(0, 0, 0, 32);

		for(int x = 0; x < countof(rowOffset8[0]); x++)
			rowOffset8[0][x] = (int)pixelAddress8(x, 0, 0, 32) - (int)pixelAddress8(0, 0, 0, 32),
			rowOffset8[1][x] = (int)pixelAddress8(x, 2, 0, 32) - (int)pixelAddress8(0, 2, 0, 32);

		for(int x = 0; x < countof(rowOffset4[0]); x++)
			rowOffset4[0][x] = (int)pixelAddress4(x, 0, 0, 32) - (int)pixelAddress4(0, 0, 0, 32),
			rowOffset4[1][x] = (int)pixelAddress4(x, 2, 0, 32) - (int)pixelAddress4(0, 2, 0, 32);
	}

	for(int i = 0; i < countof(m_psmtbl); i++)
	{
		m_psmtbl[i].pa = &GSLocalMemory::pixelAddress32;
		m_psmtbl[i].ba = &GSLocalMemory::blockAddress32;
		m_psmtbl[i].pga = &GSLocalMemory::pageAddress32;
		m_psmtbl[i].rp = &GSLocalMemory::readPixel32;
		m_psmtbl[i].rpa = &GSLocalMemory::readPixel32;
		m_psmtbl[i].wp = &GSLocalMemory::writePixel32;
		m_psmtbl[i].wpa = &GSLocalMemory::writePixel32;
		m_psmtbl[i].rt = m_psmtbl[i].rtNP = m_psmtbl[i].rtP = &GSLocalMemory::readTexel32;
		m_psmtbl[i].rta = &GSLocalMemory::readTexel32;
		m_psmtbl[i].wfa = &GSLocalMemory::writePixel32;
		m_psmtbl[i].st = &GSLocalMemory::SwizzleTexture32;
		m_psmtbl[i].ust = m_psmtbl[i].ustP = m_psmtbl[i].ustNP = &GSLocalMemory::unSwizzleTexture32;
		m_psmtbl[i].bpp = m_psmtbl[i].trbpp = 32;
		m_psmtbl[i].pal = 0;
		m_psmtbl[i].bs = CSize(8, 8);
		for(int j = 0; j < 8; j++) m_psmtbl[i].rowOffset[j] = rowOffset32;
	}

	m_psmtbl[PSM_PSMCT16].pa = &GSLocalMemory::pixelAddress16;
	m_psmtbl[PSM_PSMCT16S].pa = &GSLocalMemory::pixelAddress16S;
	m_psmtbl[PSM_PSMT8].pa = &GSLocalMemory::pixelAddress8;
	m_psmtbl[PSM_PSMT4].pa = &GSLocalMemory::pixelAddress4;
	m_psmtbl[PSM_PSMZ32].pa = &GSLocalMemory::pixelAddress32Z;
	m_psmtbl[PSM_PSMZ24].pa = &GSLocalMemory::pixelAddress32Z;
	m_psmtbl[PSM_PSMZ16].pa = &GSLocalMemory::pixelAddress16Z;
	m_psmtbl[PSM_PSMZ16S].pa = &GSLocalMemory::pixelAddress16SZ;

	m_psmtbl[PSM_PSMCT16].ba = &GSLocalMemory::blockAddress16;
	m_psmtbl[PSM_PSMCT16S].ba = &GSLocalMemory::blockAddress16S;
	m_psmtbl[PSM_PSMT8].ba = &GSLocalMemory::blockAddress8;
	m_psmtbl[PSM_PSMT4].ba = &GSLocalMemory::blockAddress4;
	m_psmtbl[PSM_PSMZ32].ba = &GSLocalMemory::blockAddress32Z;
	m_psmtbl[PSM_PSMZ24].ba = &GSLocalMemory::blockAddress32Z;
	m_psmtbl[PSM_PSMZ16].ba = &GSLocalMemory::blockAddress16Z;
	m_psmtbl[PSM_PSMZ16S].ba = &GSLocalMemory::blockAddress16SZ;

	m_psmtbl[PSM_PSMCT16].pga = &GSLocalMemory::pageAddress16;
	m_psmtbl[PSM_PSMCT16S].pga = &GSLocalMemory::pageAddress16;
	m_psmtbl[PSM_PSMZ16].pga = &GSLocalMemory::pageAddress16;
	m_psmtbl[PSM_PSMZ16S].pga = &GSLocalMemory::pageAddress16;
	m_psmtbl[PSM_PSMT8].pga = &GSLocalMemory::pageAddress8;
	m_psmtbl[PSM_PSMT4].pga = &GSLocalMemory::pageAddress4;

	m_psmtbl[PSM_PSMCT24].rp = &GSLocalMemory::readPixel24;
	m_psmtbl[PSM_PSMCT16].rp = &GSLocalMemory::readPixel16;
	m_psmtbl[PSM_PSMCT16S].rp = &GSLocalMemory::readPixel16S;
	m_psmtbl[PSM_PSMT8].rp = &GSLocalMemory::readPixel8;
	m_psmtbl[PSM_PSMT4].rp = &GSLocalMemory::readPixel4;
	m_psmtbl[PSM_PSMT8H].rp = &GSLocalMemory::readPixel8H;
	m_psmtbl[PSM_PSMT4HL].rp = &GSLocalMemory::readPixel4HL;
	m_psmtbl[PSM_PSMT4HH].rp = &GSLocalMemory::readPixel4HH;
	m_psmtbl[PSM_PSMZ32].rp = &GSLocalMemory::readPixel32Z;
	m_psmtbl[PSM_PSMZ24].rp = &GSLocalMemory::readPixel24Z;
	m_psmtbl[PSM_PSMZ16].rp = &GSLocalMemory::readPixel16Z;
	m_psmtbl[PSM_PSMZ16S].rp = &GSLocalMemory::readPixel16SZ;

	m_psmtbl[PSM_PSMCT24].rpa = &GSLocalMemory::readPixel24;
	m_psmtbl[PSM_PSMCT16].rpa = &GSLocalMemory::readPixel16;
	m_psmtbl[PSM_PSMCT16S].rpa = &GSLocalMemory::readPixel16S;
	m_psmtbl[PSM_PSMT8].rpa = &GSLocalMemory::readPixel8;
	m_psmtbl[PSM_PSMT4].rpa = &GSLocalMemory::readPixel4;
	m_psmtbl[PSM_PSMT8H].rpa = &GSLocalMemory::readPixel8H;
	m_psmtbl[PSM_PSMT4HL].rpa = &GSLocalMemory::readPixel4HL;
	m_psmtbl[PSM_PSMT4HH].rpa = &GSLocalMemory::readPixel4HH;
	m_psmtbl[PSM_PSMZ32].rpa = &GSLocalMemory::readPixel32Z;
	m_psmtbl[PSM_PSMZ24].rpa = &GSLocalMemory::readPixel24Z;
	m_psmtbl[PSM_PSMZ16].rpa = &GSLocalMemory::readPixel16Z;
	m_psmtbl[PSM_PSMZ16S].rpa = &GSLocalMemory::readPixel16SZ;

	m_psmtbl[PSM_PSMCT32].wp = &GSLocalMemory::writePixel32;
	m_psmtbl[PSM_PSMCT24].wp = &GSLocalMemory::writePixel24;
	m_psmtbl[PSM_PSMCT16].wp = &GSLocalMemory::writePixel16;
	m_psmtbl[PSM_PSMCT16S].wp = &GSLocalMemory::writePixel16S;
	m_psmtbl[PSM_PSMT8].wp = &GSLocalMemory::writePixel8;
	m_psmtbl[PSM_PSMT4].wp = &GSLocalMemory::writePixel4;
	m_psmtbl[PSM_PSMT8H].wp = &GSLocalMemory::writePixel8H;
	m_psmtbl[PSM_PSMT4HL].wp = &GSLocalMemory::writePixel4HL;
	m_psmtbl[PSM_PSMT4HH].wp = &GSLocalMemory::writePixel4HH;
	m_psmtbl[PSM_PSMZ32].wp = &GSLocalMemory::writePixel32Z;
	m_psmtbl[PSM_PSMZ24].wp = &GSLocalMemory::writePixel24Z;
	m_psmtbl[PSM_PSMZ16].wp = &GSLocalMemory::writePixel16Z;
	m_psmtbl[PSM_PSMZ16S].wp = &GSLocalMemory::writePixel16SZ;

	m_psmtbl[PSM_PSMCT32].wpa = &GSLocalMemory::writePixel32;
	m_psmtbl[PSM_PSMCT24].wpa = &GSLocalMemory::writePixel24;
	m_psmtbl[PSM_PSMCT16].wpa = &GSLocalMemory::writePixel16;
	m_psmtbl[PSM_PSMCT16S].wpa = &GSLocalMemory::writePixel16S;
	m_psmtbl[PSM_PSMT8].wpa = &GSLocalMemory::writePixel8;
	m_psmtbl[PSM_PSMT4].wpa = &GSLocalMemory::writePixel4;
	m_psmtbl[PSM_PSMT8H].wpa = &GSLocalMemory::writePixel8H;
	m_psmtbl[PSM_PSMT4HL].wpa = &GSLocalMemory::writePixel4HL;
	m_psmtbl[PSM_PSMT4HH].wpa = &GSLocalMemory::writePixel4HH;
	m_psmtbl[PSM_PSMZ32].wpa = &GSLocalMemory::writePixel32Z;
	m_psmtbl[PSM_PSMZ24].wpa = &GSLocalMemory::writePixel24Z;
	m_psmtbl[PSM_PSMZ16].wpa = &GSLocalMemory::writePixel16Z;
	m_psmtbl[PSM_PSMZ16S].wpa = &GSLocalMemory::writePixel16SZ;

	m_psmtbl[PSM_PSMCT24].rt = &GSLocalMemory::readTexel24;
	m_psmtbl[PSM_PSMCT16].rt = &GSLocalMemory::readTexel16;
	m_psmtbl[PSM_PSMCT16S].rt = &GSLocalMemory::readTexel16S;
	m_psmtbl[PSM_PSMT8].rt = &GSLocalMemory::readTexel8;
	m_psmtbl[PSM_PSMT4].rt = &GSLocalMemory::readTexel4;
	m_psmtbl[PSM_PSMT8H].rt = &GSLocalMemory::readTexel8H;
	m_psmtbl[PSM_PSMT4HL].rt = &GSLocalMemory::readTexel4HL;
	m_psmtbl[PSM_PSMT4HH].rt = &GSLocalMemory::readTexel4HH;

	m_psmtbl[PSM_PSMCT24].rta = &GSLocalMemory::readTexel24;
	m_psmtbl[PSM_PSMCT16].rta = &GSLocalMemory::readTexel16;
	m_psmtbl[PSM_PSMCT16S].rta = &GSLocalMemory::readTexel16S;
	m_psmtbl[PSM_PSMT8].rta = &GSLocalMemory::readTexel8;
	m_psmtbl[PSM_PSMT4].rta = &GSLocalMemory::readTexel4;
	m_psmtbl[PSM_PSMT8H].rta = &GSLocalMemory::readTexel8H;
	m_psmtbl[PSM_PSMT4HL].rta = &GSLocalMemory::readTexel4HL;
	m_psmtbl[PSM_PSMT4HH].rta = &GSLocalMemory::readTexel4HH;

	m_psmtbl[PSM_PSMCT24].wfa = &GSLocalMemory::writePixel24;
	m_psmtbl[PSM_PSMCT16].wfa = &GSLocalMemory::writeFrame16;
	m_psmtbl[PSM_PSMCT16S].wfa = &GSLocalMemory::writeFrame16S;

	m_psmtbl[PSM_PSMCT16].rtP = &GSLocalMemory::readTexel16P;
	m_psmtbl[PSM_PSMCT16S].rtP = &GSLocalMemory::readTexel16SP;
	m_psmtbl[PSM_PSMT8].rtP = &GSLocalMemory::readTexel8P;
	m_psmtbl[PSM_PSMT4].rtP = &GSLocalMemory::readTexel4P;
	m_psmtbl[PSM_PSMT8H].rtP = &GSLocalMemory::readTexel8HP;
	m_psmtbl[PSM_PSMT4HL].rtP = &GSLocalMemory::readTexel4HLP;
	m_psmtbl[PSM_PSMT4HH].rtP = &GSLocalMemory::readTexel4HHP;

	m_psmtbl[PSM_PSMCT16].rtNP = &GSLocalMemory::readTexel16P;
	m_psmtbl[PSM_PSMCT16S].rtNP = &GSLocalMemory::readTexel16SP;
	m_psmtbl[PSM_PSMT8].rtNP = &GSLocalMemory::readTexel8;
	m_psmtbl[PSM_PSMT4].rtNP = &GSLocalMemory::readTexel4;
	m_psmtbl[PSM_PSMT8H].rtNP = &GSLocalMemory::readTexel8H;
	m_psmtbl[PSM_PSMT4HL].rtNP = &GSLocalMemory::readTexel4HL;
	m_psmtbl[PSM_PSMT4HH].rtNP = &GSLocalMemory::readTexel4HH;

	m_psmtbl[PSM_PSMCT24].st = &GSLocalMemory::SwizzleTexture24;
	m_psmtbl[PSM_PSMCT16].st = &GSLocalMemory::SwizzleTexture16;
	m_psmtbl[PSM_PSMCT16S].st = &GSLocalMemory::SwizzleTexture16S;
	m_psmtbl[PSM_PSMT8].st = &GSLocalMemory::SwizzleTexture8;
	m_psmtbl[PSM_PSMT4].st = &GSLocalMemory::SwizzleTexture4;
	m_psmtbl[PSM_PSMT8H].st = &GSLocalMemory::SwizzleTexture8H;
	m_psmtbl[PSM_PSMT4HL].st = &GSLocalMemory::SwizzleTexture4HL;
	m_psmtbl[PSM_PSMT4HH].st = &GSLocalMemory::SwizzleTexture4HH;

	m_psmtbl[PSM_PSMCT24].ust = &GSLocalMemory::unSwizzleTexture24;
	m_psmtbl[PSM_PSMCT16].ust = &GSLocalMemory::unSwizzleTexture16;
	m_psmtbl[PSM_PSMCT16S].ust = &GSLocalMemory::unSwizzleTexture16S;
	m_psmtbl[PSM_PSMT8].ust = &GSLocalMemory::unSwizzleTexture8;
	m_psmtbl[PSM_PSMT4].ust = &GSLocalMemory::unSwizzleTexture4;
	m_psmtbl[PSM_PSMT8H].ust = &GSLocalMemory::unSwizzleTexture8H;
	m_psmtbl[PSM_PSMT4HL].ust = &GSLocalMemory::unSwizzleTexture4HL;
	m_psmtbl[PSM_PSMT4HH].ust = &GSLocalMemory::unSwizzleTexture4HH;

	m_psmtbl[PSM_PSMCT16].ustP = &GSLocalMemory::unSwizzleTexture16P;
	m_psmtbl[PSM_PSMCT16S].ustP = &GSLocalMemory::unSwizzleTexture16SP;
	m_psmtbl[PSM_PSMT8].ustP = &GSLocalMemory::unSwizzleTexture8P;
	m_psmtbl[PSM_PSMT4].ustP = &GSLocalMemory::unSwizzleTexture4P;
	m_psmtbl[PSM_PSMT8H].ustP = &GSLocalMemory::unSwizzleTexture8HP;
	m_psmtbl[PSM_PSMT4HL].ustP = &GSLocalMemory::unSwizzleTexture4HLP;
	m_psmtbl[PSM_PSMT4HH].ustP = &GSLocalMemory::unSwizzleTexture4HHP;

	m_psmtbl[PSM_PSMCT16].ustNP = &GSLocalMemory::unSwizzleTexture16P;
	m_psmtbl[PSM_PSMCT16S].ustNP = &GSLocalMemory::unSwizzleTexture16SP;
	m_psmtbl[PSM_PSMT8].ustNP = &GSLocalMemory::unSwizzleTexture8NP;
	m_psmtbl[PSM_PSMT4].ustNP = &GSLocalMemory::unSwizzleTexture4NP;
	m_psmtbl[PSM_PSMT8H].ustNP = &GSLocalMemory::unSwizzleTexture8HNP;
	m_psmtbl[PSM_PSMT4HL].ustNP = &GSLocalMemory::unSwizzleTexture4HLNP;
	m_psmtbl[PSM_PSMT4HH].ustNP = &GSLocalMemory::unSwizzleTexture4HHNP;

	m_psmtbl[PSM_PSMT8].pal = m_psmtbl[PSM_PSMT8H].pal = 256;
	m_psmtbl[PSM_PSMT4].pal = m_psmtbl[PSM_PSMT4HL].pal = m_psmtbl[PSM_PSMT4HH].pal = 16;

	m_psmtbl[PSM_PSMCT16].bpp = m_psmtbl[PSM_PSMCT16S].bpp = 16;
	m_psmtbl[PSM_PSMT8].bpp = 8;
	m_psmtbl[PSM_PSMT4].bpp = 4;
	m_psmtbl[PSM_PSMZ16].bpp = m_psmtbl[PSM_PSMZ16S].bpp = 16;

	m_psmtbl[PSM_PSMCT24].trbpp = 24;
	m_psmtbl[PSM_PSMCT16].trbpp = m_psmtbl[PSM_PSMCT16S].trbpp = 16;
	m_psmtbl[PSM_PSMT8].trbpp = m_psmtbl[PSM_PSMT8H].trbpp = 8;
	m_psmtbl[PSM_PSMT4].trbpp = m_psmtbl[PSM_PSMT4HL].trbpp = m_psmtbl[PSM_PSMT4HH].trbpp = 4;
	m_psmtbl[PSM_PSMZ24].trbpp = 24;
	m_psmtbl[PSM_PSMZ16].trbpp = m_psmtbl[PSM_PSMZ16S].trbpp = 16;

	m_psmtbl[PSM_PSMCT16].bs = m_psmtbl[PSM_PSMCT16S].bs = CSize(16, 8);
	m_psmtbl[PSM_PSMT8].bs = CSize(16, 16);
	m_psmtbl[PSM_PSMT4].bs = CSize(32, 32);
	m_psmtbl[PSM_PSMZ16].bs = m_psmtbl[PSM_PSMZ16S].bs = CSize(16, 8);

	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMCT16].rowOffset[i] = rowOffset16;
	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMCT16S].rowOffset[i] = rowOffset16S;
	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMT8].rowOffset[i] = rowOffset8[((i+2)>>2)&1];
	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMT4].rowOffset[i] = rowOffset4[((i+2)>>2)&1];
	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMZ32].rowOffset[i] = rowOffset32Z;
	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMZ24].rowOffset[i] = rowOffset32Z;
	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMZ16].rowOffset[i] = rowOffset16Z;
	for(int i = 0; i < 8; i++) m_psmtbl[PSM_PSMZ16S].rowOffset[i] = rowOffset16SZ;


#ifdef _OPENMP
	// need real cpus, with HT it is only going to be slower now
	omp_set_num_threads(omp_get_num_procs());
#endif

/*	
//	return; 
//	omp_set_num_threads(2);
//
	{
		int tmpsize = 1024*1024*2;
		BYTE* tmp1 = new BYTE[tmpsize];
		BYTE* tmp2 = new BYTE[tmpsize];

		for(int i = 0; i < 1024*1024; i++)
			((DWORD*)GetVM())[i] = rand()*0x12345678;

		GIFRegTEX0 TEX0;
		TEX0.TBP0 = 0;
		TEX0.TBW = 16;
		GIFRegTEXA TEXA;
		TEXA.AEM = 0; // 1

		__declspec(align(32)) static DWORD dst[1024*1024];

		int nthreads[] = {1,2,4,8};
		for(int j = 0; j < countof(nthreads); j++)
		{
			omp_set_num_threads(nthreads[j]);

			clock_t start = clock();

			for(int i = 0; i < 1000; i++)
			{
				unSwizzleTexture32(CRect(0,0,1024,1024), (BYTE*)dst, 1024*4, TEX0, TEXA);
				// for(int k = 0; k < tmpsize; k++) tmp2[k] = tmp1[k];
			}

			clock_t diff = clock() - start;

			CString str;
			str.Format(_T("%d threads: %d ms"), nthreads[j], diff);
			AfxMessageBox(str);
		}
			
		omp_set_num_threads(2);

		delete [] tmp1;
		delete [] tmp2;
	}
*/
}

GSLocalMemory::~GSLocalMemory()
{
	_aligned_free(m_vm8);
	_aligned_free(m_pCLUT);
	_aligned_free(m_pCLUT32);
	_aligned_free(m_pCLUT64);	
}

////////////////////

void GSLocalMemory::RoundDown(CSize& s, CSize bs)
{
	s.cx &= ~(bs.cx-1);
	s.cy &= ~(bs.cy-1);
}

void GSLocalMemory::RoundUp(CSize& s, CSize bs)
{
	s.cx = (s.cx + (bs.cx-1)) & ~(bs.cx-1);
	s.cy = (s.cy + (bs.cy-1)) & ~(bs.cy-1);
}

////////////////////

DWORD __fastcall GSLocalMemory::pageAddress32(int x, int y, DWORD bp, DWORD bw)
{
	return ((bp >> 5) + (y >> 5) * bw + (x >> 6)) << 11; 
}

DWORD __fastcall GSLocalMemory::pageAddress16(int x, int y, DWORD bp, DWORD bw)
{
	return ((bp >> 5) + (y >> 6) * bw + (x >> 6)) << 12;
}

DWORD __fastcall GSLocalMemory::pageAddress8(int x, int y, DWORD bp, DWORD bw)
{
	return ((bp >> 5) + (y >> 6) * ((bw+1)>>1) + (x >> 7)) << 13; 
}

DWORD __fastcall GSLocalMemory::pageAddress4(int x, int y, DWORD bp, DWORD bw)
{
	return ((bp >> 5) + (y >> 7) * ((bw+1)>>1) + (x >> 7)) << 14;
}

////////////////////

DWORD __fastcall GSLocalMemory::blockAddress32(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f);
	DWORD block = blockTable32[(y >> 3) & 3][(x >> 3) & 7];
	return (page + block) << 6;
}

DWORD __fastcall GSLocalMemory::blockAddress16(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16[(y >> 3) & 7][(x >> 4) & 3];
	return (page + block) << 7;
}

DWORD __fastcall GSLocalMemory::blockAddress16S(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
	return (page + block) << 7;
}

DWORD __fastcall GSLocalMemory::blockAddress8(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + ((y >> 1) & ~0x1f) * ((bw+1)>>1) + ((x >> 2) & ~0x1f); 
	DWORD block = blockTable8[(y >> 4) & 3][(x >> 4) & 7];
	return (page + block) << 8;
}

DWORD __fastcall GSLocalMemory::blockAddress4(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + ((y >> 2) & ~0x1f) * ((bw+1)>>1) + ((x >> 2) & ~0x1f); 
	DWORD block = blockTable4[(y >> 4) & 7][(x >> 5) & 3];
	return (page + block) << 9;
}

DWORD __fastcall GSLocalMemory::blockAddress32Z(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
	return (page + block) << 6;
}

DWORD __fastcall GSLocalMemory::blockAddress16Z(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
	return (page + block) << 7;
}

DWORD __fastcall GSLocalMemory::blockAddress16SZ(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
	return (page + block) << 7;
}

////////////////////

DWORD __fastcall GSLocalMemory::pixelAddressOrg32(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 5) * bw + (x >> 6); 
//	DWORD block = (bp & 0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7];
//	DWORD word = (((page << 5) + block) << 6) + columnTable32[y & 7][x & 7];
	DWORD page = bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f);
	DWORD block = blockTable32[(y >> 3) & 3][(x >> 3) & 7];
	DWORD word = ((page + block) << 6) + columnTable32[y & 7][x & 7];
	ASSERT(word < 1024*1024);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddressOrg16(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
//	DWORD block = (bp & 0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3];
//	DWORD word = (((page << 5) + block) << 7) + columnTable16[y & 7][x & 15];
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16[(y >> 3) & 7][(x >> 4) & 3];
	DWORD word = ((page + block) << 7) + columnTable16[y & 7][x & 15];
	ASSERT(word < 1024*1024*2);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddressOrg16S(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
//	DWORD block = (bp & 0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
//	DWORD word = (((page << 5) + block) << 7) + columnTable16[y & 7][x & 15];
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
	DWORD word = ((page + block) << 7) + columnTable16[y & 7][x & 15];
	ASSERT(word < 1024*1024*2);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddressOrg8(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 6) * ((bw+1)>>1) + (x >> 7); 
//	DWORD block = (bp & 0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
//	DWORD word = (((page << 5) + block) << 8) + columnTable8[y & 15][x & 15];
	DWORD page = bp + ((y >> 1) & ~0x1f) * ((bw+1)>>1) + ((x >> 2) & ~0x1f); 
	DWORD block = blockTable8[(y >> 4) & 3][(x >> 4) & 7];
	DWORD word = ((page + block) << 8) + columnTable8[y & 15][x & 15];
//	ASSERT(word < 1024*1024*4);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddressOrg4(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 7) * ((bw+1)>>1) + (x >> 7);
//	DWORD block = (bp & 0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
//	DWORD word = (((page << 5) + block) << 9) + columnTable4[y & 15][x & 31];
	DWORD page = bp + ((y >> 2) & ~0x1f) * ((bw+1)>>1) + ((x >> 2) & ~0x1f); 
	DWORD block = blockTable4[(y >> 4) & 7][(x >> 5) & 3];
	DWORD word = ((page + block) << 9) + columnTable4[y & 15][x & 31];
	ASSERT(word < 1024*1024*8);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddressOrg32Z(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 5) * bw + (x >> 6); 
//	DWORD block = (bp & 0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
//	DWORD word = (((page << 5) + block) << 6) + ((y & 7) << 3) + (x & 7);
	DWORD page = bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
	DWORD word = ((page + block) << 6) + ((y & 7) << 3) + (x & 7);
	ASSERT(word < 1024*1024);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddressOrg16Z(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
//	DWORD block = (bp & 0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
//	DWORD word = (((page << 5) + block) << 7) + ((y & 7) << 4) + (x & 15);
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
	DWORD word = ((page + block) << 7) + ((y & 7) << 4) + (x & 15);
	ASSERT(word < 1024*1024*2);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddressOrg16SZ(int x, int y, DWORD bp, DWORD bw)
{
//	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
//	DWORD block = (bp & 0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
//	DWORD word = (((page << 5) + block) << 7) + ((y & 7) << 4) + (x & 15);
	DWORD page = bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f); 
	DWORD block = blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
	DWORD word = ((page + block) << 7) + ((y & 7) << 4) + (x & 15);
	ASSERT(word < 1024*1024*2);
	return word;
}

////////////////////

DWORD __fastcall GSLocalMemory::pixelAddress32(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 5) * bw + (x >> 6); 
	DWORD word = (page << 11) + pageOffset32[bp & 0x1f][y & 0x1f][x & 0x3f];
	ASSERT(word < 1024*1024);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddress16(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
	DWORD word = (page << 12) + pageOffset16[bp & 0x1f][y & 0x3f][x & 0x3f];
	ASSERT(word < 1024*1024*2);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddress16S(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
	DWORD word = (page << 12) + pageOffset16S[bp & 0x1f][y & 0x3f][x & 0x3f];
	ASSERT(word < 1024*1024*2);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddress8(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 6) * ((bw+1)>>1) + (x >> 7); 
	DWORD word = (page << 13) + pageOffset8[bp & 0x1f][y & 0x3f][x & 0x7f];
	ASSERT(word < 1024*1024*4);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddress4(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 7) * ((bw+1)>>1) + (x >> 7);
	DWORD word = (page << 14) + pageOffset4[bp & 0x1f][y & 0x7f][x & 0x7f];
	ASSERT(word < 1024*1024*8);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddress32Z(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 5) * bw + (x >> 6); 
	DWORD word = (page << 11) + pageOffset32Z[bp & 0x1f][y & 0x1f][x & 0x3f];
	ASSERT(word < 1024*1024);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddress16Z(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
	DWORD word = (page << 12) + pageOffset16Z[bp & 0x1f][y & 0x3f][x & 0x3f];
	ASSERT(word < 1024*1024*2);
	return word;
}

DWORD __fastcall GSLocalMemory::pixelAddress16SZ(int x, int y, DWORD bp, DWORD bw)
{
	DWORD page = (bp >> 5) + (y >> 6) * bw + (x >> 6); 
	DWORD word = (page << 12) + pageOffset16SZ[bp & 0x1f][y & 0x3f][x & 0x3f];
	ASSERT(word < 1024*1024*2);
	return word;
}

////////////////////

void GSLocalMemory::writePixel32(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress32(x, y, bp, bw);
	m_vm32[addr] = c;
}

void GSLocalMemory::writePixel24(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress32(x, y, bp, bw);
	m_vm32[addr] = (m_vm32[addr] & 0xff000000) | (c & 0x00ffffff);
}

void GSLocalMemory::writePixel16(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress16(x, y, bp, bw);
	m_vm16[addr] = (WORD)c;
}

void GSLocalMemory::writePixel16S(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress16S(x, y, bp, bw);
	m_vm16[addr] = (WORD)c;
}

void GSLocalMemory::writePixel8(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress8(x, y, bp, bw);
	m_vm8[addr] = (BYTE)c;
}

void GSLocalMemory::writePixel8H(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress32(x, y, bp, bw);
	m_vm32[addr] = (m_vm32[addr] & 0x00ffffff) | (c << 24);
}

void GSLocalMemory::writePixel4(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress4(x, y, bp, bw);
	int shift = (addr&1) << 2; addr >>= 1;
	m_vm8[addr] = (BYTE)((m_vm8[addr] & (0xf0 >> shift)) | ((c & 0x0f) << shift));
}

void GSLocalMemory::writePixel4HL(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress32(x, y, bp, bw);
	m_vm32[addr] = (m_vm32[addr] & 0xf0ffffff) | ((c & 0x0f) << 24);
}

void GSLocalMemory::writePixel4HH(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress32(x, y, bp, bw);
	m_vm32[addr] = (m_vm32[addr] & 0x0fffffff) | ((c & 0x0f) << 28);
}

void GSLocalMemory::writePixel32Z(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress32Z(x, y, bp, bw);
	m_vm32[addr] = c;
}

void GSLocalMemory::writePixel24Z(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress32Z(x, y, bp, bw);
	m_vm32[addr] = (m_vm32[addr] & 0xff000000) | (c & 0x00ffffff);
}

void GSLocalMemory::writePixel16Z(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress16Z(x, y, bp, bw);
	m_vm16[addr] = (WORD)c;
}

void GSLocalMemory::writePixel16SZ(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress16SZ(x, y, bp, bw);
	m_vm16[addr] = (WORD)c;
}

////////////////////

void GSLocalMemory::writeFrame16(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	c = ((c>>16)&0x8000)|((c>>9)&0x7c00)|((c>>6)&0x03e0)|((c>>3)&0x001f);
	writePixel16(x, y, c, bp, bw);
}

void GSLocalMemory::writeFrame16S(int x, int y, DWORD c, DWORD bp, DWORD bw)
{
	c = ((c>>16)&0x8000)|((c>>9)&0x7c00)|((c>>6)&0x03e0)|((c>>3)&0x001f);
	writePixel16S(x, y, c, bp, bw);
}

////////////////////

DWORD GSLocalMemory::readPixel32(int x, int y, DWORD bp, DWORD bw)
{
	return m_vm32[pixelAddress32(x, y, bp, bw)];
}

DWORD GSLocalMemory::readPixel24(int x, int y, DWORD bp, DWORD bw)
{
	return m_vm32[pixelAddress32(x, y, bp, bw)] & 0x00ffffff;
}

DWORD GSLocalMemory::readPixel16(int x, int y, DWORD bp, DWORD bw)
{
	return (DWORD)m_vm16[pixelAddress16(x, y, bp, bw)];
}

DWORD GSLocalMemory::readPixel16S(int x, int y, DWORD bp, DWORD bw)
{
	return (DWORD)m_vm16[pixelAddress16S(x, y, bp, bw)];
}

DWORD GSLocalMemory::readPixel8(int x, int y, DWORD bp, DWORD bw)
{
	return (DWORD)m_vm8[pixelAddress8(x, y, bp, bw)];
}

DWORD GSLocalMemory::readPixel8H(int x, int y, DWORD bp, DWORD bw)
{
	return m_vm32[pixelAddress32(x, y, bp, bw)] >> 24;
}

DWORD GSLocalMemory::readPixel4(int x, int y, DWORD bp, DWORD bw)
{
	DWORD addr = pixelAddress4(x, y, bp, bw);
	return (m_vm8[addr>>1] >> ((addr&1) << 2)) & 0x0f;
}

DWORD GSLocalMemory::readPixel4HL(int x, int y, DWORD bp, DWORD bw)
{
	return (m_vm32[pixelAddress32(x, y, bp, bw)] >> 24) & 0x0f;
}

DWORD GSLocalMemory::readPixel4HH(int x, int y, DWORD bp, DWORD bw)
{
	return (m_vm32[pixelAddress32(x, y, bp, bw)] >> 28) & 0x0f;
}

DWORD GSLocalMemory::readPixel32Z(int x, int y, DWORD bp, DWORD bw)
{
	return m_vm32[pixelAddress32Z(x, y, bp, bw)];
}

DWORD GSLocalMemory::readPixel24Z(int x, int y, DWORD bp, DWORD bw)
{
	return m_vm32[pixelAddress32Z(x, y, bp, bw)] & 0x00ffffff;
}

DWORD GSLocalMemory::readPixel16Z(int x, int y, DWORD bp, DWORD bw)
{
	return (DWORD)m_vm16[pixelAddress16Z(x, y, bp, bw)];
}

DWORD GSLocalMemory::readPixel16SZ(int x, int y, DWORD bp, DWORD bw)
{
	return (DWORD)m_vm16[pixelAddress16SZ(x, y, bp, bw)];
}

////////////////////
/*
void GSLocalMemory::writePixel32(DWORD addr, DWORD c)
{
	m_vm32[addr] = c;
}

void GSLocalMemory::writePixel24(DWORD addr, DWORD c)
{
	m_vm32[addr] = (m_vm32[addr] & 0xff000000) | (c & 0x00ffffff);
}

void GSLocalMemory::writePixel16(DWORD addr, DWORD c)
{
	m_vm16[addr] = (WORD)c;
}

void GSLocalMemory::writePixel16S(DWORD addr, DWORD c)
{
	m_vm16[addr] = (WORD)c;
}

void GSLocalMemory::writePixel8(DWORD addr, DWORD c)
{
	m_vm8[addr] = (BYTE)c;
}

void GSLocalMemory::writePixel8H(DWORD addr, DWORD c)
{
	m_vm32[addr] = (m_vm32[addr] & 0x00ffffff) | (c << 24);
}

void GSLocalMemory::writePixel4(DWORD addr, DWORD c)
{
	int shift = (addr&1) << 2; addr >>= 1;
	m_vm8[addr] = (BYTE)((m_vm8[addr] & (0xf0 >> shift)) | ((c & 0x0f) << shift));
}

void GSLocalMemory::writePixel4HL(DWORD addr, DWORD c)
{
	m_vm32[addr] = (m_vm32[addr] & 0xf0ffffff) | ((c & 0x0f) << 24);
}

void GSLocalMemory::writePixel4HH(DWORD addr, DWORD c)
{
	m_vm32[addr] = (m_vm32[addr] & 0x0fffffff) | ((c & 0x0f) << 28);
}

void GSLocalMemory::writePixel32Z(DWORD addr, DWORD c)
{
	m_vm32[addr] = c;
}

void GSLocalMemory::writePixel24Z(DWORD addr, DWORD c)
{
	m_vm32[addr] = (m_vm32[addr] & 0xff000000) | (c & 0x00ffffff);
}

void GSLocalMemory::writePixel16Z(DWORD addr, DWORD c)
{
	m_vm16[addr] = (WORD)c;
}

void GSLocalMemory::writePixel16SZ(DWORD addr, DWORD c)
{
	m_vm16[addr] = (WORD)c;
}

////////////////////

void GSLocalMemory::writeFrame16(DWORD addr, DWORD c)
{
	writePixel16(addr, ((c>>16)&0x8000)|((c>>9)&0x7c00)|((c>>6)&0x03e0)|((c>>3)&0x001f));
}

void GSLocalMemory::writeFrame16S(DWORD addr, DWORD c)
{
	writePixel16S(addr, ((c>>16)&0x8000)|((c>>9)&0x7c00)|((c>>6)&0x03e0)|((c>>3)&0x001f));
}

////////////////////

DWORD GSLocalMemory::readPixel32(DWORD addr)
{
	return m_vm32[addr];
}

DWORD GSLocalMemory::readPixel24(DWORD addr)
{
	return m_vm32[addr] & 0x00ffffff;
}

DWORD GSLocalMemory::readPixel16(DWORD addr)
{
	return (DWORD)m_vm16[addr];
}

DWORD GSLocalMemory::readPixel16S(DWORD addr)
{
	return (DWORD)m_vm16[addr];
}

DWORD GSLocalMemory::readPixel8(DWORD addr)
{
	return (DWORD)m_vm8[addr];
}

DWORD GSLocalMemory::readPixel8H(DWORD addr)
{
	return m_vm32[addr] >> 24;
}

DWORD GSLocalMemory::readPixel4(DWORD addr)
{
	return (m_vm8[addr>>1] >> ((addr&1) << 2)) & 0x0f;
}

DWORD GSLocalMemory::readPixel4HL(DWORD addr)
{
	return (m_vm32[addr] >> 24) & 0x0f;
}

DWORD GSLocalMemory::readPixel4HH(DWORD addr)
{
	return (m_vm32[addr] >> 28) & 0x0f;
}

DWORD GSLocalMemory::readPixel32Z(DWORD addr)
{
	return m_vm32[addr];
}

DWORD GSLocalMemory::readPixel24Z(DWORD addr)
{
	return m_vm32[addr] & 0x00ffffff;
}

DWORD GSLocalMemory::readPixel16Z(DWORD addr)
{
	return (DWORD)m_vm16[addr];
}

DWORD GSLocalMemory::readPixel16SZ(DWORD addr)
{
	return (DWORD)m_vm16[addr];
}
*/
////////////////////

bool GSLocalMemory::FillRect(const CRect& r, DWORD c, DWORD psm, DWORD fbp, DWORD fbw)
{
	const psmtbl_t& tbl = m_psmtbl[psm];

	writePixel wp = tbl.wp;
	pixelAddress ba = tbl.ba;

	int w = tbl.bs.cx;
	int h = tbl.bs.cy;
	int bpp = tbl.bpp;

	int shift = 0;

	switch(bpp)
	{
	case 32: shift = 0; break;
	case 16: shift = 1; c = (c&0xffff)*0x00010001; break;
	case 8: shift = 2; c = (c&0xff)*0x01010101; break;
	case 4: shift = 3; c = (c&0xf)*0x11111111; break;
	}

	CRect clip((r.left+(w-1))&~(w-1), (r.top+(h-1))&~(h-1), r.right&~(w-1), r.bottom&~(h-1));

	for(int y = r.top; y < clip.top; y++)
		for(int x = r.left; x < r.right; x++)
			(this->*wp)(x, y, c, fbp, fbw);

	for(int y = clip.top; y < clip.bottom; y += h)
	{
		for(int ys = y, ye = y + h; ys < ye; ys++)
		{
			for(int x = r.left; x < clip.left; x++)
				(this->*wp)(x, ys, c, fbp, fbw);
			for(int x = clip.right; x < r.right; x++)
				(this->*wp)(x, ys, c, fbp, fbw);
		}
	}

	if(psm == PSM_PSMCT24 || psm == PSM_PSMZ24)
	{
		c &= 0x00ffffff;
		for(int y = clip.top; y < clip.bottom; y += h)
		{
			for(int x = clip.left; x < clip.right; x += w)
			{
				DWORD* p = &m_vm32[ba(x, y, fbp, fbw)];
				for(int i = 0; i < 64; i++)
					p[i] = (p[i]&0xff000000) | c;
			}
		}
	}
	else
	{
		for(int y = clip.top; y < clip.bottom; y += h)
			for(int x = clip.left; x < clip.right; x += w)
				memsetd(&m_vm8[ba(x, y, fbp, fbw) << 2 >> shift], c, 64);
	}

	for(int y = clip.bottom; y < r.bottom; y++)
		for(int x = r.left; x < r.right; x++)
			(this->*wp)(x, y, c, fbp, fbw);

	return(true);
}

////////////////////

void GSLocalMemory::WriteCLUT(GIFRegTEX0 TEX0, GIFRegTEXCLUT TEXCLUT)
{
	switch(TEX0.CLD)
	{
	default:
	case 0: return;
	case 1: break;
	case 2: m_CBP[0] = TEX0.CBP; break;
	case 3: m_CBP[1] = TEX0.CBP; break;
	case 4: if(m_CBP[0] == TEX0.CBP) return; break;
	case 5: if(m_CBP[1] == TEX0.CBP) return; break;
	}

	{
		if(!m_fCLUTMayBeDirty && m_prevTEX0.i64 == TEX0.i64 && m_prevTEXCLUT.i64 == TEXCLUT.i64)
			return;

		m_prevTEX0 = TEX0;
		m_prevTEXCLUT = TEXCLUT;

		m_fCLUTMayBeDirty = false;
	}

	DWORD bp = TEX0.CBP;
	DWORD bw = TEX0.CSM == 0 ? 1 : TEXCLUT.CBW;

	WORD* pCLUT = m_pCLUT + (TEX0.CSA<<4);

	// NOTE: TEX0.CPSM == PSM_PSMCT24 is non-standard, KH uses it

	if(TEX0.CSM == 0)
	{
		if(TEX0.CPSM == PSM_PSMCT16 || TEX0.CPSM == PSM_PSMCT16S)
		{
			WORD* vm = &m_vm16[TEX0.CPSM == PSM_PSMCT16 ? blockAddress16(0, 0, bp, bw) : blockAddress16S(0, 0, bp, bw)];

			if(TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT8H)
			{
				WriteCLUT_T16_I8_CSM1(vm, pCLUT);
			}
			else if(TEX0.PSM == PSM_PSMT4HH || TEX0.PSM == PSM_PSMT4HL || TEX0.PSM == PSM_PSMT4)
			{
				WriteCLUT_T16_I4_CSM1(vm, pCLUT);
			}
		}
		else if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24)
		{
			DWORD* vm = &m_vm32[blockAddress32(0, 0, bp, bw)];

			if(TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT8H)
			{
				WriteCLUT_T32_I8_CSM1(vm, pCLUT);
			}
			else if(TEX0.PSM == PSM_PSMT4HH || TEX0.PSM == PSM_PSMT4HL || TEX0.PSM == PSM_PSMT4)
			{
				WriteCLUT_T32_I4_CSM1(vm, pCLUT);
			}
		}
	}
	else
	{
		readPixel rp = m_psmtbl[TEX0.CPSM].rp;

		int nPaletteEntries = m_psmtbl[TEX0.PSM].pal;

		ASSERT(nPaletteEntries == 0 || TEX0.CPSM == PSM_PSMCT16); // this is the only allowed format for CSM2, but we implement all of them, just in case...

		if(TEX0.CPSM == PSM_PSMCT16 || TEX0.CPSM == PSM_PSMCT16S)
		{
			for(int i = 0; i < nPaletteEntries; i++)
			{
				pCLUT[i] = (WORD)(this->*rp)((TEXCLUT.COU<<4) + i, TEXCLUT.COV, bp, bw);
			}
		}
		else if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24)
		{
			for(int i = 0; i < nPaletteEntries; i++)
			{
				DWORD dw = (this->*rp)((TEXCLUT.COU<<4) + i, TEXCLUT.COV, bp, bw);
				pCLUT[i] = (WORD)(dw & 0xffff);
				pCLUT[i+256] = (WORD)(dw >> 16);
			}
		}
	}
}

//

void GSLocalMemory::ReadCLUT(GIFRegTEX0 TEX0, GIFRegTEXA TEXA, DWORD* pCLUT32)
{
	ASSERT(pCLUT32);

	WORD* pCLUT = m_pCLUT + (TEX0.CSA<<4);

	if(TEX0.CPSM == PSM_PSMCT32)
	{
		switch(TEX0.PSM)
		{
		case PSM_PSMT8:
		case PSM_PSMT8H:
			ReadCLUT32_T32_I8(pCLUT, pCLUT32);
			break;
		case PSM_PSMT4:
		case PSM_PSMT4HL:
		case PSM_PSMT4HH:
			ReadCLUT32_T32_I4(pCLUT, pCLUT32);
			break;
		}
	}
	else if(TEX0.CPSM == PSM_PSMCT16 || TEX0.CPSM == PSM_PSMCT16S)
	{
		switch(TEX0.PSM)
		{
		case PSM_PSMT8:
		case PSM_PSMT8H:
			ReadCLUT32_T16_I8(pCLUT, pCLUT32);
			break;
		case PSM_PSMT4:
		case PSM_PSMT4HL:
		case PSM_PSMT4HH:
			ReadCLUT32_T16_I4(pCLUT, pCLUT32);
			break;
		}
	}
}

void GSLocalMemory::SetupCLUT(GIFRegTEX0 TEX0, GIFRegTEXA TEXA)
{
	// TODO: cache m_pCLUT*

	ReadCLUT(TEX0, TEXA, m_pCLUT32);

	switch(TEX0.PSM)
	{
	case PSM_PSMT4:
	case PSM_PSMT4HL:
	case PSM_PSMT4HH:
		// sse2?
		if(TEX0.CPSM == PSM_PSMCT32)
		{
			for(int j = 0, k = 0; j < 16; j++)
				for(int i = 0; i < 16; i++, k++)
					m_pCLUT64[k] = ((UINT64)m_pCLUT32[j] << 32) | m_pCLUT32[i];
		}
		else
		{
			for(int j = 0, k = 0; j < 16; j++)
				for(int i = 0; i < 16; i++, k++)
					m_pCLUT64[k] = ((UINT64)m_pCLUT32[j] << 16) | (m_pCLUT32[i]&0xffff);
		}
		break;
	}
}

//

void GSLocalMemory::ReadCLUT32(GIFRegTEX0 TEX0, GIFRegTEXA TEXA, DWORD* pCLUT32)
{
	ASSERT(pCLUT32);

	WORD* pCLUT = m_pCLUT + (TEX0.CSA<<4);

	if(TEX0.CPSM == PSM_PSMCT32)
	{
		switch(TEX0.PSM)
		{
		case PSM_PSMT8:
		case PSM_PSMT8H:
			ReadCLUT32_T32_I8(pCLUT, pCLUT32);
			break;
		case PSM_PSMT4:
		case PSM_PSMT4HL:
		case PSM_PSMT4HH:
			ReadCLUT32_T32_I4(pCLUT, pCLUT32);
			break;
		}
	}
	else if(TEX0.CPSM == PSM_PSMCT16 || TEX0.CPSM == PSM_PSMCT16S)
	{
		Expand16(pCLUT, pCLUT32, m_psmtbl[TEX0.PSM].pal, &TEXA);
	}
}

void GSLocalMemory::SetupCLUT32(GIFRegTEX0 TEX0, GIFRegTEXA TEXA)
{
	// TODO: cache m_pCLUT*

	ReadCLUT32(TEX0, TEXA, m_pCLUT32);

	switch(TEX0.PSM)
	{
	case PSM_PSMT4:
	case PSM_PSMT4HL:
	case PSM_PSMT4HH:
		// sse2?
		for(int j = 0, k = 0; j < 16; j++)
			for(int i = 0; i < 16; i++, k++)
				m_pCLUT64[k] = ((UINT64)m_pCLUT32[j] << 32) | m_pCLUT32[i];
		break;
	}
}

void GSLocalMemory::CopyCLUT32(DWORD* pCLUT32, int nPaletteEntries)
{
	memcpy(pCLUT32, m_pCLUT32, sizeof(DWORD)*nPaletteEntries);
}

////////////////////

DWORD GSLocalMemory::readTexel32(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_vm32[pixelAddress32(x, y, TEX0.TBP0, TEX0.TBW)];
}

DWORD GSLocalMemory::readTexel24(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return Expand24To32(m_vm32[pixelAddress32(x, y, TEX0.TBP0, TEX0.TBW)], TEX0.ai32[1]&4, TEXA);
}

DWORD GSLocalMemory::readTexel16(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return Expand16To32(m_vm16[pixelAddress16(x, y, TEX0.TBP0, TEX0.TBW)], TEXA);
}

DWORD GSLocalMemory::readTexel16S(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return Expand16To32(m_vm16[pixelAddress16S(x, y, TEX0.TBP0, TEX0.TBW)], TEXA);
}

DWORD GSLocalMemory::readTexel8(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel8(x, y, TEX0.TBP0, TEX0.TBW)];
}

DWORD GSLocalMemory::readTexel8H(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel8H(x, y, TEX0.TBP0, TEX0.TBW)];
}

DWORD GSLocalMemory::readTexel4(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel4(x, y, TEX0.TBP0, TEX0.TBW)];
}

DWORD GSLocalMemory::readTexel4HL(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel4HL(x, y, TEX0.TBP0, TEX0.TBW)];
}

DWORD GSLocalMemory::readTexel4HH(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel4HH(x, y, TEX0.TBP0, TEX0.TBW)];
}

////////////////////
/*
DWORD GSLocalMemory::readTexel32(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_vm32[addr];
}

DWORD GSLocalMemory::readTexel24(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return Expand24To32(m_vm32[addr], TEX0.ai32[1]&4, TEXA);
}

DWORD GSLocalMemory::readTexel16(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return Expand16To32(m_vm16[addr], TEXA);
}

DWORD GSLocalMemory::readTexel16S(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return Expand16To32(m_vm16[addr], TEXA);
}

DWORD GSLocalMemory::readTexel8(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel8(addr)];
}

DWORD GSLocalMemory::readTexel8H(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel8H(addr)];
}

DWORD GSLocalMemory::readTexel4(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel4(addr)];
}

DWORD GSLocalMemory::readTexel4HL(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel4HL(addr)];
}

DWORD GSLocalMemory::readTexel4HH(DWORD addr, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return m_pCLUT32[readPixel4HH(addr)];
}
*/
////////////////////

static void SwizzleTextureStep(int& tx, int& ty, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
//	if(ty == TRXREG.RRH && tx == TRXPOS.DSAX) ASSERT(0);

	if(++tx == TRXREG.RRW)
	{
		tx = TRXPOS.DSAX;
		ty++;
	}
}

#define IsTopLeftAligned(dsax, tx, ty, bw, bh) \
	(((dsax) & ((bw)-1)) == 0 && ((tx) & ((bw)-1)) == 0 && (dsax) == (tx) && ((ty) & ((bh)-1)) == 0)

void GSLocalMemory::SwizzleTexture32(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX)*4;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8);

	if(!fTopLeftAligned || (tw & 7) || (th & 7) || (len % srcpitch))
	{
		if(fTopLeftAligned && tw >= 8 && th >= 8)
		{
			int twa = tw & ~7;
			int tha = th & ~7;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 8)
			{
				for(int x = tx; x < twa; x += 8)
					SwizzleBlock32u((BYTE*)&m_vm32[blockAddress32(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*4, srcpitch);

				for(int i = 0; i < 8; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel32(x, ty, ((DWORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		if(len > 0 && tw >= 8 && th >= 2 && IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 2))
		{
			int twa = tw & ~7;
			int tha = th & ~1;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 2)
			{
				for(int x = tx; x < twa; x += 8)
					SwizzleColumn32(ty, (BYTE*)&m_vm32[blockAddress32(x, ty&~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*4, srcpitch);

				for(int i = 0; i < 2; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel32(x, ty, ((DWORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		th += ty;

		if((DWORD_PTR)src & 0xf)
		{
			for(int y = ty; y < th; y += 8, src += srcpitch*8)
				for(int x = tx; x < tw; x += 8)
					SwizzleBlock32u((BYTE*)&m_vm32[blockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*4, srcpitch);
		}
		else
		{
			for(int y = ty; y < th; y += 8, src += srcpitch*8)
				for(int x = tx; x < tw; x += 8)
					SwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*4, srcpitch);
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture24(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX)*3;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8);

	if(!fTopLeftAligned || (tw & 7) || (th & 7) || (len % srcpitch))
	{
		// TODO

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		__declspec(align(16)) DWORD block[8*8];

		th += ty;

		for(int y = ty; y < th; y += 8, src += srcpitch*8)
		{
			for(int x = tx; x < tw; x += 8)
			{
				BYTE* s = src + (x - tx)*3;
				DWORD* d = block;

				for(int j = 0, diff = srcpitch - 8*3; j < 8; j++, s += diff, d += 8)
					for(int i = 0; i < 8; i++, s += 3)
						d[i] = (s[2]<<16)|(s[1]<<8)|s[0];

				SwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], (BYTE*)block, sizeof(block)/8, 0x00ffffff);
			}
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture16(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX)*2;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 16, 8);

	if(!fTopLeftAligned || (tw & 15) || (th & 7) || (len % srcpitch))
	{
		if(fTopLeftAligned && tw >= 16 && th >= 8)
		{
			int twa = tw & ~15;
			int tha = th & ~7;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 8)
			{
				for(int x = tx; x < twa; x += 16)
					SwizzleBlock16u((BYTE*)&m_vm16[blockAddress16(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);

				for(int i = 0; i < 8; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel16(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		if(len > 0 && tw >= 16 && th >= 2 && IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 16, 2))
		{
			int twa = tw & ~15;
			int tha = th & ~1;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 2)
			{
				for(int x = tx; x < twa; x += 16)
					SwizzleColumn16(ty, (BYTE*)&m_vm16[blockAddress16(x, ty&~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);

				for(int i = 0; i < 2; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel16(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		th += ty;

		if((DWORD_PTR)src & 0xf)
		{
			for(int y = ty; y < th; y += 8, src += srcpitch*8)
				for(int x = tx; x < tw; x += 16)
					SwizzleBlock16u((BYTE*)&m_vm16[blockAddress16(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);
		}
		else
		{
			for(int y = ty; y < th; y += 8, src += srcpitch*8)
				for(int x = tx; x < tw; x += 16)
					SwizzleBlock16((BYTE*)&m_vm16[blockAddress16(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture16S(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX)*2;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 16, 8);

	if(!fTopLeftAligned || (tw & 15) || (th & 7) || (len % srcpitch))
	{
		if(fTopLeftAligned && tw >= 16 && th >= 8)
		{
			int twa = tw & ~15;
			int tha = th & ~7;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 8)
			{
				for(int x = tx; x < twa; x += 16)
					SwizzleBlock16u((BYTE*)&m_vm16[blockAddress16S(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);

				for(int i = 0; i < 8; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel16S(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		if(len > 0 && tw >= 16 && th >= 2 && IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 16, 2))
		{
			int twa = tw & ~15;
			int tha = th & ~1;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 2)
			{
				for(int x = tx; x < twa; x += 16)
					SwizzleColumn16(ty, (BYTE*)&m_vm16[blockAddress16S(x, ty&~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);

				for(int i = 0; i < 2; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel16S(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		th += ty;

		if((DWORD_PTR)src & 0xf)
		{
			for(int y = ty; y < th; y += 8, src += srcpitch*8)
				for(int x = tx; x < tw; x += 16)
					SwizzleBlock16((BYTE*)&m_vm16[blockAddress16S(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);
		}
		else
		{
			for(int y = ty; y < th; y += 8, src += srcpitch*8)
				for(int x = tx; x < tw; x += 16)
					SwizzleBlock16((BYTE*)&m_vm16[blockAddress16S(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx)*2, srcpitch);
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture8(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = TRXREG.RRW - TRXPOS.DSAX;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 16, 16);

	if(!fTopLeftAligned || (tw & 15) || (th & 15) || (len % srcpitch))
	{
		if(fTopLeftAligned && tw >= 16 && th >= 16)
		{
			int twa = tw & ~15;
			int tha = th & ~15;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 16)
			{
				for(int x = tx; x < twa; x += 16)
					SwizzleBlock8u((BYTE*)&m_vm8[blockAddress8(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch);

				for(int i = 0; i < 16; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel8(x, ty, src[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		if(len > 0 && tw >= 16 && th >= 4 && IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 16, 4))
		{
			int twa = tw & ~15;
			int tha = th & ~3;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 4)
			{
				for(int x = tx; x < twa; x += 16)
					SwizzleColumn8(ty, (BYTE*)&m_vm8[blockAddress8(x, ty&~15, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch);

				for(int i = 0; i < 4; i++, ty++, src += srcpitch)
					for(int x = twa; x < tw; x++)
						writePixel8(x, ty, src[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW);
			}
		}

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		th += ty;

		if((DWORD_PTR)src & 0xf)
		{
			for(int y = ty; y < th; y += 16, src += srcpitch*16)
				for(int x = tx; x < tw; x += 16)
					SwizzleBlock8u((BYTE*)&m_vm8[blockAddress8(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch);
		}
		else
		{
			for(int y = ty; y < th; y += 16, src += srcpitch*16)
				for(int x = tx; x < tw; x += 16)
					SwizzleBlock8((BYTE*)&m_vm8[blockAddress8(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch);
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture8H(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = TRXREG.RRW - TRXPOS.DSAX;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8);

	if(!fTopLeftAligned || (tw & 7) || (th & 7) || (len % srcpitch))
	{
		// TODO

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		__declspec(align(16)) DWORD block[8*8];

		th += ty;

		for(int y = ty; y < th; y += 8, src += srcpitch*8)
		{
			for(int x = tx; x < tw; x += 8)
			{
				BYTE* s = src + (x - tx);
				DWORD* d = block;

				for(int j = 0; j < 8; j++, s += srcpitch, d += 8)
					for(int i = 0; i < 8; i++)
						d[i] = s[i] << 24;

				SwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], (BYTE*)block, sizeof(block)/8, 0xff000000);
			}
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture4(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX)/2;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 32, 16);

	if(!fTopLeftAligned || (tw & 31) || (th & 15) || (len % srcpitch))
	{
		if(fTopLeftAligned && tw >= 32 && th >= 16)
		{
			int twa = tw & ~31;
			int tha = th & ~15;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 16)
			{
				for(int x = tx; x < twa; x += 32)
					SwizzleBlock4u((BYTE*)&m_vm8[blockAddress4(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)>>1], src + (x - tx)/2, srcpitch);

				for(int i = 0; i < 16; i++, ty++, src += srcpitch)
				{
					BYTE* s = src + (twa - tx)/2;

					for(int x = twa; x < tw; x += 2, s++)
					{
						writePixel4(x, ty, *s&0xf, BITBLTBUF.DBP, BITBLTBUF.DBW),
						writePixel4(x+1, ty, *s>>4, BITBLTBUF.DBP, BITBLTBUF.DBW);
					}
				}
			}
		}

		if(len > 0 && tw >= 32 && th >= 4 && IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 32, 4))
		{
			int twa = tw & ~31;
			int tha = th & ~3;

			len -= tha * srcpitch;
			th -= tha;

			for(int j = 0; j < tha; j += 4)
			{
				for(int x = tx; x < twa; x += 32)
					SwizzleColumn4(ty, (BYTE*)&m_vm8[blockAddress4(x, ty&~15, BITBLTBUF.DBP, BITBLTBUF.DBW)>>1], src + (x - tx)/2, srcpitch);

				for(int i = 0; i < 4; i++, ty++, src += srcpitch)
				{
					BYTE* s = src + (twa - tx)/2;

					for(int x = twa; x < tw; x += 2, s++)
					{
						writePixel4(x, ty, *s&0xf, BITBLTBUF.DBP, BITBLTBUF.DBW),
						writePixel4(x+1, ty, *s>>4, BITBLTBUF.DBP, BITBLTBUF.DBW);
					}
				}
			}
		}

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		th += ty;

		if((DWORD_PTR)src & 0xf)
		{
			for(int y = ty; y < th; y += 16, src += srcpitch*16)
				for(int x = tx; x < tw; x += 32)
					SwizzleBlock4u((BYTE*)&m_vm8[blockAddress4(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)>>1], src + (x - tx)/2, srcpitch);
		}
		else
		{
			for(int y = ty; y < th; y += 16, src += srcpitch*16)
				for(int x = tx; x < tw; x += 32)
					SwizzleBlock4((BYTE*)&m_vm8[blockAddress4(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)>>1], src + (x - tx)/2, srcpitch);
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture4HL(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX)/2;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8);

	if(!fTopLeftAligned || (tw & 7) || (th & 7) || (len % srcpitch))
	{
		// TODO

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		__declspec(align(16)) DWORD block[8*8];

		th += ty;

		for(int y = ty; y < th; y += 8, src += srcpitch*8)
		{
			for(int x = tx; x < tw; x += 8)
			{
				BYTE* s = src + (x - tx)/2;
				DWORD* d = block;

				for(int j = 0; j < 8; j++, s += srcpitch, d += 8)
					for(int i = 0; i < 8/2; i++)
						d[i*2] = (s[i]&0x0f) << 24, 
						d[i*2+1] = (s[i]&0xf0) << 20;

				SwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], (BYTE*)block, sizeof(block)/8, 0x0f000000);
			}
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTexture4HH(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(TRXREG.RRW == 0) return;

	int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX)/2;
	int th = len / srcpitch;

	bool fTopLeftAligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8);

	if(!fTopLeftAligned || (tw & 7) || (th & 7) || (len % srcpitch))
	{
		// TODO

		SwizzleTextureX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
	}
	else
	{
		__declspec(align(16)) DWORD block[8*8];

		th += ty;

		for(int y = ty; y < th; y += 8, src += srcpitch*8)
		{
			for(int x = tx; x < tw; x += 8)
			{
				BYTE* s = src + (x - tx)/2;
				DWORD* d = block;

				for(int j = 0; j < 8; j++, s += srcpitch, d += 8)
					for(int i = 0; i < 8/2; i++)
						d[i*2] = (s[i]&0x0f) << 28, 
						d[i*2+1] = (s[i]&0xf0) << 24;

				SwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], (BYTE*)block, sizeof(block)/8, 0xf0000000);
			}
		}

		ty = th;
	}
}

void GSLocalMemory::SwizzleTextureX(int& tx, int& ty, BYTE* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
{
	if(len <= 0) return;

	BYTE* pb = (BYTE*)src;
	WORD* pw = (WORD*)src;
	DWORD* pd = (DWORD*)src;

	// if(ty >= (int)TRXREG.RRH) {ASSERT(0); return;}

	switch(BITBLTBUF.DPSM)
	{
	case PSM_PSMCT32:
		for(len /= 4; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pd++)
			writePixel32(tx, ty, *pd, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMCT24:
		for(len /= 3; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pb+=3)
			writePixel24(tx, ty, *(DWORD*)pb, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMCT16:
		for(len /= 2; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pw++)
			writePixel16(tx, ty, *pw, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMCT16S:
		for(len /= 2; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pw++)
			writePixel16S(tx, ty, *pw, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMT8:
		for(; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pb++)
			writePixel8(tx, ty, *pb, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMT4:
		for(; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pb++)
			writePixel4(tx, ty, *pb&0xf, BITBLTBUF.DBP, BITBLTBUF.DBW),
			writePixel4(tx+1, ty, *pb>>4, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMT8H:
		for(; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pb++)
			writePixel8H(tx, ty, *pb, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMT4HL:
		for(; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pb++)
			writePixel4HL(tx, ty, *pb&0xf, BITBLTBUF.DBP, BITBLTBUF.DBW),
			writePixel4HL(tx+1, ty, *pb>>4, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMT4HH:
		for(; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pb++)
			writePixel4HH(tx, ty, *pb&0xf, BITBLTBUF.DBP, BITBLTBUF.DBW),
			writePixel4HH(tx+1, ty, *pb>>4, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMZ32:
		for(len /= 4; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pd++)
			writePixel32Z(tx, ty, *pd, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMZ24:
		for(len /= 3; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pb+=3)
			writePixel24Z(tx, ty, *(DWORD*)pb, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMZ16:
		for(len /= 2; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pw++)
			writePixel16Z(tx, ty, *pw, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	case PSM_PSMZ16S:
		for(len /= 2; len-- > 0; SwizzleTextureStep(tx, ty, TRXPOS, TRXREG), pw++)
			writePixel16SZ(tx, ty, *pw, BITBLTBUF.DBP, BITBLTBUF.DBW);
		break;
	}
}

///////////////////

void GSLocalMemory::unSwizzleTexture32(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x-r.left)*4, dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture24(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			__declspec(align(16)) DWORD block[8*8];
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);
			ExpandBlock24(block, (DWORD*)ptr + (x-r.left), dstpitch, &TEXA);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture16(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 16, 8, 16)
		{
			__declspec(align(16)) WORD block[16*8];
			unSwizzleBlock16((BYTE*)&m_vm16[blockAddress16(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);
			ExpandBlock16(block, (DWORD*)ptr + (x-r.left), dstpitch, &TEXA);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture16S(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 16, 8, 16S)
		{
			__declspec(align(16)) WORD block[16*8];
			unSwizzleBlock16((BYTE*)&m_vm16[blockAddress16S(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);
			ExpandBlock16(block, (DWORD*)ptr + (x-r.left), dstpitch, &TEXA);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture8(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 16, 16, 8)
		{
			__declspec(align(16)) BYTE block[16*16];
			unSwizzleBlock8((BYTE*)&m_vm8[blockAddress8(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/16);

			BYTE* s = block;
			BYTE* d = ptr + (x-r.left)*4;

			for(int j = 0; j < 16; j++, s += 16, d += dstpitch)
				for(int i = 0; i < 16; i++)
					((DWORD*)d)[i] = m_pCLUT32[s[i]];
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture8H(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			__declspec(align(16)) DWORD block[8*8];
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);

			DWORD* s = block;
			BYTE* d = ptr + (x-r.left)*4;

			for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
				for(int i = 0; i < 8; i++)
					((DWORD*)d)[i] = m_pCLUT32[s[i] >> 24];
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 32, 16, 4)
		{
			__declspec(align(16)) BYTE block[(32/2)*16];
			unSwizzleBlock4((BYTE*)&m_vm8[blockAddress4(x, y, TEX0.TBP0, TEX0.TBW)>>1], (BYTE*)block, sizeof(block)/16);

			BYTE* s = block;
			BYTE* d = ptr + (x-r.left)*4;

			for(int j = 0; j < 16; j++, s += 32/2, d += dstpitch)
				for(int i = 0; i < 32/2; i++)
					((UINT64*)d)[i] = m_pCLUT64[s[i]];
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4HL(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			__declspec(align(16)) DWORD block[8*8];
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);

			DWORD* s = block;
			BYTE* d = ptr + (x-r.left)*4;

			for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
				for(int i = 0; i < 8; i++)
					((DWORD*)d)[i] = m_pCLUT32[(s[i] >> 24)&0x0f];
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4HH(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			__declspec(align(16)) DWORD block[8*8];
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);

			DWORD* s = block;
			BYTE* d = ptr + (x-r.left)*4;

			for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
				for(int i = 0; i < 8; i++)
					((DWORD*)d)[i] = m_pCLUT32[s[i] >> 28];
		}
		FOREACH_BLOCK_END
	}
}

///////////////////

void GSLocalMemory::ReadTexture(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA, GIFRegCLAMP& CLAMP)
{
	unSwizzleTexture st = m_psmtbl[TEX0.PSM].ust;
	readTexel rt = m_psmtbl[TEX0.PSM].rt;
	CSize bs = m_psmtbl[TEX0.PSM].bs;

	if(r.Width() < bs.cx || r.Height() < bs.cy 
	|| (r.left & (bs.cx-1)) || (r.top & (bs.cy-1)) 
	|| (r.right & (bs.cx-1)) || (r.bottom & (bs.cy-1)) 
	|| (CLAMP.WMS&2) || (CLAMP.WMT&2))
	{
		ReadTexture<DWORD>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
	}
	else
	{
		(this->*st)(r, dst, dstpitch, TEX0, TEXA);
	}
}

////////////////////

DWORD GSLocalMemory::readTexel16P(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return readPixel16(x, y, TEX0.TBP0, TEX0.TBW);
}

DWORD GSLocalMemory::readTexel16SP(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return readPixel16S(x, y, TEX0.TBP0, TEX0.TBW);
}

DWORD GSLocalMemory::readTexel8P(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return readPixel8(x, y, TEX0.TBP0, TEX0.TBW);
}

DWORD GSLocalMemory::readTexel8HP(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return readPixel8H(x, y, TEX0.TBP0, TEX0.TBW);
}

DWORD GSLocalMemory::readTexel4P(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return readPixel4(x, y, TEX0.TBP0, TEX0.TBW);
}

DWORD GSLocalMemory::readTexel4HLP(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return readPixel4HL(x, y, TEX0.TBP0, TEX0.TBW);
}

DWORD GSLocalMemory::readTexel4HHP(int x, int y, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	return readPixel4HH(x, y, TEX0.TBP0, TEX0.TBW);
}

///////////////////

void GSLocalMemory::unSwizzleTexture16P(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 16, 8, 16)
		{
			unSwizzleBlock16((BYTE*)&m_vm16[blockAddress16(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x-r.left)*2, dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture16SP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 16, 8, 16S)
		{
			unSwizzleBlock16((BYTE*)&m_vm16[blockAddress16S(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x-r.left)*2, dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture8P(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 16, 16, 8)
		{
			unSwizzleBlock8((BYTE*)&m_vm8[blockAddress8(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x-r.left), dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture8HP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			unSwizzleBlock8HP((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x-r.left), dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4P(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 32, 16, 4)
		{
			unSwizzleBlock4P((BYTE*)&m_vm8[blockAddress4(x, y, TEX0.TBP0, TEX0.TBW)>>1], ptr + (x-r.left), dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4HLP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			unSwizzleBlock4HLP((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x-r.left), dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4HHP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			unSwizzleBlock4HHP((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x-r.left), dstpitch);
		}
		FOREACH_BLOCK_END
	}
}

///////////////////

void GSLocalMemory::ReadTextureP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA, GIFRegCLAMP& CLAMP)
{
	unSwizzleTexture st = m_psmtbl[TEX0.PSM].ustP;
	readTexel rt = m_psmtbl[TEX0.PSM].rtP;
	CSize bs = m_psmtbl[TEX0.PSM].bs;

	if(r.Width() < bs.cx || r.Height() < bs.cy 
	|| (r.left & (bs.cx-1)) || (r.top & (bs.cy-1)) 
	|| (r.right & (bs.cx-1)) || (r.bottom & (bs.cy-1)) 
	|| (CLAMP.WMS&2) || (CLAMP.WMT&2))
	{
		switch(TEX0.PSM)
		{
		default:
		case PSM_PSMCT32:
		case PSM_PSMCT24:
			ReadTexture<DWORD>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
			break;
		case PSM_PSMCT16:
		case PSM_PSMCT16S:
			ReadTexture<WORD>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
			break;
		case PSM_PSMT8:
		case PSM_PSMT8H:
		case PSM_PSMT4:
		case PSM_PSMT4HL:
		case PSM_PSMT4HH:
			ReadTexture<BYTE>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
			break;
		}
	}
	else
	{
		(this->*st)(r, dst, dstpitch, TEX0, TEXA);
	}
}

///////////////////

void GSLocalMemory::unSwizzleTexture8NP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 16, 16, 8)
		{
			__declspec(align(16)) BYTE block[16*16];
			unSwizzleBlock8((BYTE*)&m_vm8[blockAddress8(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/16);

			BYTE* s = block;

			if(TEX0.CPSM == PSM_PSMCT32)
			{
				BYTE* d = ptr + (x-r.left)*4;
				for(int j = 0; j < 16; j++, s += 16, d += dstpitch)
					for(int i = 0; i < 16; i++)
						((DWORD*)d)[i] = m_pCLUT32[s[i]];
			}
			else
			{
				BYTE* d = ptr + (x-r.left)*2;
				for(int j = 0; j < 16; j++, s += 16, d += dstpitch)
					for(int i = 0; i < 16; i++)
						((WORD*)d)[i] = (WORD)m_pCLUT32[s[i]];
			}
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture8HNP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			__declspec(align(16)) DWORD block[8*8];
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);

			DWORD* s = block;

			if(TEX0.CPSM == PSM_PSMCT32)
			{
				BYTE* d = ptr + (x-r.left)*4;
				for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
					for(int i = 0; i < 8; i++)
						((DWORD*)d)[i] = m_pCLUT32[s[i] >> 24];
			}
			else
			{
				BYTE* d = ptr + (x-r.left)*2;
				for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
					for(int i = 0; i < 8; i++)
						((WORD*)d)[i] = (WORD)m_pCLUT32[s[i] >> 24];
			}
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4NP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 32, 16, 4)
		{
			__declspec(align(16)) BYTE block[(32/2)*16];
			unSwizzleBlock4((BYTE*)&m_vm8[blockAddress4(x, y, TEX0.TBP0, TEX0.TBW)>>1], (BYTE*)block, sizeof(block)/16);

			BYTE* s = block;

			if(TEX0.CPSM == PSM_PSMCT32)
			{
				BYTE* d = ptr + (x-r.left)*4;
				for(int j = 0; j < 16; j++, s += 32/2, d += dstpitch)
					for(int i = 0; i < 32/2; i++)
						((UINT64*)d)[i] = m_pCLUT64[s[i]];
			}
			else
			{
				BYTE* d = ptr + (x-r.left)*2;
				for(int j = 0; j < 16; j++, s += 32/2, d += dstpitch)
					for(int i = 0; i < 32/2; i++)
						((DWORD*)d)[i] = (DWORD)m_pCLUT64[s[i]];
			}
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4HLNP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			__declspec(align(16)) DWORD block[8*8];
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);

			DWORD* s = block;

			if(TEX0.CPSM == PSM_PSMCT32)
			{
				BYTE* d = ptr + (x-r.left)*4;
				for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
					for(int i = 0; i < 8; i++)
						((DWORD*)d)[i] = m_pCLUT32[(s[i] >> 24)&0x0f];
			}
			else
			{
				BYTE* d = ptr + (x-r.left)*2;
				for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
					for(int i = 0; i < 8; i++)
						((WORD*)d)[i] = (WORD)m_pCLUT32[(s[i] >> 24)&0x0f];
			}
		}
		FOREACH_BLOCK_END
	}
}

void GSLocalMemory::unSwizzleTexture4HHNP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA)
{
	#pragma omp parallel
	{
		#pragma omp for
		FOREACH_BLOCK_START(r, 8, 8, 32)
		{
			__declspec(align(16)) DWORD block[8*8];
			unSwizzleBlock32((BYTE*)&m_vm32[blockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block)/8);

			DWORD* s = block;

			if(TEX0.CPSM == PSM_PSMCT32)
			{
				BYTE* d = ptr + (x-r.left)*4;
				for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
					for(int i = 0; i < 8; i++)
						((DWORD*)d)[i] = m_pCLUT32[s[i] >> 28];
			}
			else
			{
				BYTE* d = ptr + (x-r.left)*2;
				for(int j = 0; j < 8; j++, s += 8, d += dstpitch)
					for(int i = 0; i < 8; i++)
						((WORD*)d)[i] = (WORD)m_pCLUT32[s[i] >> 28];
			}
		}
		FOREACH_BLOCK_END
	}
}

///////////////////

void GSLocalMemory::ReadTextureNP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA, GIFRegCLAMP& CLAMP)
{
	unSwizzleTexture st = m_psmtbl[TEX0.PSM].ustNP;
	readTexel rt = m_psmtbl[TEX0.PSM].rtNP;

	CSize bs = m_psmtbl[TEX0.PSM].bs;

	if(r.Width() < bs.cx || r.Height() < bs.cy 
	|| (r.left & (bs.cx-1)) || (r.top & (bs.cy-1)) 
	|| (r.right & (bs.cx-1)) || (r.bottom & (bs.cy-1)) 
	|| (CLAMP.WMS&2) || (CLAMP.WMT&2))
	{
		switch(TEX0.PSM)
		{
		default:
		case PSM_PSMCT32:
		case PSM_PSMCT24:
			ReadTexture<DWORD>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
			break;
		case PSM_PSMCT16:
		case PSM_PSMCT16S:
			ReadTexture<WORD>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
			break;
		case PSM_PSMT8:
		case PSM_PSMT8H:
		case PSM_PSMT4:
		case PSM_PSMT4HL:
		case PSM_PSMT4HH:
			switch(TEX0.CPSM)
			{
			default:
			case PSM_PSMCT32:
				ReadTexture<DWORD>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
				break;
			case PSM_PSMCT16:
			case PSM_PSMCT16S:
				ReadTexture<WORD>(r, dst, dstpitch, TEX0, TEXA, CLAMP, rt, st);
				break;
			}
			break;
		}
	}
	else
	{
		(this->*st)(r, dst, dstpitch, TEX0, TEXA);
	}
}

//

template<typename DstT> 
void GSLocalMemory::ReadTexture(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA, GIFRegCLAMP& CLAMP, readTexel rt, unSwizzleTexture st)
{
	// this function is not thread safe!

	CSize bs = m_psmtbl[TEX0.PSM].bs;

	CRect cr(
		(r.left + (bs.cx-1)) & ~(bs.cx-1), 
		(r.top + (bs.cy-1)) & ~(bs.cy-1), 
		r.right & ~(bs.cx-1), 
		r.bottom & ~(bs.cy-1));

	bool fAligned = ((DWORD_PTR)(dst + (cr.left-r.left)*sizeof(DstT)) & 0xf) == 0;

	if((CLAMP.WMS&2) || (CLAMP.WMT&2))
	{
		DWORD wms = CLAMP.WMS, wmt = CLAMP.WMT;
		DWORD minu = CLAMP.MINU, maxu = CLAMP.MAXU;
		DWORD minv = CLAMP.MINV, maxv = CLAMP.MAXV;

		switch(wms)
		{
		default: for(int x = r.left; x < r.right; x++) m_xtbl[x] = x; break;
		case 2: for(int x = r.left; x < r.right; x++) m_xtbl[x] = x < minu ? minu : x > maxu ? maxu : x; break;
		case 3: for(int x = r.left; x < r.right; x++) m_xtbl[x] = (x & minu) | maxu; break;
		}

		switch(wmt)
		{
		default: for(int y = r.top; y < r.bottom; y++) m_ytbl[y] = y; break;
		case 2: for(int y = r.top; y < r.bottom; y++) m_ytbl[y] = y < minv ? minv : y > maxv ? maxv : y;  break;
		case 3: for(int y = r.top; y < r.bottom; y++) m_ytbl[y] = (y & minv) | maxv; break;
		}

		if(fAligned && wms <= 2 && wmt <= 2)
		{
			// TODO: read clamped areas only once

			for(int y = r.top; y < cr.top; y++, dst += dstpitch)
				for(int x = r.left, i = 0; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(m_xtbl[x], m_ytbl[y], TEX0, TEXA);

			if(!cr.IsRectEmpty())
			(this->*st)(cr, dst + (cr.left - r.left)*sizeof(DstT), dstpitch, TEX0, TEXA);

			for(int y = cr.top; y < cr.bottom; y++, dst += dstpitch)
			{
				for(int x = r.left, i = 0; x < cr.left; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(m_xtbl[x], m_ytbl[y], TEX0, TEXA);
				for(int x = cr.right, i = x - r.left; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(m_xtbl[x], m_ytbl[y], TEX0, TEXA);
			}

			for(int y = cr.bottom; y < r.bottom; y++, dst += dstpitch)
				for(int x = r.left, i = 0; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(m_xtbl[x], m_ytbl[y], TEX0, TEXA);
		}
		else
		{
			for(int y = r.top; y < r.bottom; y++, dst += dstpitch)
				for(int x = r.left, i = 0; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(m_xtbl[x], m_ytbl[y], TEX0, TEXA);
		}
	}
	else
	{
		if(fAligned)
		{
			for(int y = r.top; y < cr.top; y++, dst += dstpitch)
				for(int x = r.left, i = 0; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(x, y, TEX0, TEXA);

			if(!cr.IsRectEmpty())
			(this->*st)(cr, dst + (cr.left - r.left)*sizeof(DstT), dstpitch, TEX0, TEXA);

			for(int y = cr.top; y < cr.bottom; y++, dst += dstpitch)
			{
				for(int x = r.left, i = 0; x < cr.left; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(x, y, TEX0, TEXA);
				for(int x = cr.right, i = x - r.left; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(x, y, TEX0, TEXA);
			}

			for(int y = cr.bottom; y < r.bottom; y++, dst += dstpitch)
				for(int x = r.left, i = 0; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(x, y, TEX0, TEXA);
		}
		else
		{
			for(int y = r.top; y < r.bottom; y++, dst += dstpitch)
				for(int x = r.left, i = 0; x < r.right; x++, i++)
					((DstT*)dst)[i] = (DstT)(this->*rt)(x, y, TEX0, TEXA);
		}
	}
}

//

HRESULT GSLocalMemory::SaveBMP(IDirect3DDevice9* pDev, LPCTSTR fn, DWORD bp, DWORD bw, DWORD psm, int w, int h)
{
	CComPtr<IDirect3DTexture9> pTexture;
	D3DLOCKED_RECT lr;
	HRESULT hr = pDev->CreateTexture(w, h, 1, 0, D3DFMT_A8R8G8B8, D3DPOOL_SYSTEMMEM, &pTexture, NULL);
	if(FAILED(hr) || FAILED(pTexture->LockRect(0, &lr, NULL, 0))) return E_FAIL;

	readTexel rt = m_psmtbl[psm].rt;
	GIFRegTEX0 TEX0;
	TEX0.TBP0 = bp;
	TEX0.TBW = bw;
	TEX0.PSM = psm;
	GIFRegTEXA TEXA;
	TEXA.AEM = 0;
	TEXA.TA0 = 0;
	TEXA.TA1 = 0x80;

	BYTE* p = (BYTE*)lr.pBits;

	for(int j = 0; j < h; j++, p += lr.Pitch)
		for(int i = 0; i < w; i++)
			((DWORD*)p)[i] = (this->*rt)(i, j, TEX0, TEXA);

	pTexture->UnlockRect(0);

	return D3DXSaveTextureToFile(fn, D3DXIFF_BMP, pTexture, NULL);
}




See more files for this project here

guliverkli

Home of VobSub, Media Player Classic (MPC) and other misc utils.

Project homepage: http://sourceforge.net/projects/guliverkli
Programming language(s): C,C++,PHP
License: other

  res/
    GSdx9.rc2
    hlsl_merge.fx
    hlsl_rb.fx
    hlsl_tfx.fx
    logo1.bmp
    ps11_en00.psh
    ps11_en01.psh
    ps11_en10.psh
    ps11_en11.psh
    ps11_tfx000.psh
    ps11_tfx010.psh
    ps11_tfx011.psh
    ps11_tfx1x0.psh
    ps11_tfx1x1.psh
    ps11_tfx200.psh
    ps11_tfx210.psh
    ps11_tfx211.psh
    ps11_tfx300.psh
    ps11_tfx310.psh
    ps11_tfx311.psh
    ps11_tfx4xx.psh
    ps14_en00.psh
    ps14_en01.psh
    ps14_en10.psh
    ps14_en11.psh
  GS.cpp
  GS.h
  GSCapture.cpp
  GSCapture.h
  GSCaptureDlg.cpp
  GSCaptureDlg.h
  GSHash.cpp
  GSHash.h
  GSLocalMemory.cpp
  GSLocalMemory.h
  GSPerfMon.cpp
  GSPerfMon.h
  GSRegs.cpp
  GSRenderer.cpp
  GSRenderer.h
  GSRendererHW.cpp
  GSRendererHW.h
  GSRendererNull.cpp
  GSRendererNull.h
  GSRendererSoft.cpp
  GSRendererSoft.h
  GSSettingsDlg.cpp
  GSSettingsDlg.h
  GSSoftVertex.cpp
  GSSoftVertex.h
  GSState.cpp
  GSState.h
  GSTables.cpp
  GSTables.h
  GSTextureCache.cpp
  GSTextureCache.h
  GSTransfer.cpp
  GSUtil.cpp
  GSUtil.h
  GSVertexList.cpp
  GSVertexList.h
  GSWnd.cpp
  GSWnd.h
  GSdx9.cpp
  GSdx9.def
  GSdx9.h
  GSdx9.icproj
  GSdx9.rc
  GSdx9.sln
  GSdx9.vcproj
  GSdx9_ic.sln
  GSdx9_vs2005.sln
  GSdx9_vs2005.vcproj
  resource.h
  stdafx.cpp
  stdafx.h
  x86-32.asm
  x86-64.asm
  x86.cpp
  x86.h