AllRGB Monte Carlo

// Bits per plane. Must be even and <=8, namely either 2, 4, 6, 8
#define CBIT 8

//Address scheme 4=fastest
#define ADR_SCHEME4
//Color Address scheme
#define CADR_SCHEME0
//#define CIDX3

// norm L1 or L2N or HIE
#define L1

// temperature range
#define BMIN (10.0/(1<<(CBIT)))
#define BMAX BMIN //2.0

#define TDIV 10
#define TOTALSTEP (TDIV*2)
#define OBSEVERY 2

// Steps for each temperature
#define MAXS (TOTALSTEP/TDIV)

/////////////////////////////////////////////////////
// no need to edit below

// Misc. consts
#define VMAX (1<<CBIT)
#define CBMASK (VMAX-1)

#define LBIT (CBIT+CBIT/2)
#define L (1<<LBIT)
#define L2 (L*L)
#define L2MASK (L2-1)
#define LMASK (L-1)
#define LMASKY (L2MASK ^ LMASK)

//Address conversion
// x, y ->3char
#define BTMOV(x, mask, sh) (((x)&mask)<<sh)
#define BTMOV2(x, mask, sh) (((x)&mask)>>sh)
// c00000 c0000 c000 c00  c0   c
//  8421 8421   8421 8421 8421 8421
//  yyxx yyxx   yyxx yyxx yyxx yyxx

#ifdef ADR_SCHEME0
#define XY2ADR(x,y) ((x)|((y)*L))
#define ADR2XY(a,x,y) { x=a&(L-1);y=a/L;}
#endif

// 4bit
#ifdef ADR_SCHEME4
#define XY2ADR(x,y) (\
    BTMOV(x,0xf,  0) | BTMOV(y,0xf,  4) | \
    BTMOV(x,0xf0, 4) | BTMOV(y,0xf0, 8) | \
    BTMOV(x,0xf00,8) | BTMOV(y,0xf00,12))
#define ADR2XY(a,x,y) {\
    x=BTMOV2((a),0x0f,0)  |BTMOV2((a),0xf00,4)  |BTMOV2((a),0xf0000,8);\
    y=BTMOV2((a),0xf0,4)  |BTMOV2((a),0xf000,8) |BTMOV2((a),0xf00000,12);}
#endif

// 3bit
#ifdef ADR_SCHEME3
#define XY2ADR(x,y) (\
    BTMOV(x,07,  0)  | BTMOV(y,07, 3)  |\
    BTMOV(x,070, 3)  | BTMOV(y,070,6)  |\
    BTMOV(x,0700,6)  | BTMOV(y,0700,9) |\
    BTMOV(x,07000,9) | BTMOV(y,07000,12))
#define ADR2XY(a,x,y) {\
    x=BTMOV2((a),07,0)  |BTMOV2((a),0700,3)   |BTMOV2((a),070000,6)  |BTMOV2((a),07000000,9);\
    y=BTMOV2((a),070,3) |BTMOV2((a),07000,6)  |BTMOV2((a),0700000,9) |BTMOV2((a),070000000,12);}
#endif

// 2bit
#ifdef ADR_SCHEME2
#define XY2ADR(x,y) (\
    BTMOV(x,0x3, 0) | BTMOV(y,0x3, 2) | BTMOV(x,0xc,2) | BTMOV(y,0xc,4) |\
    BTMOV(x,0x30,4) | BTMOV(y,0x30,6) | BTMOV(x,0xc0,6)|BTMOV(y,0xc0,8) |\
    BTMOV(x,0x300,8) | BTMOV(y,0x300,10) | BTMOV(x,0xc00,10)|BTMOV(y,0xc00,12) )
#define ADR2XY(a,x,y) {\
    x=BTMOV2(a,0x03,0)  |BTMOV2(a,0x30,2)   |BTMOV2(a,0x300,4)|\
      BTMOV2(a,0x3000,6)|BTMOV2(a,0x30000,8)|BTMOV2(a,0x300000,10);\
    y=BTMOV2(a,0x0c,2)  |BTMOV2(a,0xc0,4)   |BTMOV2(a,0xc00,6)|\
      BTMOV2(a,0xc000,8)|BTMOV2(a,0xc0000,10)|BTMOV2(a,0xc00000,12);}
#endif
//////////////////////////////////////////////////////////
#ifdef CADR_SCHEME4
#define rgb2idx(r,g,b) (\
	 BTMOV(r,0x0f,0) | BTMOV(g,0x0f,4)  | BTMOV(b, 0x0f,8) |\
	 BTMOV(r,0xf0,8) | BTMOV(g,0xf0,12) | BTMOV(b, 0xf0,16) \
	)
#endif

#ifdef CADR_SCHEME0
#define rgb2idx(r,g,b) ((r)|((g)<<CBIT)|((b)<<(CBIT*2)))
#endif
/////////////////////////////////////////////////////
// global variables block
typedef unsigned char rgbtype;
// main color data
rgbtype colr[L2],colg[L2], colb[L2];
rgbtype *col3[3]={colr, colg, colb};
// energy
long long ebest,  ene;
// color->pos index
#ifdef CIDX3
unsigned char cidx[L2*3];
#define CIDX(idx) (cidx[(idx)*3]|(cidx[(idx)*3+1]<<8)|(cidx[(idx)*3+2]<<16))
#define LETCIDX(idx, ad) { cidx[idx*3+0]=(ad&0xff); cidx[idx*3+1]=((ad>>8)&0xff); cidx[idx*3+2]=((ad>>16)&0xff);}
#else
int cidx[L2];
#define CIDX(idx) (cidx[idx]);
#define LETCIDX(idx, ad) { cidx[idx]=ad;}
#endif

//Magnetization
int mx[3][L], my[3][L];
double mcostab[L], msintab[L];
//for debug
long long n_mc, n_accp, n_recv;

int coldif[VMAX*2+2];
#ifdef HIE
#define ENE1(x,y) (coldif[(x)^(y)])
#endif

#ifdef LSQ
#define ENE1(x,y) (coldif[(x)-(y)+VMAX])
#endif

#ifdef L2N
#define ENE1(x,y) (((x)-(y))*((x)-(y)))
#endif

#ifdef L1
#define ENE1(x,y) (abs((x)-(y)))
#endif

// max possible 1-pixel energy
#define E1MAX (VMAX*VMAX*4)

// Metropolis threshold table
// energy difference of swapping 2 pixels can be as large as 2*E1MAX
static int thtab[E1MAX*2+10];

#include <math.h>
#include <string.h>

#include <stdio.h>
#include <stdlib.h>

#include <sys/time.h>
// for timing
int getusec()
{
    static int sec0=0,usec0=0;
    struct timeval tv;
    struct timezone tz;
    gettimeofday(&tv,&tz);
    int res=(tv.tv_sec-sec0)*1000000+(tv.tv_usec-usec0);
    sec0=tv.tv_sec;
    usec0=tv.tv_usec;
    return res;
}
///////////////////////////////////////////////////////////////////
// observation routine
// energy-corr
static int nE;
double edat[MAXS];
double mdat[MAXS];
double mkdat[LBIT+2][MAXS];
#ifdef HIE
// n-bit domain wall length
double ehie[CBIT][MAXS];
#endif

void init_obs(int ini, int k)
{
    int i,x,y;
    if (ini)
    {
	for(i=0;i<L;i++)
	{
	    double th = 2*M_PI*i/L;
	    mcostab[i]=cos(th);
	    msintab[i]=sin(th);
	}
    }

	for(i=0;i<L;i++)
    mx[k][i]=my[k][i]=0;

    for(y=0;y<L;y++){
	for(x=0;x<L;x++){
	    int a=(x|(y*L));// +2*L;
	    mx[k][x] += col3[k][a];
	    my[k][y] += col3[k][a];
	}}
}

double obs_mag(int plane, int k)
{
    int x;
    double cx=0, sx=0, cy=0, sy=0;

    init_obs(0, plane);

    for(x=0;x<L;x++)
    {
	int xx = (x*k)&(L-1);
	cx += mx[plane][x]*mcostab[xx]*1.0;
	sx += mx[plane][x]*msintab[xx]*1.0;
	cy += my[plane][x]*mcostab[xx]*1.0;
	sy += my[plane][x]*msintab[xx]*1.0;
    }
    cx = cx*cx;
    cy = cy*cy;
    sx = sx*sx;
    sy = sy*sy;

    return (cx+cy+sx+sy)/L2/L2;
}

void storee()
{
    if(nE==MAXS)exit(1);
    mdat[nE]=obs_mag(0,1) + obs_mag(1,1) +obs_mag(2,1);
    edat[nE] = ene * 1.0/L2;
    int k=1;
    int ix=0;
    while(L/k>=4)
    {
        mkdat[ix][nE]=obs_mag(0,k) + obs_mag(1,k) +obs_mag(2,k);
	k*=2;
	ix++;
    }
#ifdef HIE
    // sub domail wall ehie[CBIT][MAXS]
    int eh[CBIT+1], etab[VMAX];
    etab[0]= CBIT;
    for(k=0;k<CBIT;k++)
    {
	eh[k]=0;
	etab[1<<k]=k;
    }
    for(ix=0;ix<L2;ix++)
    {
	int x,y;
	ADR2XY(ix,x,y);
	int x1 = (x-1)&LMASK;
	int y1 = (y-1)&LMASK;
	int adl=XY2ADR(x1,y);
	int adu=XY2ADR(x,y1);
	int kk;
	for(kk=0;kk<3;kk++)
	{
	    eh[etab[ENE1(col3[kk][ix],col3[kk][adl])]]++;
	    eh[etab[ENE1(col3[kk][ix],col3[kk][adu])]]++;
	}
    }
    for(k=0;k<CBIT;k++) ehie[k][nE] = eh[k]/3.0/2.0/L2;
#endif

    nE++;
}

void CalcAve(double *e1, double *m1, double *cv, double *m2, double *ecor, double *mcor, double *mk)
{
    double e1s=0.0;
    double e2s=0.0;
    double ecs=0.0;
    double m1s=0.0;
    double m2s=0.0;
    double mcs=0.0;
    int i, s, k;

    for(k=0;k<LBIT;k++)mk[k]=0.0;

    for(i=0;i<nE;i++)
    {
	double e= edat[i];
	e1s+=e;
	e2s+=e*e;
	double m= mdat[i];
	m1s+=m;
	m2s+=m*m;

	for(k=0;k<LBIT;k++)mk[k] += mkdat[k][i];
    }
    e1s/=nE;
    e2s/=nE;
    e2s -= e1s*e1s;
    m1s/=nE;
    m2s/=nE;
    m2s -= m1s*m1s;

    for(k=0;k<LBIT;k++)mk[k]/=nE;

    int ns=0;
    for(s=1;s<=nE/4;s++)
    {
	double sum=0;
	for(i=0;i<nE-s;i++)
	    sum += (edat[i]-e1s)*(edat[i+s]-e1s);
	sum /= (nE-s);
	sum /= e2s;

	if(sum>0.0)ecs += -s/log(sum);
	ns++;
	if (sum<0.5)break;
    }
    ecs/=ns;

    ns=0;
    for(s=1;s<=nE/4;s++)
    {
	double sum=0;
	for(i=0;i<nE-s;i++)
	    sum += (mdat[i]-m1s)*(mdat[i+s]-m1s);
	sum /= (nE-s);
	sum /= m2s;

	if(sum>0.0) mcs += -s/log(sum);
	ns++;
	if (sum<0.5)break;
    }
    mcs/=ns;

    *e1 = e1s;
    *cv = e2s;
    *ecor = ecs * OBSEVERY;
    *m1 = m1s;
    *m2 = m2s;
    *mcor = mcs * OBSEVERY;
#ifdef HIE
    // sub domail wall ehie[LBIT][MAXS]
    for(k=0;k<CBIT;k++)
    {
        for(i=1;i<nE;i++) ehie[k][0] += ehie[k][i];
	ehie[k][0] /=nE;
    }
#endif
}

void init_th(double beta, int imax)
{
    int i;
    thtab[0] = imax/2;  // 50% chance for microcanonical change
    if (beta<0.0)
    {
	// quench case
	for(i=1;i<E1MAX*2+10;i++)
	    thtab[i] = 0;
	return;
    }
    for(i=1;i<E1MAX*2+10;i++)
    {
	thtab[i] = (int)(imax * exp( - i * beta));
    }
}


void init_cidx()
{
    //assume col is ready
    int i;
    for(i=0;i<L2;i++)
    {
	LETCIDX(i,0);
    }
    bool dup = false;
    for(i=0;i<L2;i++)
    {
	int r=colr[i];
	int g=colg[i];
	int b=colb[i];
	int idx =rgb2idx(r,g,b);
	int c0 = CIDX(idx);
	if (c0 !=0) dup=true;
	LETCIDX(idx,i);
    }

    if (dup)
    {
	printf("# init_cidx: colors are not unique\n");
	exit(1);
    }
    else
	printf("# init_cidx: check OK\n");
}


// dump color to a file
void dump(int idx, rgbtype *rr, rgbtype *gg, rgbtype *bb)
{
    bool do_stuff = true;

    if (do_stuff)
    {
	//init_cidx();
	char fn[128];
	sprintf(fn, "%04d.ppm",idx);
	FILE *fp;
	unsigned char *pbuf = (unsigned char *)malloc(L2*3);
	fp=fopen(fn,"w");
	if (fp==NULL)
	{
	    printf("cannot open %s for dump\n",fn);
	    exit(1);
	}
	int i;
	int vv = 255/(VMAX-1);
	printf("#Dump %s, color scale *%d Size %f Mb\n",fn, vv, L2*3e-6);
	int x,y;
	for(i=0;i<L2;i++)
	{
	    ADR2XY(i, x, y);
	    int ad= (x+y*L)*3;
	    pbuf[ad+0] = vv*rr[i];
	    pbuf[ad+1] = vv*gg[i];
	    pbuf[ad+2] = vv*bb[i];
	}
	fprintf(fp, "P6\n%d %d\n255\n", L,L);
	fwrite(pbuf,1,L2*3,fp);
	fclose(fp);
	free(pbuf);
    }
}

void dump_small(int idx, rgbtype *rr, rgbtype *gg, rgbtype *bb)
{
    bool do_stuff = true;
    if (do_stuff)
    {
	char fn[128];
	sprintf(fn, "ppms/%04d.ppm",idx);
	FILE *fp;

	int l00=720;
	if (l00>L)l00=480;
	if (l00>L)l00=320;
	if (l00>L)return;
	unsigned char *pbuf = (unsigned char *)malloc(l00*l00*3);
	int *ibuf = (int *)malloc(l00*l00*3*4);
	int *ibuf2 = (int *)malloc(l00*l00*4);
	int i;
	for(i=0;i<l00*l00;i++)
	{
	    ibuf2[i]=0;
	    ibuf[i*3+0]=0;
	    ibuf[i*3+1]=0;
	    ibuf[i*3+2]=0;
	}

	fp=fopen(fn,"w");
	if (fp==NULL)
	{
	    printf("cannot open %s for dump\n",fn);
	    exit(1);
	}
	int vv = 255/(VMAX-1);
	printf("#Dump %s, color scale *%d Size %f Mb\n",fn, vv, l00*l00*3e-6);

	//Calc
	for(i=0;i<L2;i++)
	{
	    int x,y;
	    ADR2XY(i,x,y);
	    int x2 = x*l00/L;
	    int y2 = y*l00/L;
	    int ad = x2+y2*l00;
	    ibuf2[ad]++;
	    ibuf[ad*3+0] += vv*rr[i];
	    ibuf[ad*3+1] += vv*gg[i];
	    ibuf[ad*3+2] += vv*bb[i];
	}
	for(i=0;i<l00*l00;i++)
	{
	    pbuf[i*3+0] = ibuf[i*3+0]/ibuf2[i];
	    pbuf[i*3+1] = ibuf[i*3+1]/ibuf2[i];
	    pbuf[i*3+2] = ibuf[i*3+2]/ibuf2[i];
	}

	fprintf(fp, "P6\n%d %d\n255\n", l00,l00);
	fwrite(pbuf,1,l00*l00*3,fp);
	fclose(fp);
	free(pbuf);
	free(ibuf);
	free(ibuf2);
    }
}

// resume from an image file
//  or import smaller size image and expand, add lower bit data
void load_ppm(char *fn)
{
    char buf[128];
    FILE *fp=fopen(fn, "r");
    if (fp==NULL)
    {
	printf("# cannot open file %s\n",fn);
	exit(1);
    }
    int ll;
    fgets(buf,99,fp);
    fgets(buf,99,fp);
    sscanf(buf, "%d", &ll);
    fgets(buf,99,fp);
    printf("#Loading %s L=%d\n",fn, ll);

    if (ll==L)
    {
	//resume
	printf("  #Same size data\n");
	unsigned char *pbuf = (unsigned char *)malloc(L2*3);
	fread(pbuf,1,L2*3,fp);
	fclose(fp);
	int vv = 255/(VMAX-1);
	printf("  # read complete, color scale /%d\n", vv);
	int i;
	int ad,x,y;
	for(i=0;i<L2;i++)
	{
	    x=i&(L-1);
	    y=i/L;
	    ad = XY2ADR(x,y);
	    unsigned char *c = &pbuf[i*3];
	    colr[ad] = c[0]/vv;
	    colg[ad] = c[1]/vv;
	    colb[ad] = c[2]/vv;
	}
	free(pbuf);
	return;
    }
    if (ll*ll==L)
    {
	// self-similar
	printf("  # sqrt(L) size data\n");
	unsigned char *pbuf = (unsigned char *)malloc(ll*ll*3);
	fread(pbuf,1,ll*ll*3,fp);
	fclose(fp);

	int v2 = (1<<(CBIT/2));
	int vv = 255/(v2-1);
	printf("  # read complete, color scale /%d\n", vv);

	int x,y;

	//block
	for(x=0;x<ll;x++){
	    for(y=0;y<ll;y++){
		int ad1 = (x+y*ll)*3;
		int r1=(pbuf[ad1 +0]/vv)<<(CBIT/2);
		int g1=(pbuf[ad1 +1]/vv)<<(CBIT/2);
		int b1=(pbuf[ad1 +2]/vv)<<(CBIT/2);

		int xx,yy;
		for(xx=0;xx<ll;xx++){
		    for(yy=0;yy<ll;yy++){
			int x2=xx;
			int y2=yy;
			if (x&1) x2=ll-1-xx;
			if (y&1) y2=ll-1-yy;
			int ad0 = (x2+y2*ll)*3;

			int r=pbuf[ad0 +0]/vv +r1;
			int g=pbuf[ad0 +1]/vv +g1;
			int b=pbuf[ad0 +2]/vv +b1;
			int ad1 = (x*ll+xx)+(y*ll+yy)*L;
			colr[ad1] = r;
			colg[ad1] = g;
			colb[ad1] = b;
		    }}
	    }}
	free(pbuf);
	return;
    }

    if (ll==L/8)
    {
	// expand CBIT-2 image
	printf("  # 1/8 size data\n");
	unsigned char *pbuf = (unsigned char *)malloc(L2*3/64);
	fread(pbuf,1,L2*3/64,fp);
	fclose(fp);
	printf("  # read complete\n");

	int ofs[8][8][3];
	int x,y;
	int vv = 255/(VMAX/4-1);

	// 8x8 optimum solution
	int ofsR[8][8];
	int ofsG[8][8];
	int ofsRy[8]={1,0,0,1,2,3,3,2};
	int ofsGx[8]={0,0,1,2,3,3,2,1};
	int ofsB[8][8]=
	{
	    {2,3,3,3,3,2,2,2},
	    {2,3,3,3,3,1,1,1},
	    {0,1,2,2,2,0,0,0},
	    {0,1,1,1,1,0,0,0},
	    {0,1,1,1,1,0,0,0},
	    {0,2,2,2,1,0,0,0},
	    {1,3,3,3,3,2,1,1},
	    {2,3,3,3,3,2,2,2}
	};

	// fine bit part
	for(x=0;x<8;x++){
	    for(y=0;y<8;y++){
		ofs[x][y][0] = x/2;
		ofs[x][y][1] = y/2;
		ofs[x][y][2] = (x&1) + (y&1)*2;
		ofsR[y][x]=ofsRy[y];
		ofsG[y][x]=ofsGx[y];
	    }}

	// repeat small image
	//block
	for(x=0;x<8;x++){
	    for(y=0;y<8;y++){
		int r0 =ofsR[x][y]*4;
		int g0 =ofsG[x][y]*4;
		int b0 =ofsB[x][y]*4;
		int xx,yy;
		for(xx=0;xx<L/8;x++){
		    for(yy=0;yy<L/8;yy++){
			int x2=xx;
			int y2=yy;
			if (x&1) x2=L/8-1-xx;
			if (y&1) y2=L/8-1-yy;
			int ad0 = (x2+y2*(L/8))*3;

			int r=pbuf[ad0 +0]/vv +r0;
			int g=pbuf[ad0 +1]/vv +g0;
			int b=pbuf[ad0 +2]/vv + b0;

			int adr=(x*ll+xx)+(y*ll+yy)*L;
			colr[adr]=r;
			colg[adr]=g;
			colb[adr]=b;
		    }}
	    }}

	free(pbuf);
	return;
    }
    printf("#Couldn't load %s\n",fn);
    exit(1);
}

// L1-norm energy
int ene1aux(int ad, rgbtype *d)
{
    int x,y;
    ADR2XY(ad,x,y);
    int x1 = (x-1)&LMASK;
    int y1 = (y-1)&LMASK;
    int adl=XY2ADR(x1,y);
    int adu=XY2ADR(x,y1);
    int val= d[ad];

    int res = 
	+ENE1(d[adl],val) //left
	+ENE1(d[adu], val);//up
    return res;
}

//L1 norm energy

int getene1(int ad)
{
    return ene1aux(ad, colr)+ene1aux(ad,colg)+ene1aux(ad,colb);
}

// total energy
long long calc_ene()
{
    int i;
    long long res=0;

    for(i=0;i<L2;i++)
    {
	res += getene1(i);
    }
    return res;
}

// standard Metropolis step, attempt swapping pixel

void sweep_c(int step, int dc, int plane)
{
    int r1,g1,b1, k;
    rgbtype neibuf[VMAX][4];
    rgbtype cnow[VMAX], icnow[VMAX];
    rgbtype *nei[VMAX][4];
    int adbuf[VMAX];
    int adnei[VMAX][4];

    rgbtype *val = col3[plane];

#ifdef TM
    int tt1=0,tt2=0,tt3=0, tt1a=0,tt1b=0,tt1c=0;
#endif

    // Outer loop
    for(b1=0;b1<VMAX;b1++){
	for(g1=0;g1<VMAX;g1++){

#ifdef TM
	    getusec();
#endif
	    ///////////////////////////////////
	    // Phase 1: Init buffer
	    for(r1=0;r1<VMAX;r1++)
	    {
		int c1;
		switch(plane)
		{
		    case 0:c1 = rgb2idx(r1,g1,b1);break;
		    case 1:c1 = rgb2idx(g1,r1,b1);break;
		    case 2:c1 = rgb2idx(g1,b1,r1);break;
		}
		int ad = CIDX(c1);
		adbuf[r1]=ad;
		cnow[r1]=r1;
		icnow[r1]=r1;

		// Calc neighbor Address
		int x,y;
		ADR2XY(ad,x,y);
		int xL = (x-1)&LMASK;
		int xR = (x+1)&LMASK;
		int yU = (y-1)&LMASK;
		int yD = (y+1)&LMASK;
		int adL=XY2ADR(xL,y);
		int adR=XY2ADR(xR,y);
		int adU=XY2ADR(x,yU);
		int adD=XY2ADR(x,yD);

		adnei[r1][0] = adL;
		adnei[r1][1] = adR;
		adnei[r1][2] = adU;
		adnei[r1][3] = adD;
	    }
#ifdef TM
	    // init end
	    tt1a += getusec();
#endif
	    // Fetch neighbor values
	    for(r1=0;r1<VMAX;r1++)
	    {
		for(k=0;k<4;k++) neibuf[r1][k]= val[adnei[r1][k]];
	    }
#ifdef TM
	    tt1b += getusec();
#endif

	    // Check address
	    for(r1=0;r1<VMAX;r1++)
	    {
		val[adbuf[r1]]++;
	    }

	    for(r1=0;r1<VMAX;r1++)
	    {
		for(k=0;k<4;k++)
		{
		    int ad1=adnei[r1][k];
		    if(neibuf[r1][k] != val[ad1])
		    {
			rgbtype v=neibuf[r1][k];
			nei[r1][k] = &cnow[v];
		    }
		    else nei[r1][k] = &neibuf[r1][k];
		}
	    }
#ifdef TM
	    // init end
	    tt1c +=getusec();
#endif

	    // Begin main loop
	    int s;
	    for(s=0;s<step;s++)
	    {
		int di = ((rand()>>6)%dc)+1;
		//int ix1= (rand()>>6)%(VMAX-di);
		//int ix2= ix1+ di;
		//int val1=cnow[ix1];
		//int val2=cnow[ix2];

		int val1= (rand()>>6)%(VMAX-di);
		int val2= val1 + di;
		int ix1 = icnow[val1];
		int ix2 = icnow[val2];

		// target address
		rgbtype *c2 = &cnow[ix2];

#ifdef L2N
		// l2-norm
		int sj1 = *nei[ix1][0]+*nei[ix1][1]+*nei[ix1][2]+*nei[ix1][3];
		int sj2 = *nei[ix2][0]+*nei[ix2][1]+*nei[ix2][2]+*nei[ix2][3];
		if ( nei[ix1][0]==c2 || nei[ix1][1]==c2 || nei[ix1][2]==c2 || nei[ix1][3]==c2)
		{
		    sj1 -= val2;
		    sj2 -= val1;
		} 
		int de = (sj2-sj1)*(val2-val1); 
#else
		int n11=*nei[ix1][0];
		int n12=*nei[ix1][1];
		int n13=*nei[ix1][2];
		int n14=*nei[ix1][3];

		int n21=*nei[ix2][0];
		int n22=*nei[ix2][1];
		int n23=*nei[ix2][2];
		int n24=*nei[ix2][3];

		int e1now = ENE1(val1,n11)+ENE1(val1,n12)+ENE1(val1,n13)+ENE1(val1,n14);
		int e2now = ENE1(val2,n21)+ENE1(val2,n22)+ENE1(val2,n23)+ENE1(val2,n24);

		int e1new = ENE1(val2,n11)+ENE1(val2,n12)+ENE1(val2,n13)+ENE1(val2,n14);
		int e2new = ENE1(val1,n21)+ENE1(val1,n22)+ENE1(val1,n23)+ENE1(val1,n24);

		// undo double count
		if ( nei[ix1][0]==c2 || nei[ix1][1]==c2 || nei[ix1][2]==c2 || nei[ix1][3]==c2)
		{
		    int dup=ENE1(val1,val2);
		    e1now -=dup;
		    e2now -=dup;
		}

		// energy increase by swapping
		int de = e1new+e2new - e1now-e2now; //(sj2-sj1)*(val2-val1); 
#endif
		bool accp=true;
		if (de>=0)
		{
		    if (rand() > thtab[de]) accp=false;
		}

		n_mc++;
		if (accp)
		{
		    //swap
		    cnow[ix1] = val2;
		    cnow[ix2] = val1;
		    icnow[val2] = ix1;
		    icnow[val1] = ix2;
		    ene += de;
		    n_accp++;
		}
	    }//Loop

#ifdef TM
	    tt2 += getusec();
#endif
	    //Phase 3: Finish data
	    for(r1=0;r1<VMAX;r1++)
	    {
		int r2 = cnow[r1];
		val[adbuf[r1]] = r2;
		int c1;
		switch(plane)
		{
		    case 0:c1 = rgb2idx(r2,g1,b1);break;
		    case 1:c1 = rgb2idx(g1,r2,b1);break;
		    case 2:c1 = rgb2idx(g1,b1,r2);break;
		}
		LETCIDX(c1, adbuf[r1]);
	    }
#ifdef TM
	    tt3+=getusec();
#endif
	}}

#ifdef TM
    printf("#MC P%d %d (%f %f %f) %f %f\n", plane, step, tt1a*1e-6,tt1b*1e-6,tt1c*1e-6,tt2*1e-6,tt3*1e-6);
#endif
}

void sweep_x(int dc, int plane)
{
    int ad1;
    rgbtype *col = col3[plane];

    for(ad1=0;ad1<L2;ad1++)
    {
	int x1,y1;
	ADR2XY(ad1,x1,y1);
	int r1=colr[ad1];
	int g1=colg[ad1];
	int b1=colb[ad1];
	int c1=rgb2idx(r1,g1,b1);
	int val1=col[ad1];
	int val2=val1;

	int r2,g2,b2;
	r2=r1;g2=g1;b2=b1;
	if(dc==1)
	{
	    int rn=(rand()>>10)&1;
	    int sig=rn*2-1;
	    val2 = val1+sig;
	}
	else
	{
	    int rn=(rand()>>10)&(2*dc-1);
	    int sig=(rn&1)*2-1;
	    rn = 1+rn/2;
	    val2=val1+sig*rn;
	}
        if(val2<0 || val2>=VMAX)continue;
	if(plane==0) r2=val2;
	else
	if(plane==1) g2=val2;
	else b2=val2;

	int c2=rgb2idx(r2,g2,b2);
	int ad2=CIDX(c2);
	int x2,y2;
	ADR2XY(ad2,x2,y2);

	int x1L = (x1-1)&LMASK;
	int x1R = (x1+1)&LMASK;
	int y1U = (y1-1)&LMASK;
	int y1D = (y1+1)&LMASK;
	int ad1L=XY2ADR(x1L,y1);
	int ad1R=XY2ADR(x1R,y1);
	int ad1U=XY2ADR(x1,y1U);
	int ad1D=XY2ADR(x1,y1D);

	int x2L = (x2-1)&LMASK;
	int x2R = (x2+1)&LMASK;
	int y2U = (y2-1)&LMASK;
	int y2D = (y2+1)&LMASK;
	int ad2L=XY2ADR(x2L,y2);
	int ad2R=XY2ADR(x2R,y2);
	int ad2U=XY2ADR(x2,y2U);
	int ad2D=XY2ADR(x2,y2D);

#if defined(L2N) || defined(L2HEX)
	int sj1,sj2;
	int pl;
	sj1= col[ad1L] +  col[ad1R] +  col[ad1U] +  col[ad1D];
	sj2= col[ad2L] +  col[ad2R] +  col[ad2U] +  col[ad2D];
	if ( ad1==ad2L || ad1==ad2R || ad1==ad2U || ad1==ad2D)
	{
	    sj1-=val2;
	    sj2-=val1;
	} 
#ifdef L2HEX
	sj1 = (CHEX+1)*sj1 +CHEX*neicbuf[ix1];
	sj2 = (CHEX+1)*sj2 +CHEX*neicbuf[ix2];
#endif
	//int de = (r2-r1)*(sj2[0]-sj1[0]) + (g2-g1)*(sj2[1]-sj1[1]) +(b2-b1)*(sj2[2]-sj1[2]);
	int de = (val2-val1)*(sj2-sj1);
#else
	int n1L=col[ad1L];
	int n1R=col[ad1R];
	int n1U=col[ad1U];
	int n1D=col[ad1D];
	int n2L=col[ad2L];
	int n2R=col[ad2R];
	int n2U=col[ad2U];
	int n2D=col[ad2D];

	int e1now = ENE1(val1,n1L)+ENE1(val1,n1R)+ENE1(val1,n1U)+ENE1(val1,n1D);
	int e2now = ENE1(val2,n2L)+ENE1(val2,n2R)+ENE1(val2,n2U)+ENE1(val2,n2D);

	int e1new = ENE1(val2,n1L)+ENE1(val2,n1R)+ENE1(val2,n1U)+ENE1(val2,n1D);
	int e2new = ENE1(val1,n2L)+ENE1(val1,n2R)+ENE1(val1,n2U)+ENE1(val1,n2D);
	// undo double count
	if ( ad1L==ad2 || ad1R==ad2 || ad1U==ad2 || ad1D==ad2)
	{
	    int dup=ENE1(val1,val2);
	    e1now -=dup;
	    e2now -=dup;
	}

	// energy increase by swapping
	int de = e1new+e2new - e1now-e2now; //(sj2-sj1)*(val2-val1); 
#endif

	bool accp=true;
	if (de>=0)
	{
	    if (rand() > thtab[de]) accp=false;
	}

	n_mc++;
	if (accp)
	{
	    //swap
	    col[ad1]=val2;
	    col[ad2]=val1;

	    LETCIDX(c1, ad2);
	    LETCIDX(c2, ad1);

	    ene += de;
	    n_accp++;
	}
    }//Loop
}

int  main(int argc, char **argv)
{

    int i;
    //test address
    for(i=0;i<L2;i++)
    {
	int x,y,a2;
	ADR2XY(i,x,y);
	a2=XY2ADR(x,y);
	//printf("i %x X %x Y %x A %x\n",i,x,y,a2);
	if(i!=a2)exit(1);
    }

    bool do_init_stuff = true;
#ifdef HIE
    for(i=0;i<VMAX;i++)
    {
	coldif[i]=0;
	int j;
	for(j=1;j<VMAX;j*=2)
	    if(i&j) coldif[i]=j;
    }
#endif
#if LSQ
    for(i=0;i<=VMAX;i++)
    {
	coldif[VMAX+i]=(int)(sqrt(i*1.0*VMAX));
	coldif[VMAX-i]=(int)(sqrt(i*1.0*VMAX));
    }
#endif
    if (do_init_stuff)
    {
	//init
	for(i=0;i<L2;i++)
	{
	    int r=i &CBMASK;
	    int g=(i>>CBIT) &CBMASK;
	    int b=(i>>(CBIT*2)) &CBMASK;
	    colr[i] = r;
	    colg[i] = g;
	    colb[i] = b;
	}
	if(argc==2)
	{
	    load_ppm(argv[1]);
	}
	else
	{
	    printf("#Begin randomize\n");
	    for(i=0;i<L2*3;i++)
	    {
		int i1 = (rand()>>4)&L2MASK;
		int i2 = (rand()>>4)&L2MASK;
		rgbtype c;
		c=colr[i1];
		colr[i1]=colr[i2];
		colr[i2]=c;
		c=colg[i1];
		colg[i1]=colg[i2];
		colg[i2]=c;
		c=colb[i1];
		colb[i1]=colb[i2];
		colb[i2]=c;
	    }
	    printf("# Finished randomize\n");
	}
    }// init stuff

    init_obs(1,0);
    init_cidx();

    ene = calc_ene();
#ifdef L2N
    ene/=2;
#endif


    int t,s;

    FILE *fp;
    if (do_init_stuff)
    {
	fp = fopen("elog","a");
	fprintf(fp, "\n");
    }

    int dc=16;//VMAX/2;

    //Begin
    for(t=0;t<TDIV;t++)
    {
	double beta = BMIN * exp( log(BMAX/BMIN)*t/TDIV );
	init_th(beta, RAND_MAX);
	nE=0;
	n_accp=n_mc=0;

	getusec();
	for(s=0;s<MAXS;s++)
	{
#if 0
	    sweep_x(dc, 0);
	    sweep_x(dc, 1);
	    sweep_x(dc, 2);
#else
	    sweep_c(VMAX*2, dc, 0);
	    sweep_c(VMAX*2, dc, 1);
	    sweep_c(VMAX*2, dc, 2);
#endif
	    if((s % OBSEVERY)== OBSEVERY-1)storee();
	}
	int tt=getusec();
	fflush(stdout);

	double ee1, cv, tc, m1,m2,mc;
	double mk[LBIT];
	CalcAve(&ee1, &m1, &cv, &m2, &tc, &mc, mk);
	if (do_init_stuff)
	{
	    fprintf(fp, "%e %f %e %f %f %f %f Acp %f (%d) 1kMC=%f sec\n",
		    beta,ee1,cv,tc,m1,m2,mc, 100.0*n_accp/n_mc, dc, tt*1e-6/MAXS*1000);
	    fflush(fp);

	    FILE *mfp;
	    int kk;
#if 0
	    mfp=fopen("mlog","a");
	    fprintf(mfp, "%e ", beta);
	    for(kk=0;kk<LBIT-1;kk++)fprintf(mfp, "%e ",mk[kk]);
	    fprintf(mfp, "\n");
	    fclose(mfp);
#endif

#ifdef HIE
	    mfp=fopen("ehlog","a");
	    fprintf(mfp, "%e ", beta);
	    for(kk=CBIT-1;kk>=0;kk--)fprintf(mfp, "%e ", ehie[kk][0]);
	    fprintf(mfp, "\n");
	    fclose(mfp);
#endif
#if 0
	    FILE *fp2=fopen("alog","a");
	    fprintf(fp2,"\n\n#%d data\n", nE);
	    int i;
	    for(i=0;i<nE;i++)fprintf(fp2,"%e %e %e\n",beta, edat[i],mdat[i]);
	    fclose(fp2);
#endif
	}

	if( 100*n_accp/n_mc < 40 && dc!=1) dc/=2;
	else if( 100*n_accp/n_mc >75 && dc<=VMAX/4) dc*=2;
#if 1
	//Dump
	//dump_small(t, colr,colg,colb); 
	//dump(2, colr,colg,colb);
#endif
    }

    if(do_init_stuff)
    {
	fprintf(fp, "\n\n");
	fclose(fp);
	fp=fopen("ehlog","a");
	fprintf(fp, "\n\n");
	fclose(fp);
    }

    dump(2, colr,colg,colb);

}