/*******************************************************************
FFFF v3.2.3

Main code   : Daniele Paccaloni [daniele.paccaloni@dylogic.com]

MANY THANKS TO:
Steven Kortze (Fragment Programs OS X compatibility)
Peter Kankowski (SSE optimizations)
Richard Rauch (Linux porting hints)
falco[SCT] (K6 cmove bugfix)
Gerard Basler (3DNow! code and SSE optimizations)
Benjamin Lipchak (R3xx optimizations)
ATI developers support (special thanks to Jeff and Andy)
Andre Krause
Amichai Rothman
C.Kleinhuis
Chris Martin
Francesco D'Amico
Jean-Philippe Perois
John Dough
Luciano Genero
Mitch Wright
Stephan Grossklass [JGrossklass@t-online.de]
... and all the others who kindly sent code, fixes, suggestions and feedback !
*******************************************************************/

// Local stand-in for the CPU-count query: it unconditionally reports one processor.
int get_nprocs() {
  return 1;
}

// WARNING: This source is a real mess ! :)))
// WARNING: This is only meant as some "portable" glue for assembly.
// WARNING: Do NOT attempt to learn C/C++ from this code !

#ifdef _WIN32
  #include <windows.h>
#endif

#ifdef __APPLE__
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <pthread.h>
  #include <sched.h>
  #include <sys/select.h>
  #include <sys/types.h>
  #include <sys/sysctl.h>
  #include <OpenGL/gl.h>
  #include <GLUT/glut.h>
  #include <CoreServices/CoreServices.h>
#elif defined(sgi)
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <mutex.h>
  #include <pthread.h>
  #include <sched.h>
  #include <sys/time.h>
  #include <sys/select.h>
  #include <sys/types.h>
  #include <sys/sysctl.h>
  #include <sys/systeminfo.h>
#elif defined(__linux__)
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <pthread.h>
  #include <sched.h>
  #include <sys/time.h>
  #include <sys/select.h>
  #include <sys/types.h>
  #include <sys/sysctl.h>
//  #include <sys/sysinfo.h>
#else
  #include "GL/glut.h"
  #include "GL/gl.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "VertexProgramATI.h"

#ifndef __APPLE__
  #include "VertexProgramNV.h"
#endif

#include "FragmentProgramARB10.h"
// App includes
#include "PixelBuffer.h"

// Defines
// Render mode identifiers. One of these is stored in mode_ and passed around
// as the `mode` argument of the render/calc routines.
#define RENDER_MODE_GPU_VP    0   // GPU vertex program
#define RENDER_MODE_FPU_C     1   // plain compiled C row routine
#define RENDER_MODE_FPU_ASM   2   // hand-written FPU assembly row routine
#define RENDER_MODE_CPU_SSE   3   // SSE row routine (reported as AltiVec on PowerPC Macs)
#define RENDER_MODE_CPU_SSE2  4   // SSE2 row routine (reported as "MIPS dual FPU" on IRIX)
#define RENDER_MODE_CPU_3DNOW 5   // AMD 3DNow! row routine
#define RENDER_MODE_GPU_FP    9   // GPU fragment program

// Sentinel iteration count written to the pixel buffer when a point did not
// escape within the iteration limit; such pixels are drawn black.
#define ITER_BLACK 0xFFFFFFFF

// GLUT callbacks
void display(void);
void myReshape(int w, int h);
void NextFrame();
void processNormalKeys(unsigned char key, int x, int y);
void mousemove(int x, int y);
void mouseclick(int button, int down, int x, int y);
void idleFunc();

// App prototypes
void prepareWorldSpace(void);
void postRedisplay(void);
bool renderImage(unsigned int maxi, int mode);
bool calcPixelRow(int row, unsigned int maxi, int mode);
bool calcPixelRow_C(int row, unsigned int maxi);
bool calcPixelRow_FPU_ASM(int row, unsigned int maxi);
bool calcPixelRow_CPU_SSE(int row, unsigned int maxi);
bool calcPixelRow_CPU_SSE2(int row, unsigned int maxi);
bool calcPixelRow_CPU_3DNOW(int row, unsigned int maxi);
bool renderPixelRow(int row, int mode);
void prepareColorTable(int numColors, GLubyte startR, GLubyte startG, GLubyte startB);
bool checkSSE();
bool checkSSE2();
bool check3DNow();
void updateWinTitle();
void rotatePalette(int delta);
void runBenchmark();
void printHelp();

#if defined(sgi)
extern bool calcPixelRow_FPU_ASM_MIPS(unsigned int* rowBuffer, unsigned int maxi, unsigned int iter_black, int width, double cx, double cy, double sx);
extern bool calcPixelRow_FPU_ASM_MIPSR8000(unsigned int* rowBuffer, unsigned int maxi, unsigned int iter_black, int width, double cx, double cy, double sx);
#endif


#ifdef __APPLE__
// OS X
void* slaveThreadCode(void* lpParameter);
#  if __BIG_ENDIAN__
/**
 * PowerPC stand-in for the Win32 InterlockedDecrement().
 * Decrements *ptr with a load-reserve / store-conditional retry loop and
 * returns the decremented value that was stored.
 * @param ptr address of the counter to decrement
 * @return the value of the counter after the decrement
 */
volatile long InterlockedDecrement(volatile long* ptr) {
  volatile long tmp;
  __asm __volatile (
                    "1:     lwarx   %0, 0, %1\n"	// Load Word and Reserve Indexed
                    "       addic   %0, %0, -1\n"	// Add Immediate Carrying
                    "       stwcx.  %0, 0, %1\n"	// Store Word Conditional Indexed
                    "       bne-    1b"          	// Reservation lost: retry from label 1
                    : /* output */     "=&r" (tmp)
                    : /* input */      "r" (ptr)
                    : /* clobbered */  "cc", "memory");
                    return tmp;
}
#  else // __APPLE__ && __LITTLE_ENDIAN__
/**
 * Intel Mac stand-in for the Win32 InterlockedDecrement().
 * Atomically decrements *ptr and returns the decremented value.
 * @param ptr address of the counter to decrement
 * @return the value of the counter after this call's decrement
 */
long InterlockedDecrement(long* ptr) {
	// DecrementAtomic() returns the value held *before* the decrement.
	// Derive the result from that return value instead of re-reading *ptr:
	// a second, non-atomic read can observe another thread's decrement and
	// make two callers see the same (or a wrong) value.
	return DecrementAtomic(ptr) - 1;
}
#  endif  // __APPLE__ && __BIG_ENDIAN__
#elif defined (sgi)
// IRIX
void* slaveThreadCode(void* lpParameter);
/**
 * IRIX stand-in for the Win32 InterlockedDecrement().
 * Adds -1 to *ptr through test_then_add() and returns that call's result.
 * NOTE(review): test_then_add() presumably yields the value *before* the
 * addition, whereas the other platform variants in this file return the
 * decremented value - confirm against the IRIX <mutex.h> documentation and
 * the callers.
 * @param ptr address of the counter to decrement
 */
unsigned long InterlockedDecrement(unsigned long* ptr) {
  return test_then_add(ptr, -1);
}
#elif defined (__linux__)
// LINUX
void* slaveThreadCode(void* lpParameter);
/**
 * Linux stand-in for the Win32 InterlockedDecrement().
 * Atomically decrements *ptr and returns the decremented value.
 *
 * The previous hand-written "lock xaddl" sequence had no return statement,
 * clobbered eax without telling the compiler, lacked a memory clobber and
 * used 32-bit addressing/operands on a `long`, which breaks on x86-64.
 * The compiler builtin is a full barrier and is correct for any `long` width.
 *
 * @param ptr address of the counter to decrement
 * @return the value of the counter after the decrement
 */
volatile long InterlockedDecrement(volatile long* ptr) {
  return __sync_sub_and_fetch(ptr, 1);
}

#elif defined(WIN32)
// WIN32
DWORD WINAPI slaveThreadCode(LPVOID lpParameter);
#endif


// Currently active GPU program (vertex or fragment), or NULL when rendering on the CPU.
GPUProgram* vp_ = NULL;

// Application stuff
char* progname = "FFFF v3.2.3";   // Program name/version, printed at startup and used in titles
char winTitle_[256];              // Scratch buffer holding the formatted window title
long numCPU_ = 0;                 // Number of CPUs detected in main()
int* slices_ = NULL; // Slices array (only used if SMP available).
PixelBuffer* pixBuf_ = NULL;      // Per-pixel iteration counts, (re)allocated after a reshape
int debug = 0;
const char* glVersion_ = NULL; // OpenGL version
GLubyte* glArrColor_ = NULL;      // One row of RGB bytes handed to glColorPointer
GLfloat* glArrVertex_ = NULL;     // One row of (x, y) floats handed to glVertexPointer
int mode_ = RENDER_MODE_FPU_ASM; // Current render mode
bool useGLArrays_ = true;         // Draw rows with GL arrays instead of glBegin/glEnd
GLubyte* colorTable_ = NULL;      // RGB triplets indexed by iteration count (3 bytes per entry)
bool rendering_ = false;          // Set while display() is inside renderImage()
bool benchmarking_ = false;       // Set by the 'b' key; display() then runs the benchmark
bool reshaped_ = false;           // Set by myReshape(); tells renderImage() to reallocate buffers
bool avail_SSE = false;
bool avail_SSE2 = false;
bool avail_3DNow = false;
bool avail_VP = false;
bool avail_FP = false;

unsigned int maxi_ = 40;          // Iteration limit
// Corners of the viewed region of the plane: (ax_, ay_) .. (ex_, ey_)
double ax_ = -2.0f;
double ay_ = -1.5f;
double ex_ =  1.0f;
double ey_ =  1.5f;
double sx_, sy_;                  // Plane units per pixel along x and y
double four = 4.0f;

#if defined (sgi)
unsigned long slavesWorking_ = 0;
#else
long slavesWorking_ = 0;
#endif

#if defined(__APPLE__) || defined(sgi) || defined(__linux__)
// OS X, IRIX, Linux
pthread_t* threadIDs_;
long long sysFreq = 1000000;
// We have to simulate some Win32 stuff.
// Minimal emulation of the Win32 LARGE_INTEGER type: a 64-bit value that can
// also be viewed as two parts through the `u` member.
typedef union _LARGE_INTEGER {
  struct {
    unsigned long LowPart;
    long HighPart;
  } u;
  long long QuadPart;
} LARGE_INTEGER, *PLARGE_INTEGER;

/**
 * Emulation of the Win32 QueryPerformanceCounter() on top of gettimeofday().
 * Stores the current wall-clock time in microseconds in
 * lpPerformanceCount->QuadPart.
 * @param lpPerformanceCount receives the timestamp
 * @return always true
 */
bool QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount) {
  struct timeval tp;
  gettimeofday(&tp, NULL);
  // Widen before multiplying: with a 32-bit time_t/long the product
  // tv_sec*1000000 overflows.
  lpPerformanceCount->QuadPart = (long long) tp.tv_sec * 1000000LL + (long long) tp.tv_usec;
  return true;
}
#elif defined(WIN32)
// Win32 system stuff
SYSTEM_INFO   inf;
HANDLE finishSem_ = NULL;
HANDLE* threads_ = NULL;
DWORD* threadIDs_ = NULL;
LARGE_INTEGER sysFreqQuery;
LONGLONG sysFreq;
LONGLONG timing;
#endif

LARGE_INTEGER timeTmp0;
LARGE_INTEGER timeTmp1;
double timingf;

// GLUT stuff
int win_top;                     // Id returned by glutCreateWindow()
int w_; // Window width
int h_; // Window height
int mouseClickPos[2] = {0,0};    // Pointer position recorded at the last button event
int mousePos[2] = {0,0};         // Pointer position recorded at the last motion event
bool mouseDown[3] = {false, false, false}; // Button states: [0] left, [1] middle, [2] right
int glutKeyModif_ = 0;           // Modifier mask sampled in the keyboard callback
bool doublebuf_ = false;         // true: draw to the back buffer and swap; false: draw to the front buffer
double ax0_, ay0_, ex0_, ey0_;   // Copy of the view corners taken at the last button event


int main(int argc, char* argv[]) {
  printf(progname);
  printf("\n(C)1994-2006 Daniele Paccaloni (daniele.paccaloni@dylogic.com)\n");
  printf("Initalizing...\n");
  sprintf(winTitle_, "%s initializing...", progname);

  // System & Performance init.
  int rc = 0;
#ifdef __APPLE__
  // OS X
  int mib[2], value = 0; 
  size_t len = sizeof(value); 
  mib[0] = CTL_HW; 
  mib[1] = HW_NCPU;
  rc = sysctl(mib, 2, &value, &len, NULL, 0);
  if (rc == 0) numCPU_ = value;
  else numCPU_ = 1;
#elif defined (sgi)
  numCPU_ = sysconf(_SC_NPROC_ONLN);
#elif defined (__linux__)
  numCPU_ = get_nprocs();
#elif defined (WIN32)
  // WIN32
  GetSystemInfo(&inf);
  numCPU_ = inf.dwNumberOfProcessors;
#endif

  printf("Number of CPUs: %d\n", numCPU_);
  if (numCPU_ > 1)  {
    // Multiple CPUs available.
    printf("SMP support available, creating %d slave threads.\n", numCPU_-1);
    // Allocate and fill slices array.
    slices_ = (int*) malloc(numCPU_*sizeof(int*));
    for (int i=0; i<numCPU_; i++) slices_[i] = i;
  }
  // If more than 1 CPU, Create SMP structures
#if defined(__APPLE__) || defined(sgi) || defined(__linux__)
  // UNIX
  if (numCPU_ > 1)  {
    threadIDs_ = (pthread_t*) malloc((numCPU_-1)*sizeof(pthread_t));
    if (threadIDs_ == NULL) {
      printf("*ERROR* Failed to allocate slave threads pool.\n");
      numCPU_ = 1;
    }
    if (numCPU_ > 1) {
      // Create Threads
      pthread_attr_t attr;
      pthread_attr_init(&attr);
		pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
		pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); // Do not inherit scheduling from parent thread.
		struct sched_param sp;
		memset(&sp, 0, sizeof(sched_param));
		sp.sched_priority = 0;
		pthread_attr_setschedpolicy(&attr, SCHED_OTHER);
		pthread_attr_setschedparam(&attr, &sp);
		pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
		pthread_t* curThread = threadIDs_;
		for (int i=0; i<(numCPU_-1); i++) 
		{
			rc = pthread_create(curThread, &attr, slaveThreadCode, &slices_[i]);
			if (rc != 0) 
			{
				// ERROR creating thread.
				printf("PANIC: thread creation failed\n");
				return false;
			}
			curThread++;
		}
		pthread_attr_destroy(&attr);
	}
    else 
	{
		printf("Switching to MONO CPU mode.\n");
    }
  }
#else
  // WIN32
  if (numCPU_ > 1) 
  {
		threads_ = (HANDLE*) malloc((numCPU_-1)*sizeof(HANDLE*));
		threadIDs_ = (DWORD*) malloc((numCPU_-1)*sizeof(DWORD*));
		if ((threads_ == NULL) || (threadIDs_ == NULL)) 
		{
			printf("*ERROR* Failed to allocate calc thread pool.\n");
			numCPU_ = 1;
		}
		if (numCPU_ > 1) 
		{
			// Create Threads
			HANDLE* curThread = threads_;
			DWORD* curThreadID = threadIDs_;
			for (int i=0; i<(numCPU_-1); i++) 
			{
				*curThread = CreateThread(
					NULL,  // pointer to security attributes
					0,                         // initial thread stack size
					(LPTHREAD_START_ROUTINE) slaveThreadCode,     // pointer to thread function
					&slices_[i],                        // argument for new thread
					0*CREATE_SUSPENDED,                     // creation flags
					curThreadID                         // pointer to receive thread ID
					);
			}
		}
		else 
		{
			printf("Switching to MONO CPU mode.\n");
		}
	}
#endif
 
  //avail_SSE = checkSSE();
  avail_SSE = 1;
  if (avail_SSE)  {
#if defined(__APPLE__) && __BIG_ENDIAN__
    // PowerPC
    printf("AltiVec instructions supported. Switching to AltiVec quadpoints computation.\n");
#else
    // Intel x86
    printf("SSE instructions supported. Switching to SSE quadpoints computation.\n");
#endif
    mode_ = RENDER_MODE_CPU_SSE;
  }
  else {
    printf("Vector instructions NOT supported.\n");
    printf("Switching to machine code FPU mode.\n");
  }

  //avail_SSE2 = checkSSE2();
  avail_SSE2 = 1;
  if (avail_SSE2)  {
#if defined (sgi)
    printf("MIPS dual FPU units supported.\n");
    mode_ = RENDER_MODE_CPU_SSE2;
    printf(" Switching to R8000 opt dualpoints computation.\n");
#else
    printf("SSE2 instructions supported.\n");
#endif
  }
  else {
#if defined (sgi)
    printf("MIPS dual FPU units NOT available.\n");
#else
    printf("SSE2 instructions NOT supported.\n");
#endif
  }

  avail_3DNow = check3DNow();
  if (avail_3DNow) {
    printf("AMD 3DNow! instructions supported.");
    if (!avail_SSE) {
      mode_ = RENDER_MODE_CPU_3DNOW;
      printf(" Switching to 3DNow! quadpoints computation.\n");
    }
    else printf("\n");
  }
  else {
    printf("3DNow! instructions NOT available.\n");
  }


  if ((!avail_SSE) && (!avail_SSE2) && (!avail_3DNow)) {
#if defined(__APPLE__) &&__LITTLE_ENDIAN__	// This is default for Intel Macs due to assembler problem
    printf("Switching to FPU compiled C mode.\n");
    mode_ = RENDER_MODE_FPU_C;
#else													//	Apple G3s and earlier, also other non vector-enabled systems
    printf("Switching to machine code FPU mode.\n");
    mode_ = RENDER_MODE_FPU_ASM;
#endif
  }

  // Initialize GLUT
  glutInit(&argc, argv);

  glutInitWindowPosition(0, 0);
  glutInitWindowSize(500, 500);


  // Display mode
  glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
  // Create toplevel win
  win_top = glutCreateWindow(winTitle_);
  // Set win and icon titles
  glutSetWindowTitle(progname);
  updateWinTitle();
  
  glutSetIconTitle(progname);
  // Set myReshape function
  glutReshapeFunc(myReshape);
  // Set display function
  glutDisplayFunc(display);
  // Set idle function
  glutIdleFunc(NULL);
  // Set cursor
  glutSetCursor(GLUT_CURSOR_CROSSHAIR);
  // Set mouse callback
	glutMouseFunc(mouseclick);
  glutMotionFunc(mousemove);
  glutPassiveMotionFunc(mousemove);
  // Set keyboard callback
  glutKeyboardFunc(processNormalKeys);
  // Set idle function
  glutIdleFunc(idleFunc);

  //printf(" o: Toggle Object.\n");

  // OpenGL init
  glDisable(GL_LIGHTING);
  glDisable(GL_LIGHT0);
  glDisable(GL_DITHER);
  glDisable(GL_DEPTH_TEST);
  glShadeModel(GL_FLAT);
  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();
  glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
  glClear(GL_COLOR_BUFFER_BIT);
  glDisable(GL_NORMALIZE);
  glDrawBuffer(GL_FRONT);

  glVersion_ = (const char*) glGetString(GL_VERSION); // Get OpenGL Version
  printf("OpenGL v%s\n", glVersion_);
	printf("  Renderer: %s\n", glGetString(GL_RENDERER));
	printf("  Vendor: %s\n", glGetString(GL_VENDOR));
  useGLArrays_ = (glVersion_[0] >= '2') || (glVersion_[2] > '0');
  if (!useGLArrays_) {
    printf("*WARNING* OpenGL arrays not available. Render will be slower.\n");
  }
  // Print help
  printHelp();
	//printf("  Extensions: %s\n",glGetString(GL_EXTENSIONS));
  //printf("\nSeen anything faster ? Then... time to change your OpenGL card !\n");


  // Initialize colortable
  prepareColorTable(10000, 70, 50, 200);

  // Enter GLUT mainloop

  glutMainLoop();

  // exit
  return 0;
}

/**
 * GLUT idle callback.
 *
 * While the left or right mouse button is held, rescales the view around the
 * pointer position (left: factor 0.95, right: factor 1.05) and requests a
 * redraw. Otherwise it just sleeps for about 10 ms.
 */
void idleFunc() {
  if (!mouseDown[0] && !mouseDown[2]) {
    // No zoom button held: yield the CPU.
#ifdef _WIN32
    Sleep(10);
#else
    usleep(10000);
#endif
    return;
  }

  // The right button takes precedence when both are held.
  const double factor = mouseDown[2] ? 1.05 : 0.95;
  const int pointerX = mousePos[0];
  const int pointerY = mousePos[1];

  // Plane coordinates under the pointer before rescaling.
  const double anchorX = ax_ + sx_*((double)pointerX);
  const double anchorY = ay_ + sy_*((double)pointerY);

  sx_ *= factor;
  sy_ *= factor;

  // Rebuild the view corners so the anchor stays under the pointer.
  ax_ = anchorX - sx_*((double)pointerX);
  ay_ = anchorY - sy_*((double)pointerY);
  ex_ = ax_ + sx_*((double)w_);
  ey_ = ay_ + sy_*((double)h_);

  const bool onGpu = (mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP);
  if (onGpu) {
    prepareWorldSpace();
#ifdef _WIN32
    Sleep(10);
#else
    usleep(10000);
#endif
  }
  postRedisplay();
}

//------------------ OPENGL RENDER FUNCTION ------------------

/**
 * GLUT display callback: runs the benchmark first when one was requested,
 * then renders the image with the current iteration limit and mode, and
 * swaps buffers when double buffering is on.
 */
void display(void)
{
  if (benchmarking_) runBenchmark();

  // rendering_ is raised for the whole duration of the frame.
  rendering_ = true;
  renderImage(maxi_, mode_);
  if (doublebuf_) {
    glutSwapBuffers();
  }
  rendering_ = false;
}

//------------------ RESHAPE WINDOW ------------------

/**
 * GLUT reshape callback.
 * Records the new window size, recomputes the per-pixel step (the same step
 * is used on both axes), refreshes the title and viewport, and requests a
 * redraw.
 * @param w new window width in pixels
 * @param h new window height in pixels
 */
void myReshape(int w, int h)
{
  // Tells renderImage() that its buffers must be reallocated.
  reshaped_ = true;

  w_ = w;
  h_ = h;

  // The horizontal extent fixes the scale; the vertical step copies it.
  sy_ = sx_ = (ex_ - ax_) / ((double) w_);

  sprintf(winTitle_, "%s %d*%d i=%d", progname, w_, h_, maxi_);
  updateWinTitle();

  glViewport(0, 0, w, h);
  prepareWorldSpace();

  // Render the image again at the new size.
  postRedisplay();
}

//------------------ USER INPUT HANDLERS ------------------

/**
 * GLUT motion callback (registered for both active and passive motion).
 * While the middle button is held, pans the view by the pointer displacement
 * since the last button event; always records the pointer position.
 * @param x pointer x in window coordinates
 * @param y pointer y in window coordinates
 */
void mousemove(int x, int y)
{
  // Displacement from the position stored at the last button event.
  const int dragX = mouseClickPos[0] - x;
  const int dragY = mouseClickPos[1] - y;

  if (mouseDown[1]) {
    // Offset the corners saved at click time by the drag, in plane units.
    ax_ = ax0_ + ((double)dragX)*sx_;
    ay_ = ay0_ + ((double)dragY)*sy_;
    ex_ = ex0_ + ((double)dragX)*sx_;
    ey_ = ey0_ + ((double)dragY)*sy_;

    const bool onGpu = (mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP);
    if (onGpu) {
      prepareWorldSpace();
    }
    postRedisplay();
  }

  mousePos[0] = x;
  mousePos[1] = y;
}


/**
 * GLUT mouse-button callback.
 * Updates the pressed state of the affected button and snapshots the current
 * view corners and pointer position.
 * @param button GLUT button identifier
 * @param down   GLUT_DOWN when pressed, anything else when released
 * @param x      pointer x in window coordinates
 * @param y      pointer y in window coordinates
 */
void mouseclick(int button, int down, int x, int y) {
  const bool pressed = (down == GLUT_DOWN);

  // mouseDown slots: [0] left, [1] middle, [2] right; other buttons are ignored.
  if (button == GLUT_LEFT_BUTTON) {
    mouseDown[0] = pressed;
  }
  else if (button == GLUT_MIDDLE_BUTTON) {
    mouseDown[1] = pressed;
  }
  else if (button == GLUT_RIGHT_BUTTON) {
    mouseDown[2] = pressed;
  }

  // Remember where this button event happened.
  mouseClickPos[0] = x;
  mouseClickPos[1] = y;

  // Snapshot of the view at the time of the event.
  ax0_ = ax_;
  ay0_ = ay_;
  ex0_ = ex_;
  ey0_ = ey_;
}

/**
 * GLUT keyboard callback.
 *
 * Key map (letters are case-insensitive because bit 0x20 is forced on):
 *   ESC       quit
 *   1..5      switch to a CPU render mode (3/4/5 only if the feature flag is set)
 *   9         switch to the ARB fragment program
 *   0         switch to a vertex program (NV first, except on Apple, then ATI)
 *   space     redraw
 *   - / +     decrease / increase the iteration limit (by 20 with SHIFT)
 *   * and /   rotate the palette
 *   d         toggle front/back buffer drawing
 *   s, h      stats placeholder / help
 *   b         run the benchmark in a 500x500 window
 *   r         reset the view and iteration limit
 *   o         plot the orbit of the point under the pointer
 *
 * @param key ASCII code of the pressed key
 * @param x   pointer x in window coordinates
 * @param y   pointer y in window coordinates
 */
void processNormalKeys(unsigned char key, int x, int y) 
{
	glutKeyModif_ = glutGetModifiers();
	if (key == 27) exit(0); // ESC to quit
	// Force bit 5 on: maps 'A'..'Z' onto 'a'..'z'; the digits and symbols tested below already have it set.
	key |= 0x20;
	if (key == '1') 
	{
		mode_ = RENDER_MODE_FPU_C;
		if (vp_ != NULL) { delete vp_; vp_ = NULL; };
		prepareWorldSpace();
		postRedisplay();
	}
	else if (key == '2') 
	{
		mode_ = RENDER_MODE_FPU_ASM;
		if (vp_ != NULL) { delete vp_; vp_ = NULL; };
		prepareWorldSpace();
		postRedisplay();
	}
	else if (key == '3') 
	{
		if (avail_SSE) 
		{
			mode_ = RENDER_MODE_CPU_SSE;
			if (vp_ != NULL) { delete vp_; vp_ = NULL; };
			prepareWorldSpace();
			postRedisplay();
		}
	}
	else if (key == '4') 
	{
		if (avail_SSE2) 
		{
			mode_ = RENDER_MODE_CPU_SSE2;
			if (vp_ != NULL) { delete vp_; vp_ = NULL; };
			prepareWorldSpace();
			postRedisplay();
		}
	}
	else if (key == '5') 
	{
		if (avail_3DNow) 
		{
			mode_ = RENDER_MODE_CPU_3DNOW;
			if (vp_ != NULL) { delete vp_; vp_ = NULL; };
			prepareWorldSpace();
			postRedisplay();
		}
	}
	else if (key == '9') 
	{
		// Delete previous GPUPrograms
		if (vp_ != NULL) { delete vp_; vp_ = NULL; };

		// Save maxi_ in case of failure
		unsigned int maxi_old = maxi_;

		// Try ARB Fragment Program
		// NOTE(review): any maxi_ above 10 becomes 20 here (values 11..19 are raised) - confirm this is intended.
		if (maxi_ > 10) maxi_ = 20;
		vp_ = new FragmentProgramARB10(maxi_, w_, h_, ax_, ay_, ex_, ey_);
		// Note: the indentation below is misleading; this if/else sits inside the '9' branch.
		if (vp_->isValid()) {
		mode_ = RENDER_MODE_GPU_FP;
		prepareWorldSpace();
		postRedisplay();
	}
    else 
	{
		// Program rejected: drop it and restore the previous iteration limit.
		delete vp_; vp_ = NULL;
		maxi_ = maxi_old;

		// Fragment Programs not supported on this card... please code it and tell me :)
		printf("FragmentPrograms not available or not supported on this card\n");

		//prepareWorldSpace();
		//postRedisplay();

		// Leaves without refreshing the window title.
		return;
		}
	}
	else if (key == '0') 
	{
		// Delete previous GPUPrograms
		if (vp_ != NULL) { delete vp_; vp_ = NULL; };

		// Save maxi_ in case of failure
		unsigned int maxi_old = maxi_;

#ifndef __APPLE__
		// Try with nVidia
		if (maxi_ > 60) maxi_ = 60;
		vp_ = new VertexProgramNV(maxi_, w_, h_, ax_, ay_, ex_, ey_);
		if (vp_->isValid()) 
		{
		}
#else
		// We do not support nVidia custom extensions on the Mac, sorry.
		if (false);
#endif
		// This else pairs with whichever `if` the preprocessor kept above.
		else 
		{
			maxi_ = maxi_old;
			delete vp_; vp_ = NULL;

			// Try with ATI
			if (maxi_ > 10) maxi_ = 10; // This fixes the MORTAL PAUSE when starting using a long VP !!! ATI has resolved this bug.
			vp_ = new VertexProgramATI(maxi_, w_, h_, ax_, ay_, ex_, ey_);
			if (vp_->isValid()) 
			{
			}
			else 
			{
				delete vp_; vp_ = NULL;
				maxi_ = maxi_old;

				// Vertex Programs not supported on this card... please code it and tell me :)
				printf("VertexPrograms not available or not supported on this card\n");
        
				//prepareWorldSpace();
				//postRedisplay();
				return;
			}
		}
		// Reached only when one of the vertex programs reported itself valid.
		mode_ = RENDER_MODE_GPU_VP;
		prepareWorldSpace();
		postRedisplay();
	}
	else if (key == ' ') 
	{
		// Calculate image
		postRedisplay();
	}
	else if (key == '-') 
	{
		// Dec iters
		int nmaxi = maxi_;
		if (glutKeyModif_ & GLUT_ACTIVE_SHIFT) nmaxi -= 20;
		else nmaxi--;
		// Only accept the new limit while it stays positive.
		if (nmaxi > 0) maxi_ = nmaxi;
		if ((mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP)) 
		{
			vp_->initialize(maxi_, w_, h_, ax_, ay_, ex_, ey_);
		}
		postRedisplay();
	}
	else if (key == '+') 
	{
		// Inc iters
		int nmaxi = maxi_;
		if (glutKeyModif_ & GLUT_ACTIVE_SHIFT) nmaxi += 20;
		else nmaxi++;
		// Only accept the new limit while it stays below 9999.
		if (nmaxi < 9999) maxi_ = nmaxi;
		if ((mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP)) 
		{
			vp_->initialize(maxi_, w_, h_, ax_, ay_, ex_, ey_);
		}
		postRedisplay();
	}
	else if (key == '*') 
	{
		// Rotate palette right
		rotatePalette(+1);
	}
	else if (key == '/') 
	{
		// Rotate palette left
		rotatePalette(-1);
	}
	else if (key == 'd') 
	{
		// Toggle between drawing to the back buffer (with swap) and the front buffer.
		doublebuf_ = !doublebuf_;
		if (doublebuf_) 
		{
			glDrawBuffer(GL_BACK);
		}
		else 
		{
			glDrawBuffer(GL_FRONT);
		}	
		// Calculate image
		postRedisplay();
	}
	else if (key == 's') 
	{
		printf("Stats not yet supported.\n");
		//printf("  Total iters: %d\n", pixBuf_->getTotalIters(maxi_, ITER_BLACK));
	}
	else if (key == 'h') 
	{
		printHelp();
	}
	else if (key == 'b') 
	{
		// display() runs the benchmark when this flag is set.
		benchmarking_ = true;
		glutReshapeWindow(500, 500);
		// Calculate image
		postRedisplay();
	}
	else if (key == 'r') 
	{
		// Restore the startup view and iteration limit.
		maxi_ = 40;
		ax_ = -2.0f;
		ay_ = -1.5f;
		ex_ =  1.0f;
		ey_ =  1.5f;
		four = 4.0f;
		myReshape(w_, h_);
	}
	else if (key == 'o') 
	{
		// The orbit is drawn straight into the front buffer.
		doublebuf_ = false;
		glDrawBuffer(GL_FRONT);
		//printf("----------------- NEW ORBIT --------------------\n");
		// Display orbit
		// C row calculation routine
		// Calc vars
		double cx = ax_ + sx_*((double)x);
		double cy = ay_ + sy_*((double)y);
		double zx, zy;
		double zx2, zy2;
		// Calc Pixel
		zx = cx;
		zy = cy;
		glColor3d(1.0, 1.0, 1.0);
		glBegin(GL_POINTS);
		for (unsigned int i=0; i<maxi_; i++) 
		{
			//printf("Orbit: %1.15f\t%1.15f\n", zx, zy);
			// Map the orbit point back to window coordinates and plot it.
			glVertex2i((zx-ax_)/sx_, (zy-ay_)/sy_);
			zx2 = zx*zx;
			zy2 = zy*zy;
			if ((zx2 + zy2) > 4) break;
			zy = 2*zx*zy;
			zx = zx2 - zy2;
			zx += cx;
			zy += cy;
		}
		glEnd();
		glFlush();
	}
	updateWinTitle();
}


//------------------ Render the image -------------------
bool renderImage(unsigned int maxi, int mode) {
  //printf("%f,%f %d\n", ax_, ay_, maxi);

  if (h_ == 0) return true;

  glClear(GL_COLOR_BUFFER_BIT);

  //printf("ax=%f, ay=%f, ex=%f, ey=%f\n", ax_, ay_, ex_, ey_);

  if (mode == RENDER_MODE_GPU_FP) {
  	glBegin(GL_QUADS);
		  glTexCoord4f(ax_, ay_, 1.0, ay_);
		  //glVertex2f(0,0);
		  glVertex2f(ax_,ay_);
		  glTexCoord4f(ex_, ay_, 1.0, ay_);
		  //glVertex2f(1,0);
		  glVertex2f(ex_, ay_);
		  glTexCoord4f(ex_, ey_, 1.0, ey_);
		  //glVertex2f(1,1);
		  glVertex2f(ex_, ey_);
		  glTexCoord4f(ax_, ey_, 1.0, ey_);
		  //glVertex2f(0,1);
		  glVertex2f(ax_, ey_);
  	glEnd();
    /*
  	glBegin(GL_QUADS);
		  glTexCoord4f(ax_, ay_, ay_, 1.0);
		  //glVertex2f(0,0);
		  glVertex2f(ax_,ay_);
		  glTexCoord4f(ex_, ay_, ay_, 1.0);
		  //glVertex2f(1,0);
		  glVertex2f(ex_, ay_);
		  glTexCoord4f(ex_, ey_, ey_, 1.0);
		  //glVertex2f(1,1);
		  glVertex2f(ex_, ey_);
		  glTexCoord4f(ax_, ey_, ey_, 1.0);
		  //glVertex2f(0,1);
		  glVertex2f(ax_, ey_);
  	glEnd();
    */
    glFlush();
    return true;
  }

  // Wait until sync == 0. If > 0, dec sync and begin execution
  while (slavesWorking_ != 0) {
#ifdef _WIN32
      Sleep(1);
#else
	  usleep(1000);	//	1000
#endif
  }

  if (mode == RENDER_MODE_GPU_VP) {
    for (int y=0; y<h_; y++) {
      renderPixelRow(y, mode);
    }
    glFlush();
    return true; // RETURN HERE IF CALCULATING WITH THE GPU !!!
  }
  else {
    // Ok use the CPUs, but first check for window reshaped !
    if (reshaped_) {
      reshaped_ = false;
      glDisableClientState(GL_VERTEX_ARRAY);
      glDisableClientState(GL_COLOR_ARRAY);
      glFinish(); // We don't want to realloc the arrays if the GPU is still using them.
      glArrColor_ = (GLubyte*) realloc(glArrColor_, w_*3*sizeof(GLubyte));
      if (glArrColor_ == NULL) {
        printf("ERROR: allocating row color buffer\n");
      }
      glArrVertex_ = (GLfloat*) realloc(glArrVertex_, w_*2*sizeof(GLfloat));
      if (glArrVertex_ == NULL) {
        printf("ERROR: allocating vertex buffer\n");
      }
      // Reallocate pixel buffer
      delete(pixBuf_); pixBuf_ = NULL;
      // In case of quad pixel calculation, pad buffer width to multiple of 4
      int w = w_;
      if (RENDER_MODE_CPU_SSE) w = (w+3) & 0xfffffffc;
      // In case of dual pixel calculation, pad buffer width to multiple of 2
      if (RENDER_MODE_CPU_SSE2 | RENDER_MODE_CPU_3DNOW) w = (w+1) & 0xfffffffe;
      pixBuf_ = new PixelBuffer(w, h_);
    }
  }

  // Get the fun !!!
  int y = 0;

  // Mark all rows as not calculated
  for (y=0; y<h_; y++) pixBuf_->setRowCalculated(y, false);

  // Start the slave threads (on SMP machines)
  int neededSlaves = numCPU_ - 1;
  int endRow = h_/numCPU_;
  // Check for threads to use (max one per row)
  if (neededSlaves > h_) {
    endRow = 1;
    neededSlaves = h_-1;
  }

#ifdef _WIN32
      Sleep(3);
#else
	  usleep(3000);
#endif
  // Start threads
  slavesWorking_ = neededSlaves;

  bool complete = false;

  // The master thread (this) calcs+render his slice
  for (y=0; y<endRow; y++) {
    if (!calcPixelRow(y, maxi, mode)) {
      printf("INTERNAL ERROR: Calculation routine failed.\n");
      complete = true;
      break;
    }
    //printf("Calculated row %d\n", y);
    renderPixelRow(y, mode);
  }

  // After calculating his slice, the master thread renders the slaves' ones
  while (!complete) {
    complete = true;
    for (y=0; y<h_; y++) {
      if (pixBuf_->isRowCalculated(y)) renderPixelRow(y, mode);
      else complete = false;
    }
  }
  //slavesWorking_ = 0;
  //while (slavesWorking_ != 0) Sleep(1);

  // Flush damn OpenGL queue (needed!)
  glFlush();
  return true;
}

/**
 * Render the given pixel row for the given mode
 * @return true if row calc was ok, false otherwise
 */
bool renderPixelRow(int row, int mode) {
  //if (!pixBuf_->isRowCalculated(row)) return false;

  if (mode == RENDER_MODE_GPU_VP) {
    double cx = ax_;
    double cy = ay_ + sy_*((double)row);
    if (useGLArrays_) {
      /* enable arrays client state */
      glEnableClientState(GL_VERTEX_ARRAY);
      GLfloat* arrV = glArrVertex_;
      for (int x=0; x<w_; x++) {
        //glVertex2d(cx, cy);
        *arrV++ = cx;
        *arrV++ = cy;
        cx += sx_;
      }
      glVertexPointer(2, GL_FLOAT, 0, glArrVertex_);
      glDrawArrays(GL_POINTS, 0, w_);
      /* disable arrays client state */
      glDisableClientState(GL_VERTEX_ARRAY);
    }
    else {
      glBegin(GL_POINTS);
      for (int x=0; x<w_; x++) {
        glVertex2d(cx, cy);
        cx += sx_;
      }
      glEnd();
    }
    return true;
  }

  if (pixBuf_ == NULL) return false;
  unsigned int* rowBuffer = pixBuf_->getRowPointer(row);
  if (rowBuffer == NULL) return false;
  //printf("Rendering row %d (%p)\n", row, rowBuffer);

  int i = 0;
  unsigned int c = 0;
  GLfloat x = 0.0f;

  if (useGLArrays_) {
    /* enable arrays client state */
    glEnableClientState(GL_COLOR_ARRAY);
    glEnableClientState(GL_VERTEX_ARRAY);
    GLubyte* arrC = glArrColor_;
    GLfloat* arrV = glArrVertex_;
    GLfloat frow = (GLfloat) row;
    for (i=0; i < w_; i++) {
      x += 1.0f;
      //printf("%u\n", i);
      //printf("%p\t%p\t%p\n", rowBuffer, arrC, arrV);
      c = *rowBuffer;
      //printf("c=%u", c); fflush(stdout);
      if (c == ITER_BLACK) {
        *arrC++ = (GLubyte) 0;
        *arrC++ = (GLubyte) 0;
        *arrC++ = (GLubyte) 0;
      }
      else {
        *arrC++ = colorTable_[c*3];
        *arrC++ = colorTable_[c*3+1];
        *arrC++ = colorTable_[c*3+2];
      }
      *arrV++ = x;
      *arrV++ = frow;
      rowBuffer++;
    }
    glColorPointer(3, GL_UNSIGNED_BYTE, 0, glArrColor_);
    glVertexPointer(2, GL_FLOAT, 0, glArrVertex_);
    glDrawArrays(GL_POINTS, 0, w_);
    /* disable arrays client state */
    glDisableClientState(GL_COLOR_ARRAY);
    glDisableClientState(GL_VERTEX_ARRAY);
  }
  else {
    glBegin(GL_POINTS);
    for (int x=0; x<w_; x++) {
      //printf("%d\n", i);
      i = *rowBuffer;
      if (i == ITER_BLACK) {
        glColor3d(0.0f, 0.0f, 0.0f);
      }
      else {
        glColor3ub(colorTable_[i*3], colorTable_[i*3+1], colorTable_[i*3+2]);
      }
      glVertex2i(x, row);
      rowBuffer++;
    }
    glEnd();
  }

  return true;
}

/**
 * Compute one pixel row with the routine matching the given mode and record
 * the outcome in the pixel buffer's "row calculated" flag.
 *
 * @param row  index of the row to compute
 * @param maxi iteration limit
 * @param mode one of the CPU RENDER_MODE_* constants
 * @return the selected routine's result; false for a mode with no CPU routine
 */
bool calcPixelRow(int row, unsigned int maxi, int mode) {
  // The row is flagged as calculated only after its routine has succeeded.
  pixBuf_->setRowCalculated(row, false);

  bool ok;
  switch (mode) {
    case RENDER_MODE_FPU_C:     ok = calcPixelRow_C(row, maxi);         break;
    case RENDER_MODE_FPU_ASM:   ok = calcPixelRow_FPU_ASM(row, maxi);   break;
    case RENDER_MODE_CPU_SSE:   ok = calcPixelRow_CPU_SSE(row, maxi);   break;
    case RENDER_MODE_CPU_SSE2:  ok = calcPixelRow_CPU_SSE2(row, maxi);  break;
    case RENDER_MODE_CPU_3DNOW: ok = calcPixelRow_CPU_3DNOW(row, maxi); break;
    default:                    ok = false;                             break;
  }

  if (ok) {
    pixBuf_->setRowCalculated(row, true);
  }
  // TODO: Row agent not yet supported
  return ok;
}

#pragma GCC optimization_level 3	//	optimize this for GCC. On a Dual G5 2.5 GHz machine, performance equal to FPU ASM routine!
/**
 * Compute one pixel row with the portable C routine.
 *
 * For every pixel of the row, iterates z = z*z + c starting from z = c and
 * stores the number of iterations completed before |z|^2 exceeded 4, or
 * ITER_BLACK when the limit was reached first.
 *
 * @param row  index of the row to compute
 * @param maxi iteration limit
 * @return always true
 */
bool calcPixelRow_C(int row, unsigned int maxi) {
  unsigned int* out = pixBuf_->getRowPointer(row);

  // c for the first pixel of the row; the real part advances by sx_ per pixel.
  double cRe = ax_;
  double cIm = ay_ + sy_*((double)row);

  for (int col=0; col<w_; col++) {
    double zRe = cRe;
    double zIm = cIm;
    unsigned int n = 0;

    while (n < maxi) {
      double zRe2 = zRe*zRe;
      double zIm2 = zIm*zIm;
      if ((zRe2 + zIm2) > 4) break;   // Escaped
      zIm = 2*zRe*zIm;
      zRe = zRe2 - zIm2;
      zRe += cRe;
      zIm += cIm;
      n++;
    }

    cRe += sx_;
    *out = (n == maxi) ? ITER_BLACK : n;
    out++;
  }

  return true;
}
#pragma GCC optimization_level reset	//	reset the optimization for GCC

/**
 * Calculate the given pixel row using ASM_FPU mode (scalar, double precision).
 *
 * One hand-written routine per platform, selected by the preprocessor:
 *  - PowerPC Mac: GCC inline asm, returns true.
 *  - Linux / Intel Mac: the x87 AT&T-syntax routine is present but disabled by
 *    an unconditional "return false" in front of it.
 *  - WIN32: MSVC inline x87 asm, returns true.
 *  - sgi: delegates to calcPixelRow_FPU_ASM_MIPS and returns its result.
 *  - anything else: returns false.
 *
 * Every routine stores, for each of the w_ pixels, the iteration count, or
 * ITER_BLACK when the count reached maxi.
 *
 * @param row   index of the row to compute
 * @param maxi  iteration bound per pixel
 * @return true if row calc was ok, false otherwise
 */
bool calcPixelRow_FPU_ASM(int row, unsigned int maxi) {
  unsigned int* rowBuffer = pixBuf_->getRowPointer(row);
  // Imaginary coordinate shared by every pixel of this row.
  double _cy = ay_ + sy_*((double)row);
  
#if defined(__APPLE__) && defined(__BIG_ENDIAN__)
  // Pretty speedy (but unoptimized PowerPC FPU code).
  // I guess with 2x unrolling we'll gain some extra speed.
  // The PowerPC FPU is nicely implemented.
  //
  // NOTE(review): every operand below is declared as an input, yet the code
  // writes rowBuffer, x, i, cx, zx, zy, zx2, zy2 and m2, and it also uses CTR
  // and stores to memory without listing them as clobbers. GCC's extended-asm
  // rules presumably require outputs/clobbers here -- confirm before touching
  // optimization settings.
  asm volatile (
    "NxtPix:                             \n"
    // Start each pixel with z = c.
    "        fmr %[zx], %[cx]            \n"
    "        fmr %[zy], %[cy]            \n"
    "                                    \n"
    // CTR counts the remaining iterations down from maxi.
    "        mtctr %[maxi]               \n"
    "NxtI:                               \n"
    "        fmul %[zx2], %[zx], %[zx]   \n"
    "        fmul %[zy2], %[zy], %[zy]   \n"
    "        fmul %[zy], %[zx], %[zy]    \n"
    "        fadd %[m2], %[zx2], %[zy2]  \n"
    "        fsub %[zx], %[zx2], %[zy2]  \n"
    "        fadd %[zy], %[zy], %[zy]    \n"
    // Leave the loop once zx^2 + zy^2 > 4.
    "        fcmpu cr0, %[m2], %[fourd]  \n"
    "        bgt- DonePix                \n"
    "        fadd %[zx], %[zx], %[cx]    \n"
    "        fadd %[zy], %[zy], %[cy]    \n"
    "        bdnz+ NxtI                  \n"
    "                                    \n"
    "DonePix:                            \n"
    // i = maxi - remaining CTR; substitute iter_black when i == maxi.
    "        mfctr %[i]                  \n"
    "        sub %[i], %[maxi], %[i]     \n"
    "        cmp cr0, %[i], %[maxi]      \n"
    "        bne+ NotBlack               \n"
    "        mr %[i], %[iter_black]      \n"
    "NotBlack:                           \n"
    // Store the pixel, then advance cx by sx and the output pointer by 4 bytes.
    "        stwx %[i], 0, %[rowBuffer]  \n"
    "        fadd %[cx], %[cx], %[sx]    \n"
    "        add %[rowBuffer], %[rowBuffer], %[four]\n"
    "        sub. %[x], %[x], %[one]     \n"
    "        bne+ NxtPix                 \n"
    : /* output */    //"=&r" (oldval), "=&r" (tmp), "=m" (*once_control)
    : /* input */
      [rowBuffer]  "r" (rowBuffer),
      [x]          "r" (w_),
      [maxi]       "r" (maxi),
      [iter_black] "r" (ITER_BLACK),
      [i]          "r" (0),
      [one]        "r" (1),
      [four]       "r" (4),
      [cx]         "f" (ax_),
      [cy]         "f" (_cy),
      [sx]         "f" (sx_),
      [zx]         "f" (0.0),
      [zy]         "f" (0.0),
      [zx2]        "f" (0.0),
      [zy2]        "f" (0.0),
      [m2]         "f" (0.0),
      [fourd]      "f" (4.0)
    : /* clobbered */ "cr0"
  );
  return true;
#elif defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__))
	// WOULD SOMEONE PLEASE TELL ME WHAT'S WRONG WITH THE CODE BELOW ?
	// I AM GETTING SICK OF THIS %#^&! AT&T SYNTAX.
	// REMOVE THE return false AND TEST IT PLEASE. [Daniele]
	//
	// NOTE(review): the asm below is a line-by-line transliteration of the
	// WIN32 routine further down. One candidate cause: GNU as documents that
	// its AT&T syntax swaps the fsub/fsubr (and fdiv/fdivr) mnemonics for the
	// register-destination forms, so "fsubrp %st, %st(1)" may not encode the
	// same operation as the Intel "fsubrp st(1),st" -- confirm against the
	// GNU as manual before enabling. The "fldl" loads also presume that the
	// global 'four' is a double -- verify its declaration.
	return false;
  // x86 for GCC lame syntax.
  // Unreachable while the return above is in place.
  __asm __volatile (
	//"finit;"
 	"movl	%[rowBuffer], %%esi;"
	"fldl	%[four];"
	"fldl	%[ax_];"
	"fldl	%[_cy];"
	"movl	%[maxi], %%edi;"
	"movl	%[w_], %%edx;"
"nxtpixF:"
	"fld	%%st(1);"
	"fld	%%st(1);"
	"xorl	%%ecx, %%ecx;"
"iterloopF:"
	"fld	%%st(1);"
	"fmul	%%st, %%st;"
	"fld	%%st(1);"
	"fmul	%%st, %%st;"
	"fxch	%%st(1);"
	"fld	%%st;"
	"fadd	%%st(2), %%st;"
	"fcomp	%%st(7);"
	"fnstsw	%%ax;"
	"sahf;"
	"fsubrp	%%st, %%st(1);"
	"jnc	donepixF;"
	"fadd	%%st(4), %%st;"
	"cmpl	%%edi, %%ecx;"
	"fxch	%%st(2);"
	"fadd	%%st, %%st;"
	"jz	donepixF;"
	"fmulp	%%st, %%st(1);"
	"addl	$1, %%ecx;"
	"fadd	%%st(2), %%st;"
	"jmp	iterloopF;"
"donepixF:"
	"fstp	%%st;"
	"fstp	%%st;"
	"fstp	%%st;"
	"cmpl	%%edi, %%ecx;"
	"fldl	%[sx_];"
	"jnz	notblackF;"
	"movl	$0xFFFFFFFF, %%ecx;" // ITER_BLACK constant.
"notblackF:"
	"movl	%%ecx, (%%esi);"
	"faddp	%%st, %%st(2);"
	"addl	$4, %%esi;"
	"subl	$1, %%edx;"
	"jnz	nxtpixF;"
	"fstp	%%st;"
	"fstp	%%st;"
	"fstp	%%st;"
    : /* output */ 
    : /* input */      [rowBuffer] "m" (rowBuffer), [four] "m" (four), [ax_] "m" (ax_), [_cy] "m" (_cy), [maxi] "m" (maxi), [w_] "m" (w_), [sx_] "m" (sx_)  
    : /* clobbered */  "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
  );
  return true;  
#elif defined(WIN32)
  // x86
  // Register use: esi = output pointer, edi = maxi, edx = pixels left in the
  // row, ecx = iteration count of the current pixel.
  _asm {
  
  finit           // This is required to avoid VisualC default lower FPU precision.

  mov esi, rowBuffer;

	fld	four        // STACK: 4
	fld	ax_         // STACK: ax 4
	fld	_cy         // STACK: ay ax 4

  mov	edi,maxi	  // edi = maxiters

	//mov	si,600
	mov	edx,w_

nxtpix:
	fld st(1)		    // STACK: cx ay ax 4
	fld st(1)		    // STACK: cy cx ay ax 4

	xor ecx,ecx     // First iter

iterloop:
	fld st(1)		    // STACK: zx cy cx ay ax 4
	fmul st,st      // STACK: zy*zx cy cx ay ax 4
	fld st(1)		    // STACK: zy zx*zx cy cx ay ax 4
	fmul st,st      // STACK: zy*zy zx*zx cy cx ay ax 4
	fxch st(1)      // STACK: zx*zx zy*zy cy cx ay ax 4
	fld st          // STACK: zx*zx zx*zx zy*zy cy cx ay ax 4
	fadd st,st(2)   // STACK: zx*zx+zy*zy zx*zx zy*zy cy cx ay ax 4

	fcomp st(7)		  // STACK: zx*zx zy*zy cy cx ay ax 4
	fnstsw ax
	sahf            // Check for modulo^2 > 4
	fsubrp st(1),st // STACK: zx*zx-zy*zy cy cx ay ax 4
	jnc	donepix
	fadd st,st(4)   // STACK: zx*zx-zy*zy+ay cy cx ay ax 4
	cmp ecx,edi     // Check for maxiters
	fxch st(2)      // STACK: zx*zx-zy*zy+ay cy cx ay ax 4
	fadd st,st      // STACK: 2*cy zx*zx-zy*zy+ay cx ay ax 4
	jz donepix

	fmulp st(1),st
 	add ecx, 1
	fadd st,st(2)
	jmp iterloop


donepix:
	// Drop the three per-pixel temporaries from the FPU stack.
	fstp	st
	fstp	st
	fstp	st
	cmp ecx,edi     // Check for maxiters
	fld	sx_
	jnz	notblack
  mov ecx,ITER_BLACK
notblack:
	mov		dword ptr [esi], ecx	//;SET PIXEL !
	faddp	st(2),st
	add	esi,4     ;Change to next pixBuf element

	sub	edx,1
	// NOTE(review): the label above is spelled "nxtpix"; presumably the MSVC
	// inline assembler resolves labels case-insensitively -- confirm.
	jnz	NxtPix

	// Row finished: drop the three row constants from the FPU stack.
	fstp	st
	fstp	st
	fstp	st
  }
  return true;
#elif defined(sgi)

  // The MIPS routine is implemented outside this function.
  return calcPixelRow_FPU_ASM_MIPS(rowBuffer, maxi, ITER_BLACK, w_, ax_, _cy, sx_);

#else
  // Unsupported.
  return false;
#endif
}


/**
 * Calculate the given pixel row using ASM_SSE mode (single precision, four
 * pixels per step).
 *
 * Platform routines:
 *  - PowerPC Mac: AltiVec inline asm, processes w_/4 quads.
 *  - Linux / Intel Mac: SSE inline asm (AT&T syntax), processes (w_+3)/4 quads.
 *  - WIN32: SSE inline asm (MSVC syntax), processes (w_+3)/4 quads.
 *  - anything else: returns false.
 *
 * The x86 routines round the row up to a multiple of four pixels and store
 * all four results of the last quad, so up to three entries past w_ are
 * written; the row buffer is presumably padded for that (verify against
 * PixelBuffer).
 *
 * NOTE(review): the AltiVec routine uses w_/4 quads, so it leaves w_%4 pixels
 * uncomputed and would loop far past the row when w_ < 4 -- confirm whether
 * callers guarantee w_ is a non-zero multiple of four on that platform.
 *
 * @param row   index of the row to compute
 * @param maxi  iteration bound per pixel
 * @return true if row calc was ok, false otherwise
 */
bool calcPixelRow_CPU_SSE(int row, unsigned int maxi) {
  unsigned int* rowBuffer = pixBuf_->getRowPointer(row);

  // Row constants, narrowed to single precision.
  float _cx = (float)ax_;
  float _cy = (float)(ay_ + sy_*((float)row));
  // Horizontal pixel step. This used to be read from sy_, which only worked
  // because sx_ and sy_ hold the same value everywhere else; the per-lane
  // offsets below must use the same step as the per-quad advance (qsx).
  float _sx = (float)sx_;

#if defined(__APPLE__) || defined(__linux__)
  float __attribute__ ((aligned(16))) cxs[] = {(float)_cx, (float)_cx, (float)_cx, (float)_cx};
  float __attribute__ ((aligned(16))) cys[] = {(float)_cy, (float)_cy, (float)_cy, (float)_cy};
  // Per-lane x offsets of the four pixels in a quad.
  float __attribute__ ((aligned(16))) coeffs[] = {0, (float)_sx, 2*((float)_sx), 3* ((float)_sx)};
  // Advance from one quad to the next.
  float __attribute__ ((aligned(16))) qsx[] = {4*((float)sx_), 4*((float)sx_), 4* ((float)sx_), 4*((float)sx_)};
  float __attribute__ ((aligned(16))) zeros[] = {0.0,0.0,0.0,0.0};
  float __attribute__ ((aligned(16))) four[] = {4.0,4.0,4.0,4.0};
  unsigned int __attribute__ ((aligned(16))) ones[] = {1,1,1,1};
  unsigned int __attribute__ ((aligned(16))) iter_black[] = {ITER_BLACK, ITER_BLACK, ITER_BLACK, ITER_BLACK};
  unsigned int __attribute__ ((aligned(16))) iter_max[] = {maxi, maxi, maxi, maxi};
#endif

#if defined(__APPLE__) && defined(__BIG_ENDIAN__)
  // Mac geek, this is what you are looking for.
  // PLEASE NOTE: This code is preliminary and not yet optimized.
  // Hope that gcc developers will add support for gcc style vector registers.
  //                                        [Daniele]
  //
  // NOTE(review): rowBuffer and x are declared as inputs but are written by
  // the asm; they presumably need to become "+r" outputs -- confirm.
  asm volatile (
                ".align 8                          \n"
                "        vxor v0, v0, v0             \n"
                "        lvx v1, 0, %[cxs]           \n" // cx
                "        lvx v2, 0, %[cys]           \n" // cy
                "        lvx v3, 0, %[coeffs]        \n"
                "        vaddfp v1, v1, v3           \n"
                "        lvx v4, 0, %[four]          \n"
                "        lvx v6, 0, %[qsx]           \n"
                "        lvx v7, 0, %[iter_black]    \n"
                "        lvx v15, 0, %[iter_max]     \n"
                "                                    \n"
                "NxtPixA:                            \n"
                "        mtctr %[maxi]               \n"
                "        vor v8, v1, v1              \n" // zx
                "        vor v9, v2, v2              \n" // zy
                "        vxor v20, v20, v20          \n" // Quad iters counter.
                "        lvx v21, 0, %[ones]         \n" // Quad iters incrementers.
                "NxtIA:                              \n"
                "        vmaddfp v10, v8, v8, v0     \n" // zx2
                "        vmaddfp v11, v9, v9, v0     \n" // zy2
                "        vmaddfp v9, v8, v9, v0      \n" // zy_ = zx * zy
                "        vaddfp v12, v10, v11        \n" // modulo2 = zx2 + zy2
                "        vsubfp v8, v10, v11         \n" // zx = zx2 - zy2
                "        vaddfp v9, v9, v9           \n" // zy = 2 * zx * zy
                "        vcmpgtfp. v13, v4, v12       \n" // Set iter mask (if all elements are in bailout then CR6[2] is set and the quad is done). 
                "        beq- cr6, DonePixA          \n"
                "        vand v21, v21, v13          \n" // Mask the incrementers.
                "        vadduwm v20, v20, v21       \n" // Inc quad iters.
                "        vaddfp v8, v8, v1           \n" // zx = zx + cx
                "        vaddfp v9, v9, v2           \n" // zy = zy + cy
                "        bdnz+ NxtIA                 \n"
                "                                    \n"
                "DonePixA:                           \n"
                // Replace counts equal to maxi with ITER_BLACK, then store the quad.
                "        vcmpequw v13, v20, v15      \n"
                "        vandc v20, v20, v13         \n"
                "        vand  v21, v7, v13          \n"
                "        vor v20, v20, v21           \n"
                "        stvx v20, 0, %[rowBuffer]   \n"
                "        vaddfp v1, v1, v6           \n" // Step to next quad pixels.
                "        add %[rowBuffer], %[rowBuffer], %[sixteen]\n"
                "        sub. %[x], %[x], %[one]     \n"
                "        bne+ NxtPixA                \n"
                : /* output */    //"=&r" (oldval), "=&r" (tmp), "=m" (*once_control)
                : /* input */
                [cxs]        "r" (cxs),
                [cys]        "r" (cys),
                [coeffs]     "r" (coeffs),
                [qsx]        "r" (qsx),
                [zeros]      "r" (zeros),
                [ones]       "r" (ones),
                [one]        "r" (1),
                [four]       "r" (four),
                [rowBuffer]  "r" (rowBuffer),
                [x]          "r" (w_/4),
                [maxi]       "r" (maxi),
                [iter_black] "r" (iter_black),
                [iter_max]   "r" (iter_max),
                [sixteen]    "r" (16)
                // v0, v7, CTR and memory are written above and are now declared too.
                : /* clobbered */ "cr0", "cr6", "ctr", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v15", "v20", "v21"
                );
  return true;  
#endif

#if defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__))
  // x86 for GCC lame syntax.
  // Register use matches the WIN32 routine below:
  //   esi = output pointer, edi = maxi, edx = quads left, ecx = iterations left
  //   xmm0 = zx, xmm1 = zy, xmm2/xmm3 = temporaries, xmm4 = 4.0 * iteration count
  //   xmm5 = 4.0 (bailout limit), xmm6 = cx of the four lanes, xmm7 = cy
  __asm __volatile (
	"movl	%[rowBuffer], %%esi;"
	"movl	%[maxi], %%edi;"
	"movl	%[w_], %%edx;"
	"addl	$3, %%edx;"
	"shrl	$2, %%edx;"
	"movss	%[_cx], %%xmm6;"
	"shufps	$0, %%xmm6, %%xmm6;"
	"movaps	%[coeffs], %%xmm5;"
	"addps	%%xmm5, %%xmm6;"
	"movss	%[_cy], %%xmm7;"
	"shufps	$0, %%xmm7, %%xmm7;"
	"movaps	%[four], %%xmm5;"
"nxtpixS:"
	"movaps	%%xmm6, %%xmm0;"
	"movaps	%%xmm7, %%xmm1;"
	"xorps	%%xmm4, %%xmm4;"
	"movl	%%edi, %%ecx;"
"iterloopS:"
	"movaps	%%xmm0, %%xmm2;"
	"mulps  %%xmm0, %%xmm0;"
	"movaps %%xmm1, %%xmm3;"
	"addps  %%xmm1, %%xmm1;"
	"mulps  %%xmm2, %%xmm1;"
	"movaps %%xmm0, %%xmm2;"
	"mulps  %%xmm3, %%xmm3;"
	"addps  %%xmm7, %%xmm1;"
	"subps  %%xmm3, %%xmm0;"
	"addps  %%xmm3, %%xmm2;"
	"cmpleps %%xmm5, %%xmm2;"
	"addps   %%xmm6, %%xmm0;"
	"movmskps %%xmm2, %%eax;"
	"testl	%%eax, %%eax;"
	"jz	donepixS;"
	"andps	%%xmm5, %%xmm2;"
	"addps	%%xmm2, %%xmm4;"
	"subl	$1, %%ecx;"
	"jnz	iterloopS;"
"donepixS:"
	// Each lane holds 4.0 * count: convert, divide by four, map maxi to black.
	"cvtss2si %%xmm4, %%ecx;"
	"movl	$0xffffffff, %%eax;"
	"shrl	$2, %%ecx;"
	"cmpl	%%edi, %%ecx;"
	"cmovel	%%eax, %%ecx;"
	"movl	%%ecx, (%%esi);"
	"shufps	$0xe5, %%xmm4, %%xmm4;"
	"cvtss2si %%xmm4, %%ecx;"
	"shrl	$2, %%ecx;"
	"cmpl	%%edi, %%ecx;"
	"cmovel	%%eax, %%ecx;"
	"movl	%%ecx, 4(%%esi);"
	"shufps	$0xe6, %%xmm4, %%xmm4;"
	"cvtss2si %%xmm4, %%ecx;"
	"shrl	$2, %%ecx;"
	"cmpl	%%edi, %%ecx;"
	"cmovel	%%eax, %%ecx;"
	"movl	%%ecx, 8(%%esi);"
	"shufps	$0xe7, %%xmm4, %%xmm4;"
	"cvtss2si %%xmm4, %%ecx;"
	"shrl	$2, %%ecx;"
	"cmpl	%%edi, %%ecx;"
	"cmovel	%%eax, %%ecx;"
	"movl	%%ecx, 12(%%esi);"
	"addl	$16, %%esi;"
	"subl	$1, %%edx;"
	"addps	%[qsx], %%xmm6;"
	"jnz	nxtpixS;"
    : /* output */ 
    : /* input */      [rowBuffer] "m" (rowBuffer), [maxi] "m" (maxi), [w_] "m" (w_), [_cx] "m" (_cx), [_cy] "m" (_cy), [coeffs] "m" (*coeffs), [four] "m" (*four), [qsx] "m" (*qsx)
    // "memory": the pixels are stored through esi, which is not an operand.
    : /* clobbered */  "%eax", "%ecx", "%edx", "%esi", "%edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc"
  );
  return true;  

#endif

  
#if defined(WIN32)
  __declspec(align(64)) float coeffs[] = {0, (float)_sx, 2*((float)_sx), 3* ((float)_sx)};
  __declspec(align(64)) float qsx[] = {4*((float)sx_), 4*((float)sx_), 4* ((float)sx_), 4*((float)sx_)};
  __declspec(align(64)) float four[] = {4,4,4,4};

	// xmm0 re
	// xmm1 im
	// xmm2 tmp1
	// xmm3 tmp2
	// xmm4 count
	// xmm5 zmax
	// xmm6 _cx
	// xmm7 _cy

	_asm
	{
		//femms;
		mov esi, rowBuffer;

		mov edi, maxi	  // edi = maxiters

		mov edx,w_      // edx = num of pixels
		add edx,3       // pad row to quadpixel boundary (multiple of four)
		shr edx,2       // edx = num of quadpixels

		movss xmm6, _cx // xmm6 = ?,?,?,cx
		shufps xmm6, xmm6, 0 // xmm6 = cx,cx,cx,cx
		movaps xmm5, dword ptr [coeffs] // xmm5 = 0,sx,2*sx,3*sx
		addps xmm6, xmm5 // xmm6 = zx0,zx1,zx2,zx3
		movss xmm7, _cy // xmm7 = ?,?,?,cy
		shufps xmm7, xmm7, 0 // xmm7 = cy,cy,cy,cy
		movaps xmm5, dword ptr [four] // xmm5 = 4,4,4,4

    align 16

    // Peter Kankowski's optimized SSE routine.
nxtpix:
    movaps xmm0, xmm6
    movaps xmm1, xmm7
    xorps  xmm4, xmm4       // zero quadpixel iters counter
    mov ecx, edi            // ecx = iterations left
iterloop:
    // xmm0 = zx             xmm1 = zy
    movaps xmm2, xmm0
    mulps  xmm0, xmm0
    movaps xmm3, xmm1
    addps  xmm1, xmm1
    // xmm0 = zx^2           xmm1 = 2 * zy     xmm2 = zx           xmm3 = zy
    mulps  xmm1, xmm2
    movaps xmm2, xmm0
    mulps  xmm3, xmm3
    // xmm0 = zx^2           xmm1 = 2*zy*zx    xmm2 = zx^2         xmm3 = zy^2
    addps  xmm1, xmm7
    subps  xmm0, xmm3
    addps  xmm2, xmm3
    // xmm0 = zx^2 - zy^2    xmm1=2*zy*zx+py   xmm2 = zx^2 + zy^2  xmm3 = zy^2
    cmpleps xmm2, xmm5
    addps   xmm0, xmm6
    movmskps eax, xmm2
    test eax, eax           // all four lanes bailed out?
    jz donepix
    andps   xmm2, xmm5      // xmm4 += (xmm2 <= 4.0) ? 4.0 : 0.0;
    addps   xmm4, xmm2
    sub ecx, 1
    jnz iterloop

donepix:
		// Each lane holds 4.0 * count: convert, divide by four, map maxi to black.
		cvtss2si ecx, xmm4
		mov eax, ITER_BLACK
		shr ecx, 2
		cmp ecx, edi
		cmove ecx, eax
		mov	dword ptr [esi],ecx	;SET PIXEL !
		shufps xmm4, xmm4, 0xe5
		cvtss2si ecx, xmm4
		shr ecx, 2
		cmp ecx, edi
		cmove ecx, eax
		mov	dword ptr [esi+4],ecx	;SET PIXEL !
		shufps xmm4, xmm4, 0xe6
		cvtss2si ecx, xmm4
		shr ecx, 2
		cmp ecx, edi
		cmove ecx, eax
		mov	dword ptr [esi+8],ecx	;SET PIXEL !
		shufps xmm4, xmm4, 0xe7
		cvtss2si ecx, xmm4
		shr ecx, 2
		cmp ecx, edi
		cmove ecx, eax
		mov	dword ptr [esi+12],ecx	;SET PIXEL !

		add	esi,16     //Change to next pixBuf elements
		sub	edx,1      // Decrease number of quadpixels to compute
		addps xmm6, dword ptr [qsx] // xmm6 = next quadpixels cx[]

		jnz	nxtpix
		//femms;
	}
	// End of calculation routine
        return true;
#endif
  // No routine for this platform.
  return false;
}


/**
 * Calculate the given pixel row using ASM_SSE2 mode (double precision, two
 * pixels per step).
 *
 * Platform routines:
 *  - Linux / Intel Mac: SSE2 inline asm (AT&T syntax).
 *  - WIN32: SSE2 inline asm (MSVC syntax).
 *  - sgi: delegates to calcPixelRow_FPU_ASM_MIPSR8000 and returns its result.
 *  - anything else: returns false.
 *
 * The x86 routines process (w_+1)/2 pixel pairs and store both results of the
 * last pair, so one entry past w_ is written when w_ is odd; the row buffer
 * is presumably padded for that (verify against PixelBuffer).
 *
 * @param row   index of the row to compute
 * @param maxi  iteration bound per pixel
 * @return true if row calc was ok, false otherwise
 */
bool calcPixelRow_CPU_SSE2(int row, unsigned int maxi) {
  unsigned int* rowBuffer = pixBuf_->getRowPointer(row);

  // Row constants.
  double _cx = (double)ax_;
  double _cy = (double)(ay_ + sy_*((double)row));
  // Horizontal pixel step. This used to be read from sy_, which only worked
  // because sx_ and sy_ hold the same value everywhere else; the per-lane
  // offset below must use the same step as the per-pair advance (dsx).
  double _sx = (double)sx_;

#if defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__))
  // Per-lane x offsets of the two pixels in a pair.
  double __attribute__ ((aligned(16))) coeffs[] = {0, (double)_sx};
  // Advance from one pair to the next.
  double __attribute__ ((aligned(16))) dsx[] = {2.0*((double)sx_), 2.0*((double)sx_)};
  double __attribute__ ((aligned(16))) four[] = {4.0,4.0};

  // x86 for GCC lame syntax.
  // Register use matches the WIN32 routine below:
  //   esi = output pointer, edi = maxi, edx = pairs left, ecx = iterations left
  //   xmm0 = zx, xmm1 = zy, xmm2/xmm3 = temporaries, xmm4 = 4.0 * iteration count
  //   xmm5 = 4.0 (bailout limit), xmm6 = cx of the two lanes, xmm7 = cy
  __asm __volatile (
	"movl	%[rowBuffer], %%esi;"
	"movl	%[maxi], %%edi;"
	"movl	%[w_], %%edx;"
	"addl	$1, %%edx;"
	"shrl	$1, %%edx;"
	"movsd	%[_cx], %%xmm6;"
	"shufpd	$0, %%xmm6, %%xmm6;"
	"movapd	%[coeffs], %%xmm5;"
	"addpd	%%xmm5, %%xmm6;"
	"movsd	%[_cy], %%xmm7;"
	"shufpd	$0, %%xmm7, %%xmm7;"
	"movapd	%[four], %%xmm5;"
"nxtpixS2:"
	"movapd	%%xmm6, %%xmm0;"
	"movapd	%%xmm7, %%xmm1;"
	"xorpd	%%xmm4, %%xmm4;"
	"movl	%%edi, %%ecx;"
"iterloopS2:"
	"movapd	%%xmm0, %%xmm2;"
	"mulpd  %%xmm0, %%xmm0;"
	"movapd %%xmm1, %%xmm3;"
	"addpd  %%xmm1, %%xmm1;"
	"mulpd  %%xmm2, %%xmm1;"
	"movapd %%xmm0, %%xmm2;"
	"mulpd  %%xmm3, %%xmm3;"
	"addpd  %%xmm7, %%xmm1;"
	"subpd  %%xmm3, %%xmm0;"
	"addpd	%%xmm3, %%xmm2;"
	"cmplepd %%xmm5, %%xmm2;"
	"addpd	%%xmm6, %%xmm0;"
	"movmskpd %%xmm2, %%eax;"
	"testl	%%eax, %%eax;"
	"jz	donepixS2;"
	"andpd	%%xmm5, %%xmm2;"
	"addpd	%%xmm2, %%xmm4;"
	"subl	$1, %%ecx;"
	"jnz	iterloopS2;"
"donepixS2:"
	// Each lane holds 4.0 * count: convert, divide by four, map maxi to black.
	"cvtsd2si %%xmm4, %%ecx;"
	"movl	$0xffffffff, %%eax;"
	"shrl	$2, %%ecx;"
	"cmpl	%%edi, %%ecx;"
	"cmovel	%%eax, %%ecx;"
	"movl	%%ecx, (%%esi);"
	"shufpd	$3, %%xmm4, %%xmm4;"
	"cvtsd2si %%xmm4, %%ecx;"
	"shrl	$2, %%ecx;"
	"cmpl	%%edi, %%ecx;"
	"cmovel	%%eax, %%ecx;"
	"movl	%%ecx, 4(%%esi);"
	"addl	$8, %%esi;"
	"subl	$1, %%edx;"
	"addpd	%[dsx], %%xmm6;"
	"jnz	nxtpixS2;"
    : /* output */ 
    : /* input */      [rowBuffer] "m" (rowBuffer), [maxi] "m" (maxi), [w_] "m" (w_), [_cx] "m" (_cx), [_cy] "m" (_cy), [coeffs] "m" (*coeffs), [four] "m" (*four), [dsx] "m" (*dsx)
    // "memory": the pixels are stored through esi, which is not an operand.
    : /* clobbered */  "%eax", "%ecx", "%edx", "%esi", "%edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc"
  );
  return true;  

#endif

#ifdef WIN32
  __declspec(align(64)) double coeffs[] = {0, (double)_sx};
  __declspec(align(64)) double dsx[] = {2*((double)sx_), 2*((double)sx_)};
  __declspec(align(64)) double four[] = {4,4};

	_asm
	{
		//femms;
		mov esi, rowBuffer;

		mov edi, maxi	  // edi = maxiters

		mov edx,w_      // edx = num of pixels
		add edx,1       // pad row to dualpixel boundary (multiple of two)
		shr edx,1       // edx = num of dualpixels

		movsd xmm6, _cx // xmm6 = ?,cx
		shufpd xmm6, xmm6, 0 // xmm6 = cx,cx
		movapd xmm5, dword ptr [coeffs] // xmm5 = 0,sx
		addpd xmm6, xmm5 // xmm6 = zx0,zx1
		movsd xmm7, _cy // xmm7 = ?,cy
		shufpd xmm7, xmm7, 0 // xmm7 = cy,cy
		movapd xmm5, dword ptr [four] // xmm5 = 4,4
    align 16

    // Peter Kankowski's optimized SSE routine.
nxtpix:
    movapd xmm0, xmm6
    movapd xmm1, xmm7
    xorpd  xmm4, xmm4       // zero dualpixel iters counter
    mov ecx, edi            // ecx = iterations left
iterloop:
    // xmm0 = zx             xmm1 = zy
    movapd xmm2, xmm0
    mulpd  xmm0, xmm0
    movapd xmm3, xmm1
    addpd  xmm1, xmm1
    // xmm0 = zx^2           xmm1 = 2 * zy     xmm2 = zx           xmm3 = zy
    mulpd  xmm1, xmm2
    movapd xmm2, xmm0
    mulpd  xmm3, xmm3
    // xmm0 = zx^2           xmm1 = 2*zy*zx    xmm2 = zx^2         xmm3 = zy^2
    addpd  xmm1, xmm7
    subpd  xmm0, xmm3
    addpd  xmm2, xmm3
    // xmm0 = zx^2 - zy^2    xmm1=2*zy*zx+py   xmm2 = zx^2 + zy^2  xmm3 = zy^2
    cmplepd xmm2, xmm5
    addpd   xmm0, xmm6
    movmskpd eax, xmm2
    test eax, eax           // both lanes bailed out?
    jz donepix
    andpd   xmm2, xmm5      // xmm4 += (xmm2 <= 4.0) ? 4.0 : 0.0;
    addpd   xmm4, xmm2
    sub ecx, 1
    jnz iterloop

donepix:
		// Each lane holds 4.0 * count: convert, divide by four, map maxi to black.
		cvtsd2si ecx, xmm4
		mov eax, ITER_BLACK
		shr ecx, 2
		cmp ecx, edi
		cmove ecx, eax
		mov	dword ptr [esi],ecx	;SET PIXEL !
		shufpd xmm4, xmm4, 0x3
		cvtsd2si ecx, xmm4
		shr ecx, 2
		cmp ecx, edi
		cmove ecx, eax
		mov	dword ptr [esi+4],ecx	;SET PIXEL !

		add	esi,8     //Change to next pixBuf elements
		sub	edx,1     // Decrease number of dualpixels to compute
		addpd xmm6, dword ptr [dsx] // xmm6 = next dualpixels cx[]

		jnz	nxtpix
		//femms;
	}
	// End of calculation routine
	return true;
#elif defined(sgi)

  // The MIPS routine is implemented outside this function.
  return calcPixelRow_FPU_ASM_MIPSR8000(rowBuffer, maxi, ITER_BLACK, w_, ax_, _cy, sx_);

#else
  // Unsupported.
  return false;
#endif
}


/**
 * Calculate the given pixel row using CPU_3DNOW mode (single precision, two
 * pixels per step).
 * Author: Gerard Basler
 *
 * Only a WIN32 (MSVC inline asm) routine exists. It processes (w_+1)/2 pixel
 * pairs and stores both results of the last pair, so one entry past w_ is
 * written when w_ is odd; the row buffer is presumably padded for that
 * (verify against PixelBuffer).
 *
 * On every other platform nothing is computed and false is returned, so that
 * the caller does not treat the untouched row as calculated.
 *
 * @param row   index of the row to compute
 * @param maxi  iteration bound per pixel
 * @return true if row calc was ok, false otherwise
 */
bool calcPixelRow_CPU_3DNOW(int row, unsigned int maxi) {
#ifdef _WIN32
	unsigned int* rowBuffer = pixBuf_->getRowPointer(row);
	float _cx = (float)ax_;
	float _cy = (float)(ay_ + sy_*((double)row));
	// Horizontal pixel step. This used to be read from sy_, which only worked
	// because sx_ and sy_ hold the same value everywhere else; the per-lane
	// offset must use the same step as the per-pair advance (qsx).
	float _sx = (float)sx_;

	__declspec(align(32)) float coeffs[] = {0, (float)_sx}, qsx[] = {2*((float)sx_), 2*((float)sx_)}, zmax[] = {4.0f, 4.0f}, cmax[] = {maxi, maxi};

	// eax tmp
	// ebx ITER_BLACK (saved and restored around the routine)
	// ecx i
	// edx w_ (pixel pairs left)
	// esi maxi
	// edi rowBuffer

	// mm0 cx (zinitre)
	// mm1 cy (zinitim)
	// mm2 zx (re)
	// mm3 zy (im)
	// mm4 count
	// mm5 zmax
	// mm6 tmp1
	// mm7 tmp2
	_asm {
		femms;
		push ebx;
		mov	edx,w_      // edx = num of pixels
		add edx,1       // pad row to dualpixel boundary (multiple of two)
		shr edx,1       // edx = num of dualpixels
		mov esi, maxi
		mov edi, rowBuffer
		pxor mm0, mm0
		movd mm0, _cx
		pfacc mm0, mm0  // broadcast cx to both lanes
		pfadd mm0, coeffs
		pxor mm1, mm1
		movd mm1, _cy
		pfacc mm1, mm1  // broadcast cy to both lanes
		movq mm5, zmax
		mov ebx, ITER_BLACK // frame pointer destroyed!
		align 16
nxtpix:
		movq mm2, mm0 // re = zinitre
		movq mm3, mm1 // im = zinitim
		pxor mm4, mm4 // count = 0
		mov ecx, esi // i = maxi
iterloop:
		movq mm6, mm2 // tmp1 = re
		pfmul mm6, mm3 // tmp1 = re*im
		pfmul mm2, mm2 // re = re*re
		pfmul mm3, mm3 // im = im*im
		pfadd mm6, mm6 // tmp1 = 2*re*im
		// if ((num.real() * num.real() + num.imag() * num.imag()) > 4.0)
		movq mm7, mm2 // tmp2 = re*re
		pfadd mm7, mm3 // tmp2 = re*re+im*im
		// we need more registers, so free one
		pfsub mm2, mm3 // re = re*re-im*im
		movq mm3, mm6 // im = 2*re*im, tmp1 is now free

		movq mm6, mm5 // tmp1 = zmax
		pfcmpge mm6, mm7 // pcmpgtd mm6, mm7 // danger! integer compare!
		psubd mm4, mm6 // count = count - (compare)
		psrlq mm6, 1 // pmovmskb eax, mm6 // only athlon supports this
		movd eax, mm6
		test eax,eax
		je donepix

		dec ecx // nCountMax--
		//num += zInit
		pfadd mm2, mm0 // re = re + zinitre
		pfadd mm3, mm1 // im = im + zinitim
		// ... hope this helps the branch prediction
		jne iterloop

donepix:
		//pcmpeqd mm6, cmax // short replacement for the code below
		//por mm4, mm6 // but requires one more memory op
		//movq [edi], mm4

		movd ecx, mm4
		cmp ecx, esi
    //cmove ecx, ebx // falco[SCT]: Conditional moves that ARE NOT supported on the K6. 
    jne no_black0
    mov ecx, ebx
no_black0:
		mov	dword ptr [edi],ecx	// SET PIXEL !
		punpckhdq mm4, mm4 // lo = hi
		movd ecx, mm4
		cmp ecx, esi
    //cmove ecx, ebx // falco[SCT]: Conditional moves that ARE NOT supported on the K6. 
    jne no_black1
    mov ecx, ebx
no_black1:
		mov	dword ptr [edi+4],ecx	// SET PIXEL !

		// cx += sx_;
		pfadd mm0, qsx
		add	 edi, 2*4
		dec edx
		jne nxtpix

		pop ebx;
		femms;
	}

	// End of calculation routine
	return true;
#else
	// Unsupported: no 3DNow! routine exists for this platform.
	(void)row;
	(void)maxi;
	return false;
#endif
}

/**
 * Report whether the CPU provides the vector unit used by RENDER_MODE_CPU_SSE.
 *
 *  - PowerPC Mac: queries sysctl CTL_HW / HW_VECTORUNIT.
 *  - x86 (MSVC or GCC): executes CPUID function 1 and tests EDX bit 25 (SSE).
 *  - anything else: reports false.
 *
 * @return true if the vector unit is available, false otherwise
 */
bool checkSSE() {
#if defined(__APPLE__) && defined(__BIG_ENDIAN__)
  // PowerPC (should work for Intel but I prefer asm code).
  int mib[2], value = 0; 
  size_t len = sizeof(value); 
  mib[0] = CTL_HW; 
  mib[1] = HW_VECTORUNIT;
  int rc = sysctl(mib, 2, &value, &len, NULL, 0);
  if (rc == 0) return (value != 0); 
  else return false;
#else
  bool ssehw = false;
#if defined(WIN32)
	_asm {
		// CPUID overwrites EBX as a side effect; preserve it explicitly.
		push ebx

		// Move the number 1 into eax - this will move the
		// feature bits into EDX when a CPUID is issued, that
		// is, EDX will then hold the key to the cpuid
		mov eax, 1

		// Perform CPUID (puts processor feature info in EDX)
		cpuid

		pop ebx

		// Shift the bits in edx to the right by 26, thus bit 25
		// (SSE bit) is now in CF bit in EFLAGS register.
		shr edx,0x1A

		// If CF is not set, jump over next instruction
		jnc nocarryflag

		// set the return value to 1 if the CF flag is set
		mov [ssehw], 1

		nocarryflag:
	}
#elif (defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__))) && (defined(__i386__) || defined(__x86_64__))
  // x86 for GCC. The four registers CPUID writes are all declared as outputs,
  // and the feature bit is tested in C. The previous version stored a 32-bit
  // value into the one-byte bool and did not tell the compiler that EBX and
  // ECX are overwritten.
  unsigned int regA = 1, regB = 0, regC = 0, regD = 0;
#if defined(__i386__) && defined(__PIC__)
  // EBX holds the GOT pointer in 32-bit PIC code: swap it out around CPUID.
  __asm __volatile (
    "xchgl %%ebx, %1;"
    "cpuid;"
    "xchgl %%ebx, %1;"
    : /* output */ "+a" (regA), "=&r" (regB), "=c" (regC), "=d" (regD)
  );
#else
  __asm __volatile (
    "cpuid;"
    : /* output */ "+a" (regA), "=b" (regB), "=c" (regC), "=d" (regD)
  );
#endif
  (void)regA;
  (void)regB;
  (void)regC;
  ssehw = ((regD >> 25) & 1u) != 0;   // EDX bit 25 = SSE
#endif
  return ssehw;
#endif
}


/**
 * Report whether the CPU provides the unit used by RENDER_MODE_CPU_SSE2.
 *
 *  - x86 (MSVC or GCC): executes CPUID function 1 and tests EDX bit 26 (SSE2).
 *  - sgi: looks for an R8000/R10000/R12000/R14000/R16000/R18000 name in the
 *    processor list returned by sysinfo(_MIPS_SI_PROCESSORS).
 *  - anything else: reports false.
 *
 * @return true if the unit is available, false otherwise
 */
bool checkSSE2() {
  bool sse2hw = false;
#if defined (WIN32)
	_asm {
		// CPUID overwrites EBX as a side effect; preserve it explicitly.
		push ebx

		// Move the number 1 into eax - this will move the
		// feature bits into EDX when a CPUID is issued, that
		// is, EDX will then hold the key to the cpuid
		mov eax, 1

		// Perform CPUID (puts processor feature info in EDX)
		cpuid

		pop ebx

		// Shift the bits in edx to the right by 27, thus bit 26
		// (SSE2 bit) is now in CF bit in EFLAGS register.
		shr edx,0x1B

		// If CF is not set, jump over next instruction
		jnc nocarryflag

		// set the return value to 1 if the CF flag is set
		mov [sse2hw], 1

		nocarryflag:
	}
#elif (defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__))) && (defined(__i386__) || defined(__x86_64__))
  // x86 for GCC. The four registers CPUID writes are all declared as outputs,
  // and the feature bit is tested in C. The previous version stored a 32-bit
  // value into the one-byte bool and did not tell the compiler that EBX and
  // ECX are overwritten.
  unsigned int regA = 1, regB = 0, regC = 0, regD = 0;
#if defined(__i386__) && defined(__PIC__)
  // EBX holds the GOT pointer in 32-bit PIC code: swap it out around CPUID.
  __asm __volatile (
    "xchgl %%ebx, %1;"
    "cpuid;"
    "xchgl %%ebx, %1;"
    : /* output */ "+a" (regA), "=&r" (regB), "=c" (regC), "=d" (regD)
  );
#else
  __asm __volatile (
    "cpuid;"
    : /* output */ "+a" (regA), "=b" (regB), "=c" (regC), "=d" (regD)
  );
#endif
  (void)regA;
  (void)regB;
  (void)regC;
  sse2hw = ((regD >> 26) & 1u) != 0;   // EDX bit 26 = SSE2
#elif defined(sgi)
  // Check for MIPS processor with dual FPUs.
  char strCPU[512];
  strCPU[0] = '\0';
  long slen = sysinfo(_MIPS_SI_PROCESSORS, strCPU, sizeof(strCPU)-1);
  if (slen < 0) strCPU[0] = '\0'; // Query failed: treat as "nothing detected".
  strCPU[511] = '\0'; // Just in case.
  printf("Detected MIPS CPUs: %s\n", strCPU);
  sse2hw = (strstr(strCPU, "R8000") != NULL);
  if (!sse2hw) sse2hw = (strstr(strCPU, "R10000") != NULL);
  if (!sse2hw) sse2hw = (strstr(strCPU, "R12000") != NULL);
  if (!sse2hw) sse2hw = (strstr(strCPU, "R14000") != NULL);
  if (!sse2hw) sse2hw = (strstr(strCPU, "R16000") != NULL);
  if (!sse2hw) sse2hw = (strstr(strCPU, "R18000") != NULL); // I can dream :)
#endif	
  return sse2hw;
}

/**
 * Check for 3DNow! support.
 *
 * Only implemented for WIN32 (MSVC inline asm): asks CPUID for the largest
 * extended function and, when function 80000001h exists, tests EDX bit 31 of
 * its feature flags. Every other platform reports false.
 *
 * @return true if 3DNow! is available, false otherwise
 */
bool check3DNow() {
  bool b3DNow = false;
#ifdef WIN32
  _asm {
      push  ebx               ; CPUID overwrites EBX as a side effect

      mov   eax, 80000000h    ; CPUID function: Largest extended value
      cpuid
      cmp   eax, 80000001h    ; We can execute feature #1, right?
      // Function numbers are unsigned. With a signed compare a CPU that
      // answers with a small positive number would wrongly pass this check.
      jb    No3dNow           ; If not, we are done here.

      mov   eax, 80000001h    ; CPUID function: Signature + features
      cpuid

      // Bit 31 indicates 3DNow! support. Test it directly: "shr edx, 31"
      // leaves bit 30 in CF, not bit 31.
      test  edx, 80000000h
      jz    No3dNow

      // set the return value to 1 if the bit is set
      mov [b3DNow], 1
No3dNow:
      pop   ebx
  }
#endif
  return b3DNow;
}


/**
 * Recompute the per-pixel step of the complex plane and set up the projection
 * for the current render mode.
 *
 * The vertical step is forced equal to the horizontal one. In the GPU modes
 * the setup is delegated to vp_ (when present); in the CPU modes an
 * orthographic projection of w_ x h_ with the origin at the top left is
 * installed.
 */
void prepareWorldSpace() {
  // Recalc real plane parameters: one step value is used for both axes
  const double step = (ex_ - ax_) / ((double) w_);
  sx_ = step;
  sy_ = sx_;

  const bool gpuMode = (mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP);
  if (gpuMode) {
    // The GPU program object owns its world-space setup
    if (vp_ != NULL) {
      vp_->prepareWorldSpace(w_, h_, ax_, ay_, ex_, ey_);
    }
    return;
  }

  // CPU modes: pixel-exact 2D projection
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  gluOrtho2D(0, (double)w_, (double)h_, 0);
  glTranslated(.375, .375, 0); // Fixed by Richard Rauch as per OpenGL Red Book.
}


void prepareColorTable(int numColors, GLubyte startR, GLubyte startG, GLubyte startB) {
  // Allocate colortable memory
  if (colorTable_ != NULL) free(colorTable_);
  colorTable_ = (GLubyte*) malloc(numColors*3*sizeof(GLubyte));
  if (colorTable_ == NULL) return;

  // Fill colortable
  int addR = 3;
  int addG = 4;
  int addB = -2;

  int r = startR;
  int g = startG;
  int b = startB;

  GLubyte* curCol = colorTable_;

  for (int i=0; i<numColors; i++) {
    //printf("Color is R=%d\tG=%d\tB=%d\n", r, g, b);
    *curCol++ = r;
    *curCol++ = g;
    *curCol++ = b;
    r += addR;
    g += addG;
    b += addB;
    if (r < 0) {
      r = -r;
      addR = -addR;
    }
    else if (r > 240) {
      addR = -addR;
      r += addR;
    }
    if (g < 0) {
      g = -g;
      addG = -addG;
    }
    else if (g > 240) {
      addG = -addG;
      g += addG;
    }
    if (b < 0) {
      b = -b;
      addB = -addB;
    }
    else if (b > 240) {
      addB = -addB;
      b += addB;
    }
  }
}


void updateWinTitle() {
  char agent[256];

  switch(mode_) {
    case RENDER_MODE_FPU_C:
      sprintf(agent, "%dxCPU FPU C", numCPU_);
      break;
    case RENDER_MODE_FPU_ASM:
      sprintf(agent, "%dxCPU FPU ASM", numCPU_);
      break;
    case RENDER_MODE_CPU_SSE:
#if defined(__APPLE__) && defined(__BIG_ENDIAN__)
      // PowerPC
      sprintf(agent, "%dxCPU AltiVec ASM", numCPU_);
#else
      // Intel x86
      sprintf(agent, "%dxCPU SSE", numCPU_);
#endif
      break;
    case RENDER_MODE_CPU_SSE2:
#if defined (sgi)
      // MIPS
      sprintf(agent, "%dxCPU R8000 opt dual FPU ASM", numCPU_);
#else
      sprintf(agent, "%dxCPU SSE2", numCPU_);
#endif
      break;
    case RENDER_MODE_CPU_3DNOW:
      sprintf(agent, "%dxCPU 3DNow!", numCPU_);
      break;
    case RENDER_MODE_GPU_VP:
      sprintf(agent, "GPU VertP %s", glGetString(GL_RENDERER));
      break;
    case RENDER_MODE_GPU_FP:
      sprintf(agent, "GPU FragP %s", glGetString(GL_RENDERER));
      break;
    default:
      sprintf(agent, "UNKNOWN!");
  }
  if (w_ == 0 && h_ == 0) {	//	initizl settings
    w_ = 500;
	h_ = 500;
  }
  sprintf(winTitle_, "%s %d*%d i=%d %s", progname, w_, h_, maxi_, agent);
  glutSetWindowTitle(winTitle_);
}

/**
 * Pause the calling slave thread for about one millisecond.
 */
static void slaveNap_() {
#ifdef _WIN32
  Sleep(1);
#else
  usleep(1000);
#endif
}

/**
 * Entry point of a slave thread.
 *
 * A slave owns one horizontal slice of the image (slice index = 1 + the int
 * pointed to by lpParameter). It polls slavesWorking_ until it becomes
 * non-zero, computes its rows with calcPixelRow(), decrements slavesWorking_
 * and then polls until it is zero again. The loop never terminates.
 *
 * @param lpParameter  pointer to an int holding the slice index minus one
 * @return never returns in practice (0 after the endless loop)
 */
#if defined(__APPLE__) || defined(sgi) || defined(__linux__)
// UNIX
void* slaveThreadCode(void* lpParameter) {
#else
// WIN32
DWORD WINAPI slaveThreadCode(LPVOID lpParameter) {
#endif
  const int slice = 1 + *((int*)lpParameter);

  printf("Thread %d says: \"I'm a slave, I'm alive.\"\n", slice);

  for (;;) {
    // Idle until the sync counter becomes non-zero
    while (slavesWorking_ == 0) {
      slaveNap_();
    }

    // There is no row for this slice when it lies beyond the image height
    if (slice >= h_) {
      slaveNap_();
      continue;
    }

    // Work out the row range [firstRow, lastRow) of this slice
    const int rowsPerSlice = h_/numCPU_;
    int firstRow;
    int lastRow;
    if (numCPU_ >= h_) {
      // At least as many CPUs as rows: one row per slice
      firstRow = slice;
      lastRow = firstRow + 1;
    }
    else {
      firstRow = rowsPerSlice*slice;
      // The last slice also takes the rows left over by the division
      lastRow = (slice == (numCPU_-1)) ? h_ : rowsPerSlice*(slice+1);
    }

    for (int y = firstRow; y < lastRow; ++y) {
      calcPixelRow(y, maxi_, mode_);
    }

    // Report completion, then wait until the counter is back to zero
    InterlockedDecrement(&slavesWorking_);
    while (slavesWorking_ != 0) {
      slaveNap_();
    }
  }

  return 0;
}


void runBenchmark() {
  unsigned int o_maxi = maxi_;
  int o_w = w_;
  int o_h = h_;
  // Get system timer resolution
#ifdef WIN32
  if (!QueryPerformanceFrequency(&sysFreqQuery)) {
    printf("ERROR: System does not support performance counters.\n");
    benchmarking_ = false;
    return;
  }
  sysFreq = sysFreqQuery.QuadPart;
#endif
  
  w_ = 500;
  h_ = 500;
  maxi_ = 9999;
  ax_ = -2.0f;
  ay_ = -1.5f;
  ex_ =  1.0f;
  ey_ =  1.5f;
  sx_ = (ex_ - ax_) / ((double) w_);
  sy_ = (ey_ - ay_) / ((double) h_);
  four = 4.0f;
  reshaped_ = true;
  // Reallocate pixel buffer
  if (pixBuf_ != NULL) delete pixBuf_;
  pixBuf_ = new PixelBuffer(w_, h_);

  printf("\n%s BENCHMARK (Using 1 CPU, no render)\n", progname);
#ifdef _WIN32
  printf(" Sys perf timer freq: %u Hz\n", sysFreq);
#endif
  printf(" size:\t%d*%d\n", w_, h_);
  printf(" maxiters:\t%d\n", maxi_);
  printf(" rangex:\t%.2f to %.2f\n", ax_, ex_);
  printf(" rangey:\t%.2f to %.2f\n", ay_, ey_);

  glutSetWindowTitle("BenchMarking... (results in console)");
  
  int y;

  // CPU SSE
  pixBuf_->clearBuffer();
#if defined(__APPLE__) && __BIG_ENDIAN__
  // PowerPC
  printf("  [4f] AltiVec benchmark:\n");
#elif defined(sgi)
  // IRIX
  printf("  [4f] Vector benchmark:\n");
#else
  // Intel x86
  printf("  [4f] SSE benchmark:\n");
#endif  
  if (avail_SSE) {
    QueryPerformanceCounter(&timeTmp0);
    for (y=0; y<h_; y++) {
      calcPixelRow(y, maxi_, RENDER_MODE_CPU_SSE);
    }
    QueryPerformanceCounter(&timeTmp1);
    timingf = timeTmp1.QuadPart - timeTmp0.QuadPart;
    timingf = timingf / sysFreq;
    printf("    %.3f sec\n", timingf);
    printf("    %.3f MegaIters/sec\n", ((double)pixBuf_->getTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
  }
  else {
    printf("    Not supported.\n");
  }
  // CPU SSE2
  pixBuf_->clearBuffer();
#if defined(__APPLE__) && __BIG_ENDIAN__
  // PowerPC
  printf("  [2d] PowerPC970 dual FPU benchmark:\n");
#elif defined(sgi)
  // IRIX
  printf("  [2d] R8000 dual FPU benchmark:\n");
#else
  // Intel x86
  printf("  [2d] SSE2 benchmark:\n");
#endif  
  if (avail_SSE2) {
    QueryPerformanceCounter(&timeTmp0);
    for (y=0; y<h_; y++) {
      calcPixelRow(y, maxi_, RENDER_MODE_CPU_SSE2);
    }
    QueryPerformanceCounter(&timeTmp1);
    timingf = timeTmp1.QuadPart - timeTmp0.QuadPart;
    timingf = timingf / sysFreq;
    printf("    %.3f sec\n", timingf);
    printf("    %.3f MegaIters/sec\n", ((double)pixBuf_->getTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
  }
  else {
    printf("    Not supported.\n");
  }
  // CPU 3DNOW!
  pixBuf_->clearBuffer();
  printf("  [2f] 3DNow! benchmark:\n");
  if (avail_3DNow) {
    QueryPerformanceCounter(&timeTmp0);
    for (y=0; y<h_; y++) {
      calcPixelRow(y, maxi_, RENDER_MODE_CPU_3DNOW);
    }
    QueryPerformanceCounter(&timeTmp1);
    timingf = timeTmp1.QuadPart - timeTmp0.QuadPart;
    timingf = timingf / sysFreq;
    printf("    %.3f sec\n", timingf);
    printf("    %.3f MegaIters/sec\n", ((double)pixBuf_->getTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
  }
  else {
    printf("    Not supported.\n");
  }
  // FPU ASM
  pixBuf_->clearBuffer();
  printf("  [1d] FPU ASM benchmark:\n");
  QueryPerformanceCounter(&timeTmp0);
  for (y=0; y<h_; y++) {
    calcPixelRow(y, maxi_, RENDER_MODE_FPU_ASM);
  }
  QueryPerformanceCounter(&timeTmp1);
  timingf = timeTmp1.QuadPart - timeTmp0.QuadPart;
  timingf = timingf / sysFreq;
  printf("    %.3f sec\n", timingf);
  printf("    %.3f MegaIters/sec\n", ((double)pixBuf_->getTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
  // FPU C
  pixBuf_->clearBuffer();
  printf("  [1d] FPU C benchmark:\n");
  QueryPerformanceCounter(&timeTmp0);
  for (y=0; y<h_; y++) {
    calcPixelRow(y, maxi_, RENDER_MODE_FPU_C);
  }
  QueryPerformanceCounter(&timeTmp1);
  timingf = timeTmp1.QuadPart - timeTmp0.QuadPart;
  timingf = timingf / sysFreq;
  printf("    %.3f sec\n", timingf);
  printf("    %.3f MegaIters/sec\n", ((double)pixBuf_->getTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
  // GPU VP
  printf("  [4?] GPU VertexProgram benchmark (beta! maxiters=10) on %s:\n", glGetString(GL_RENDERER));
  pixBuf_->clearBuffer();
  maxi_ = 10;
  if (vp_ != NULL) {
    delete vp_; vp_ = NULL;
  }
#if !defined(__APPLE__) && !defined(sgi)
  vp_ = new VertexProgramNV(maxi_, w_, h_, ax_, ay_, ex_, ey_);
  if (!vp_->isValid()) {
    delete vp_;
#else
  if (true) {
#endif
    // Try with ATI VP
    vp_ = new VertexProgramATI(maxi_, w_, h_, ax_, ay_, ex_, ey_);
  }
  if (vp_->isValid()) {
    mode_ = RENDER_MODE_GPU_VP;
    QueryPerformanceCounter(&timeTmp0);
    int repeats = 200;
    for (int i=0; i<repeats; i++) {
      renderImage(maxi_, mode_);
      glFinish(); // Correct way: wait until all OpenGL commands have been completed !!
    }
    QueryPerformanceCounter(&timeTmp1);
    timingf = timeTmp1.QuadPart - timeTmp0.QuadPart;
    timingf = timingf / sysFreq;
    printf("    %.3f sec\n", timingf);
    printf("    %.3f MegaIters/sec\n", (500*500*10*repeats)/timingf/1000000.0f);
  }
  else {
    printf("    Not supported.\n");
  }
  // GPU FP
  printf("  [4?] GPU FragmentProgram benchmark (beta! maxiters=20) on %s:\n", glGetString(GL_RENDERER));
  pixBuf_->clearBuffer();
  maxi_ = 20;
  if (vp_ != NULL) {
    delete vp_; vp_ = NULL;
  }
  vp_ = new FragmentProgramARB10(maxi_, w_, h_, ax_, ay_, ex_, ey_);
  if (vp_->isValid()) {
    mode_ = RENDER_MODE_GPU_FP;
    QueryPerformanceCounter(&timeTmp0);
    int repeats = 200;
    for (int i=0; i<repeats; i++) {
      renderImage(maxi_, mode_);
      glFinish(); // Correct way: wait until all OpenGL commands have been completed !!
    }
    QueryPerformanceCounter(&timeTmp1);
    timingf = timeTmp1.QuadPart - timeTmp0.QuadPart;
    timingf = timingf / sysFreq;
    printf("    %.3f sec\n", timingf);
    printf("    %.3f MegaIters/sec\n", (500*500*20*repeats)/timingf/1000000.0f);
  }
  else {
    printf("    Not supported.\n");
  }

  // Benchmark done
  maxi_ = o_maxi;
  w_ = o_w;
  h_ = o_h;
  printf("\n");
  prepareWorldSpace();
  postRedisplay();
  benchmarking_ = false;
}


/**
 * Rotate the palette
 * @args If +1 rotate palette right, if -1 rotate palette left
 */
void rotatePalette(int delta) {
  GLubyte tmpR;
  GLubyte tmpG;
  GLubyte tmpB;
  int copySize = 9999 * 3;
  if (delta < 0) {
    GLubyte* src = colorTable_+3;
    GLubyte* dst = colorTable_;
    tmpR = *(dst);
    tmpG = *(dst+1);
    tmpB = *(dst+2);
    memmove(dst, src, copySize);
    *(src+9998*3) = tmpR;
    *(src+9998*3+1) = tmpG;
    *(src+9998*3+2) = tmpB;
  }
  else if (delta > 0) {
    GLubyte* src = colorTable_;
    GLubyte* dst = colorTable_+3;
    tmpR = *(dst+9998*3);
    tmpG = *(dst+9998*3+1);
    tmpB = *(dst+9998*3+2);
    memmove(dst, src, copySize);
    *(src) = tmpR;
    *(src+1) = tmpG;
    *(src+2) = tmpB;
  }

  // Redraw
  for (int y=0; y<h_; y++) {
    if (pixBuf_->isRowCalculated(y)) renderPixelRow(y, mode_);
  }
  if (doublebuf_) glutSwapBuffers();

}


/**
 * Request a redraw: forwards to glutPostRedisplay().
 */
void postRedisplay(void)
{
  glutPostRedisplay();
}


/**
 * Print the keyboard help to stdout.
 * The lines for keys 3-5 depend on the platform this file is compiled for.
 */
void printHelp() {
  // Keys common to all platforms: FPU modes
  printf("\nKeys:\n"
         " 1: Lame FPU computation, C code.\n"
         " 2: Fast FPU computation, 100%% machine code.\n");

  // Platform-specific vector modes
#if defined(__APPLE__) && __BIG_ENDIAN__
  // PowerPC
  printf(" 3: Quadfast AltiVec computation, 100%% machine code.\n");
#elif defined (sgi)
  // MIPS
  printf(" 4: Dualfast (R8000 opt) computation, 100%% machine code.\n");
#else
  // Intel x86
  printf(" 3: Quadfast SSE computation, 100%% machine code.\n"
         " 4: Dualfast SSE2 computation, 100%% machine code.\n"
         " 5: Dualfast 3DNow computation, 100%% machine code.\n");
#endif

  // Keys common to all platforms: GPU modes and view controls
  printf(" 9: Experimental GPU Fragment Program computation (OpenGL 1.3 ARB only)!\n"
         " 0: Experimental GPU Vertex Program computation (nVidia or ATI cards only)!\n"
         " d: Toggle double/single buffer (may not work on some cards).\n"
         " +,-: Inc/Dec max iters (press shift for +/-20).\n"
         " /,*: Rotate palette (press 'd' if this does not work).\n"
         " h: Shows this help.\n"
         " o: Draw orbits (single buffered mode only).\n"
         " r: Reset zoom position.\n"
         " b: Speed benchmark in current mode (resets max iters to 40).\n"
         "    See result in the console.\n");
}