/******************************************************************* FFFF v3.2.3 Main code : Daniele Paccaloni [daniele.paccaloni@dylogic.com] MANY THANKS TO: Steven Kortze (Fragment Programs OS X compatibility) Peter Kankowski (SSE optimizations) Richard Rauch (Linux porting hints) falco[SCT] (K6 cmove bugfix) Gerard Basler (3DNow! code and SSE optimizations) Benjamin Lipchak (R3xx optimizations) ATI developers support (special thanks to Jeff and Andy) Andre Krause Amichai Rothman C.Kleinhuis Chris Martin Francesco D'Amico Jean-Philippe Perois John Dough Luciano Genero Mitch Wright Stephan Grossklass [JGrossklass@t-online.de] ... and all the others who kindly sent code, fixes, suggestions and feedback ! *******************************************************************/ int get_nprocs() {return 1;} // WARNING: This source is a real mess ! :))) // WARNING: This is only meant as some "portable" glue for assembly. // WARNING: Do NOT attempt to learn C/C++ from this code ! #ifdef _WIN32 #include #endif #ifdef __APPLE__ #include #include #include #include #include #include #include #include #include #include #include #include #elif defined(sgi) #include #include #include #include #include #include #include #include #include #include #include #include #elif defined(__linux__) #include #include #include #include #include #include #include #include #include #include // #include #else #include "GL/glut.h" #include "GL/gl.h" #endif #include #include #include "math.h" #include "VertexProgramATI.h" #ifndef __APPLE__ #include "VertexProgramNV.h" #endif #include "FragmentProgramARB10.h" // App includes #include "PixelBuffer.h" // Defines #define RENDER_MODE_GPU_VP 0 #define RENDER_MODE_FPU_C 1 #define RENDER_MODE_FPU_ASM 2 #define RENDER_MODE_CPU_SSE 3 #define RENDER_MODE_CPU_SSE2 4 #define RENDER_MODE_CPU_3DNOW 5 #define RENDER_MODE_GPU_FP 9 #define ITER_BLACK 0xFFFFFFFF // GLUT callbacks void display(void); void myReshape(int w, int h); void NextFrame(); void 
processNormalKeys(unsigned char key, int x, int y); void mousemove(int x, int y); void mouseclick(int button, int down, int x, int y); void idleFunc(); // App prototypes void prepareWorldSpace(void); void postRedisplay(void); bool renderImage(unsigned int maxi, int mode); bool calcPixelRow(int row, unsigned int maxi, int mode); bool calcPixelRow_C(int row, unsigned int maxi); bool calcPixelRow_FPU_ASM(int row, unsigned int maxi); bool calcPixelRow_CPU_SSE(int row, unsigned int maxi); bool calcPixelRow_CPU_SSE2(int row, unsigned int maxi); bool calcPixelRow_CPU_3DNOW(int row, unsigned int maxi); bool renderPixelRow(int row, int mode); void prepareColorTable(int numColors, GLubyte startR, GLubyte startG, GLubyte startB); bool checkSSE(); bool checkSSE2(); bool check3DNow(); void updateWinTitle(); void rotatePalette(int delta); void runBenchmark(); void printHelp(); #if defined(sgi) extern bool calcPixelRow_FPU_ASM_MIPS(unsigned int* rowBuffer, unsigned int maxi, unsigned int iter_black, int width, double cx, double cy, double sx); extern bool calcPixelRow_FPU_ASM_MIPSR8000(unsigned int* rowBuffer, unsigned int maxi, unsigned int iter_black, int width, double cx, double cy, double sx); #endif #ifdef __APPLE__ // OS X void* slaveThreadCode(void* lpParameter); # if __BIG_ENDIAN__ volatile long InterlockedDecrement(volatile long* ptr) { volatile long tmp; __asm __volatile ( "1: lwarx %0, 0, %1\n" // Load Word and Reserve Indexed " addic %0, %0, -1\n" // Add Immediate Carrying " stwcx. 
%0, 0, %1\n" // Store Word Conditional Indexed " bne- 1b" : /* output */ "=&r" (tmp) : /* input */ "r" (ptr) : /* clobbered */ "cc", "memory"); return tmp; } # else // __APPLE__ && __LITTLE_ENDIAN__ long InterlockedDecrement(long* ptr) { DecrementAtomic(ptr); return (*ptr); } # endif // __APPLE__ && __BIG_ENDIAN__ #elif defined (sgi) // IRIX void* slaveThreadCode(void* lpParameter); unsigned long InterlockedDecrement(unsigned long* ptr) { return test_then_add(ptr, -1); } #elif defined (__linux__) // LINUX void* slaveThreadCode(void* lpParameter); volatile long InterlockedDecrement(volatile long* ptr) { __asm __volatile ( "movl $-1, %%eax;" "lock xaddl %%eax, (%%ecx);" // Assert CPU LOCK# signal {temp = *ecx; *ecx += eax; eax = temp} "decl %%eax;" // Update eax, since it holds the old value of *ecx" : /* output regs */ : /* input regs */ "c" (ptr) ); // Return value in eax */ } #elif defined(WIN32) // WIN32 DWORD WINAPI slaveThreadCode(LPVOID lpParameter); #endif GPUProgram* vp_ = NULL; // Application stuff char* progname = "FFFF v3.2.3"; char winTitle_[256]; long numCPU_ = 0; int* slices_ = NULL; // Slices array (only used if SMP available). 
PixelBuffer* pixBuf_ = NULL; int debug = 0; const char* glVersion_ = NULL; // OpenGL version GLubyte* glArrColor_ = NULL; GLfloat* glArrVertex_ = NULL; int mode_ = RENDER_MODE_FPU_ASM; // Current render mode bool useGLArrays_ = true; GLubyte* colorTable_ = NULL; bool rendering_ = false; bool benchmarking_ = false; bool reshaped_ = false; bool avail_SSE = false; bool avail_SSE2 = false; bool avail_3DNow = false; bool avail_VP = false; bool avail_FP = false; unsigned int maxi_ = 40; double ax_ = -2.0f; double ay_ = -1.5f; double ex_ = 1.0f; double ey_ = 1.5f; double sx_, sy_; double four = 4.0f; #if defined (sgi) unsigned long slavesWorking_ = 0; #else long slavesWorking_ = 0; #endif #if defined(__APPLE__) || defined(sgi) || defined(__linux__) // OS X, IRIX pthread_t* threadIDs_; long long sysFreq = 1000000; // We have to simulate some Win32 stuff. typedef union _LARGE_INTEGER { // struct { // unsigned long LowPart; // long HighPart; // }; struct { unsigned long LowPart; long HighPart; } u; long long QuadPart; } LARGE_INTEGER, *PLARGE_INTEGER; bool QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount) { timeval tp; gettimeofday(&tp, NULL); lpPerformanceCount->QuadPart = tp.tv_sec*1000000 + tp.tv_usec; return true; } #elif defined(WIN32) // Win32 system stuff SYSTEM_INFO inf; HANDLE finishSem_ = NULL; HANDLE* threads_ = NULL; DWORD* threadIDs_ = NULL; LARGE_INTEGER sysFreqQuery; LONGLONG sysFreq; LONGLONG timing; #endif LARGE_INTEGER timeTmp0; LARGE_INTEGER timeTmp1; double timingf; // GLUT stuff int win_top; int w_; // Window width int h_; // Window height int mouseClickPos[2] = {0,0}; int mousePos[2] = {0,0}; bool mouseDown[3] = {false, false, false}; int glutKeyModif_ = 0; bool doublebuf_ = false; double ax0_, ay0_, ex0_, ey0_; int main(int argc, char* argv[]) { printf(progname); printf("\n(C)1994-2006 Daniele Paccaloni (daniele.paccaloni@dylogic.com)\n"); printf("Initalizing...\n"); sprintf(winTitle_, "%s initializing...", progname); // System & Performance 
init. int rc = 0; #ifdef __APPLE__ // OS X int mib[2], value = 0; size_t len = sizeof(value); mib[0] = CTL_HW; mib[1] = HW_NCPU; rc = sysctl(mib, 2, &value, &len, NULL, 0); if (rc == 0) numCPU_ = value; else numCPU_ = 1; #elif defined (sgi) numCPU_ = sysconf(_SC_NPROC_ONLN); #elif defined (__linux__) numCPU_ = get_nprocs(); #elif defined (WIN32) // WIN32 GetSystemInfo(&inf); numCPU_ = inf.dwNumberOfProcessors; #endif printf("Number of CPUs: %d\n", numCPU_); if (numCPU_ > 1) { // Multiple CPUs available. printf("SMP support available, creating %d slave threads.\n", numCPU_-1); // Allocate and fill slices array. slices_ = (int*) malloc(numCPU_*sizeof(int*)); for (int i=0; i 1) { threadIDs_ = (pthread_t*) malloc((numCPU_-1)*sizeof(pthread_t)); if (threadIDs_ == NULL) { printf("*ERROR* Failed to allocate slave threads pool.\n"); numCPU_ = 1; } if (numCPU_ > 1) { // Create Threads pthread_attr_t attr; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); // Do not inherit scheduling from parent thread. struct sched_param sp; memset(&sp, 0, sizeof(sched_param)); sp.sched_priority = 0; pthread_attr_setschedpolicy(&attr, SCHED_OTHER); pthread_attr_setschedparam(&attr, &sp); pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); pthread_t* curThread = threadIDs_; for (int i=0; i<(numCPU_-1); i++) { rc = pthread_create(curThread, &attr, slaveThreadCode, &slices_[i]); if (rc != 0) { // ERROR creating thread. 
printf("PANIC: thread creation failed\n"); return false; } curThread++; } pthread_attr_destroy(&attr); } else { printf("Switching to MONO CPU mode.\n"); } } #else // WIN32 if (numCPU_ > 1) { threads_ = (HANDLE*) malloc((numCPU_-1)*sizeof(HANDLE*)); threadIDs_ = (DWORD*) malloc((numCPU_-1)*sizeof(DWORD*)); if ((threads_ == NULL) || (threadIDs_ == NULL)) { printf("*ERROR* Failed to allocate calc thread pool.\n"); numCPU_ = 1; } if (numCPU_ > 1) { // Create Threads HANDLE* curThread = threads_; DWORD* curThreadID = threadIDs_; for (int i=0; i<(numCPU_-1); i++) { *curThread = CreateThread( NULL, // pointer to security attributes 0, // initial thread stack size (LPTHREAD_START_ROUTINE) slaveThreadCode, // pointer to thread function &slices_[i], // argument for new thread 0*CREATE_SUSPENDED, // creation flags curThreadID // pointer to receive thread ID ); } } else { printf("Switching to MONO CPU mode.\n"); } } #endif //avail_SSE = checkSSE(); avail_SSE = 1; if (avail_SSE) { #if defined(__APPLE__) && __BIG_ENDIAN__ // PowerPC printf("AltiVec instructions supported. Switching to AltiVec quadpoints computation.\n"); #else // Intel x86 printf("SSE instructions supported. Switching to SSE quadpoints computation.\n"); #endif mode_ = RENDER_MODE_CPU_SSE; } else { printf("Vector instructions NOT supported.\n"); printf("Switching to machine code FPU mode.\n"); } //avail_SSE2 = checkSSE2(); avail_SSE2 = 1; if (avail_SSE2) { #if defined (sgi) printf("MIPS dual FPU units supported.\n"); mode_ = RENDER_MODE_CPU_SSE2; printf(" Switching to R8000 opt dualpoints computation.\n"); #else printf("SSE2 instructions supported.\n"); #endif } else { #if defined (sgi) printf("MIPS dual FPU units NOT available.\n"); #else printf("SSE2 instructions NOT supported.\n"); #endif } avail_3DNow = check3DNow(); if (avail_3DNow) { printf("AMD 3DNow! instructions supported."); if (!avail_SSE) { mode_ = RENDER_MODE_CPU_3DNOW; printf(" Switching to 3DNow! 
quadpoints computation.\n"); } else printf("\n"); } else { printf("3DNow! instructions NOT available.\n"); } if ((!avail_SSE) && (!avail_SSE2) && (!avail_3DNow)) { #if defined(__APPLE__) &&__LITTLE_ENDIAN__ // This is default for Intel Macs due to assembler problem printf("Switching to FPU compiled C mode.\n"); mode_ = RENDER_MODE_FPU_C; #else // Apple G3s and earlier, also other non vector-enabled systems printf("Switching to machine code FPU mode.\n"); mode_ = RENDER_MODE_FPU_ASM; #endif } // Initialize GLUT glutInit(&argc, argv); glutInitWindowPosition(0, 0); glutInitWindowSize(500, 500); // Display mode glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); // Create toplevel win win_top = glutCreateWindow(winTitle_); // Set win and icon titles glutSetWindowTitle(progname); updateWinTitle(); glutSetIconTitle(progname); // Set myReshape function glutReshapeFunc(myReshape); // Set display function glutDisplayFunc(display); // Set idle function glutIdleFunc(NULL); // Set cursor glutSetCursor(GLUT_CURSOR_CROSSHAIR); // Set mouse callback glutMouseFunc(mouseclick); glutMotionFunc(mousemove); glutPassiveMotionFunc(mousemove); // Set keyboard callback glutKeyboardFunc(processNormalKeys); // Set idle function glutIdleFunc(idleFunc); //printf(" o: Toggle Object.\n"); // OpenGL init glDisable(GL_LIGHTING); glDisable(GL_LIGHT0); glDisable(GL_DITHER); glDisable(GL_DEPTH_TEST); glShadeModel(GL_FLAT); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glClearColor(0.0f, 0.0f, 0.0f, 1.0f); glClear(GL_COLOR_BUFFER_BIT); glDisable(GL_NORMALIZE); glDrawBuffer(GL_FRONT); glVersion_ = (const char*) glGetString(GL_VERSION); // Get OpenGL Version printf("OpenGL v%s\n", glVersion_); printf(" Renderer: %s\n", glGetString(GL_RENDERER)); printf(" Vendor: %s\n", glGetString(GL_VENDOR)); useGLArrays_ = (glVersion_[0] >= '2') || (glVersion_[2] > '0'); if (!useGLArrays_) { printf("*WARNING* OpenGL arrays not available. 
Render will be slower.\n"); } // Print help printHelp(); //printf(" Extensions: %s\n",glGetString(GL_EXTENSIONS)); //printf("\nSeen anything faster ? Then... time to change your OpenGL card !\n"); // Initialize colortable prepareColorTable(10000, 70, 50, 200); // Enter GLUT mainloop glutMainLoop(); // exit return 0; } /** * GLUT Idle function */ void idleFunc() { if (mouseDown[0] || mouseDown[2]) { double zoom = 0.95; int msx = mousePos[0]; int msy = mousePos[1]; if (mouseDown[2]) { // zoomout zoom = 1.05; } double mx = ax_ + sx_*((double)msx); double my = ay_ + sy_*((double)msy); //printf("x=%d, y=%d\n", msx, msy); //printf("mx=%f, my=%f\n", mx, my); sx_ *= zoom; sy_ *= zoom; ax_ = mx - sx_*((double)msx); ay_ = my - sy_*((double)msy); ex_ = ax_ + sx_*((double)w_); ey_ = ay_ + sy_*((double)h_); // Recalc real plane parameters //sx_ = (ex_ - ax_) / ((double) w_); //sy_ = (ey_ - ay_) / ((double) h_); //sy_ = sx_; if ((mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP)) { prepareWorldSpace(); #ifdef _WIN32 // WIN32 Sleep(10); #else // UNIX usleep(10000); #endif //InitializeVertexProgramNV(); } postRedisplay(); } else { #ifdef _WIN32 // WIN32 Sleep(10); #else // UNIX usleep(10000); #endif } } //------------------ OPENGL RENDER FUNCTION ------------------ void display(void) { if (benchmarking_) { runBenchmark(); } rendering_ = true; renderImage(maxi_, mode_); if (doublebuf_) glutSwapBuffers(); rendering_ = false; } //------------------ RESHAPE WINDOW ------------------ void myReshape(int w, int h) { reshaped_ = true; w_ = w; h_ = h; // Recalc real plane parameters sx_ = (ex_ - ax_) / ((double) w_); //sy_ = (ey_ - ay_) / ((double) h_); sy_ = sx_; sprintf(winTitle_, "%s %d*%d i=%d", progname, w_, h_, maxi_); updateWinTitle(); //printf("My reshape!\n"); glViewport(0, 0, w, h); prepareWorldSpace(); // Render again the image postRedisplay(); } //------------------ USER INPUT HANDLERS ------------------ void mousemove(int x, int y) { int difference[2]; // Calculate 
the change in mouse position difference[0] = mouseClickPos[0] - x; difference[1] = mouseClickPos[1] - y; // If the left mouse button is held down, translate the set if (mouseDown[1]) { ax_ = ax0_ + ((double)difference[0])*sx_; ay_ = ay0_ + ((double)difference[1])*sy_; ex_ = ex0_ + ((double)difference[0])*sx_; ey_ = ey0_ + ((double)difference[1])*sy_; if ((mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP)) { prepareWorldSpace(); //InitializeVertexProgramNV(); } postRedisplay(); } mousePos[0] = x; mousePos[1] = y; } void mouseclick(int button, int down, int x, int y) { // Update mouse states switch(button) { case GLUT_LEFT_BUTTON: if (down == GLUT_DOWN) { mouseDown[0] = true; } else { mouseDown[0] = false; } break; case GLUT_MIDDLE_BUTTON: if (down == GLUT_DOWN) mouseDown[1] = true; else mouseDown[1] = false; break; case GLUT_RIGHT_BUTTON: if (down == GLUT_DOWN) mouseDown[2] = true; else mouseDown[2] = false; break; } ax0_ = ax_; ay0_ = ay_; ex0_ = ex_; ey0_ = ey_; mouseClickPos[0] = x; mouseClickPos[1] = y; } void processNormalKeys(unsigned char key, int x, int y) { glutKeyModif_ = glutGetModifiers(); if (key == 27) exit(0); // ESC to quit key |= 0x20; if (key == '1') { mode_ = RENDER_MODE_FPU_C; if (vp_ != NULL) { delete vp_; vp_ = NULL; }; prepareWorldSpace(); postRedisplay(); } else if (key == '2') { mode_ = RENDER_MODE_FPU_ASM; if (vp_ != NULL) { delete vp_; vp_ = NULL; }; prepareWorldSpace(); postRedisplay(); } else if (key == '3') { if (avail_SSE) { mode_ = RENDER_MODE_CPU_SSE; if (vp_ != NULL) { delete vp_; vp_ = NULL; }; prepareWorldSpace(); postRedisplay(); } } else if (key == '4') { if (avail_SSE2) { mode_ = RENDER_MODE_CPU_SSE2; if (vp_ != NULL) { delete vp_; vp_ = NULL; }; prepareWorldSpace(); postRedisplay(); } } else if (key == '5') { if (avail_3DNow) { mode_ = RENDER_MODE_CPU_3DNOW; if (vp_ != NULL) { delete vp_; vp_ = NULL; }; prepareWorldSpace(); postRedisplay(); } } else if (key == '9') { // Delete previous GPUPrograms if (vp_ != NULL) 
{ delete vp_; vp_ = NULL; }; // Save maxi_ in case of failure unsigned int maxi_old = maxi_; // Try ARB Fragment Program if (maxi_ > 10) maxi_ = 20; vp_ = new FragmentProgramARB10(maxi_, w_, h_, ax_, ay_, ex_, ey_); if (vp_->isValid()) { mode_ = RENDER_MODE_GPU_FP; prepareWorldSpace(); postRedisplay(); } else { delete vp_; vp_ = NULL; maxi_ = maxi_old; // Fragment Programs not supported on this card... please code it and tell me :) printf("FragmentPrograms not available or not supported on this card\n"); //prepareWorldSpace(); //postRedisplay(); return; } } else if (key == '0') { // Delete previous GPUPrograms if (vp_ != NULL) { delete vp_; vp_ = NULL; }; // Save maxi_ in case of failure unsigned int maxi_old = maxi_; #ifndef __APPLE__ // Try with nVidia if (maxi_ > 60) maxi_ = 60; vp_ = new VertexProgramNV(maxi_, w_, h_, ax_, ay_, ex_, ey_); if (vp_->isValid()) { } #else // We do not support nVidia custom extensions on the Mac, sorry. if (false); #endif else { maxi_ = maxi_old; delete vp_; vp_ = NULL; // Try with ATI if (maxi_ > 10) maxi_ = 10; // This fixes the MORTAL PAUSE when starting using a long VP !!! ATI has resolved this bug. vp_ = new VertexProgramATI(maxi_, w_, h_, ax_, ay_, ex_, ey_); if (vp_->isValid()) { } else { delete vp_; vp_ = NULL; maxi_ = maxi_old; // Vertex Programs not supported on this card... 
please code it and tell me :) printf("VertexPrograms not available or not supported on this card\n"); //prepareWorldSpace(); //postRedisplay(); return; } } mode_ = RENDER_MODE_GPU_VP; prepareWorldSpace(); postRedisplay(); } else if (key == ' ') { // Calculate image postRedisplay(); } else if (key == '-') { // Dec iters int nmaxi = maxi_; if (glutKeyModif_ & GLUT_ACTIVE_SHIFT) nmaxi -= 20; else nmaxi--; if (nmaxi > 0) maxi_ = nmaxi; if ((mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP)) { vp_->initialize(maxi_, w_, h_, ax_, ay_, ex_, ey_); } postRedisplay(); } else if (key == '+') { // Inc iters int nmaxi = maxi_; if (glutKeyModif_ & GLUT_ACTIVE_SHIFT) nmaxi += 20; else nmaxi++; if (nmaxi < 9999) maxi_ = nmaxi; if ((mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP)) { vp_->initialize(maxi_, w_, h_, ax_, ay_, ex_, ey_); } postRedisplay(); } else if (key == '*') { // Rotate palette right rotatePalette(+1); } else if (key == '/') { // Rotate palette left rotatePalette(-1); } else if (key == 'd') { doublebuf_ = !doublebuf_; if (doublebuf_) { glDrawBuffer(GL_BACK); } else { glDrawBuffer(GL_FRONT); } // Calculate image postRedisplay(); } else if (key == 's') { printf("Stats not yet supported.\n"); //printf(" Total iters: %d\n", pixBuf_->getTotalIters(maxi_, ITER_BLACK)); } else if (key == 'h') { printHelp(); } else if (key == 'b') { benchmarking_ = true; glutReshapeWindow(500, 500); // Calculate image postRedisplay(); } else if (key == 'r') { maxi_ = 40; ax_ = -2.0f; ay_ = -1.5f; ex_ = 1.0f; ey_ = 1.5f; four = 4.0f; myReshape(w_, h_); } else if (key == 'o') { doublebuf_ = false; glDrawBuffer(GL_FRONT); //printf("----------------- NEW ORBIT --------------------\n"); // Display orbit // C row calculation routine // Calc vars double cx = ax_ + sx_*((double)x); double cy = ay_ + sy_*((double)y); double zx, zy; double zx2, zy2; // Calc Pixel zx = cx; zy = cy; glColor3d(1.0, 1.0, 1.0); glBegin(GL_POINTS); for (unsigned int i=0; i 4) break; zy = 2*zx*zy; 
zx = zx2 - zy2; zx += cx; zy += cy; } glEnd(); glFlush(); } updateWinTitle(); } //------------------ Render the image ------------------- bool renderImage(unsigned int maxi, int mode) { //printf("%f,%f %d\n", ax_, ay_, maxi); if (h_ == 0) return true; glClear(GL_COLOR_BUFFER_BIT); //printf("ax=%f, ay=%f, ex=%f, ey=%f\n", ax_, ay_, ex_, ey_); if (mode == RENDER_MODE_GPU_FP) { glBegin(GL_QUADS); glTexCoord4f(ax_, ay_, 1.0, ay_); //glVertex2f(0,0); glVertex2f(ax_,ay_); glTexCoord4f(ex_, ay_, 1.0, ay_); //glVertex2f(1,0); glVertex2f(ex_, ay_); glTexCoord4f(ex_, ey_, 1.0, ey_); //glVertex2f(1,1); glVertex2f(ex_, ey_); glTexCoord4f(ax_, ey_, 1.0, ey_); //glVertex2f(0,1); glVertex2f(ax_, ey_); glEnd(); /* glBegin(GL_QUADS); glTexCoord4f(ax_, ay_, ay_, 1.0); //glVertex2f(0,0); glVertex2f(ax_,ay_); glTexCoord4f(ex_, ay_, ay_, 1.0); //glVertex2f(1,0); glVertex2f(ex_, ay_); glTexCoord4f(ex_, ey_, ey_, 1.0); //glVertex2f(1,1); glVertex2f(ex_, ey_); glTexCoord4f(ax_, ey_, ey_, 1.0); //glVertex2f(0,1); glVertex2f(ax_, ey_); glEnd(); */ glFlush(); return true; } // Wait until sync == 0. If > 0, dec sync and begin execution while (slavesWorking_ != 0) { #ifdef _WIN32 Sleep(1); #else usleep(1000); // 1000 #endif } if (mode == RENDER_MODE_GPU_VP) { for (int y=0; ysetRowCalculated(y, false); // Start the slave threads (on SMP machines) int neededSlaves = numCPU_ - 1; int endRow = h_/numCPU_; // Check for threads to use (max one per row) if (neededSlaves > h_) { endRow = 1; neededSlaves = h_-1; } #ifdef _WIN32 Sleep(3); #else usleep(3000); #endif // Start threads slavesWorking_ = neededSlaves; bool complete = false; // The master thread (this) calcs+render his slice for (y=0; yisRowCalculated(y)) renderPixelRow(y, mode); else complete = false; } } //slavesWorking_ = 0; //while (slavesWorking_ != 0) Sleep(1); // Flush damn OpenGL queue (needed!) 
glFlush(); return true; } /** * Render the given pixel row for the given mode * @return true if row calc was ok, false otherwise */ bool renderPixelRow(int row, int mode) { //if (!pixBuf_->isRowCalculated(row)) return false; if (mode == RENDER_MODE_GPU_VP) { double cx = ax_; double cy = ay_ + sy_*((double)row); if (useGLArrays_) { /* enable arrays client state */ glEnableClientState(GL_VERTEX_ARRAY); GLfloat* arrV = glArrVertex_; for (int x=0; xgetRowPointer(row); if (rowBuffer == NULL) return false; //printf("Rendering row %d (%p)\n", row, rowBuffer); int i = 0; unsigned int c = 0; GLfloat x = 0.0f; if (useGLArrays_) { /* enable arrays client state */ glEnableClientState(GL_COLOR_ARRAY); glEnableClientState(GL_VERTEX_ARRAY); GLubyte* arrC = glArrColor_; GLfloat* arrV = glArrVertex_; GLfloat frow = (GLfloat) row; for (i=0; i < w_; i++) { x += 1.0f; //printf("%u\n", i); //printf("%p\t%p\t%p\n", rowBuffer, arrC, arrV); c = *rowBuffer; //printf("c=%u", c); fflush(stdout); if (c == ITER_BLACK) { *arrC++ = (GLubyte) 0; *arrC++ = (GLubyte) 0; *arrC++ = (GLubyte) 0; } else { *arrC++ = colorTable_[c*3]; *arrC++ = colorTable_[c*3+1]; *arrC++ = colorTable_[c*3+2]; } *arrV++ = x; *arrV++ = frow; rowBuffer++; } glColorPointer(3, GL_UNSIGNED_BYTE, 0, glArrColor_); glVertexPointer(2, GL_FLOAT, 0, glArrVertex_); glDrawArrays(GL_POINTS, 0, w_); /* disable arrays client state */ glDisableClientState(GL_COLOR_ARRAY); glDisableClientState(GL_VERTEX_ARRAY); } else { glBegin(GL_POINTS); for (int x=0; xsetRowCalculated(row, false); bool retcode = false; if (mode == RENDER_MODE_FPU_C) retcode = calcPixelRow_C(row, maxi); else if (mode == RENDER_MODE_FPU_ASM) retcode = calcPixelRow_FPU_ASM(row, maxi); else if (mode == RENDER_MODE_CPU_SSE) retcode = calcPixelRow_CPU_SSE(row, maxi); else if (mode == RENDER_MODE_CPU_SSE2) retcode = calcPixelRow_CPU_SSE2(row, maxi); else if (mode == RENDER_MODE_CPU_3DNOW) retcode = calcPixelRow_CPU_3DNOW(row, maxi); if (retcode) pixBuf_->setRowCalculated(row, 
true); // TODO: Row agent not yet supported return retcode; } #pragma GCC optimization_level 3 // optimize this for GCC. On a Dual G5 2.5 GHz machine, performance equal to FPU ASM routine! /** * Calculate the given pixel row using C mode * @return true if row calc was ok, false otherwise */ bool calcPixelRow_C(int row, unsigned int maxi) { unsigned int i; int x; unsigned int* rowBuffer = pixBuf_->getRowPointer(row); //printf("(rowBuffer[%d]=%p)", row, rowBuffer); // C row calculation routine // Calc vars register double cx = ax_; register double cy = ay_ + sy_*((double)row); register double zx, zy; register double zx2, zy2; //printf("%f, %f\n", cx, cy);calcPixelRow_FPU_ASM_MIPS for (x=0; x 4) break; zy = 2*zx*zy; zx = zx2 - zy2; zx += cx; zy += cy; } cx += sx_; if (i == maxi) *rowBuffer = ITER_BLACK; else *rowBuffer = i; //printf("i=%u", *rowBuffer); fflush(stdout); rowBuffer++; } // End of calculation routine return true; } #pragma GCC optimization_level reset // reset the optimization for GCC /** * Calculate the given pixel row using ASM_FPU mode * @return true if row calc was ok, false otherwise */ bool calcPixelRow_FPU_ASM(int row, unsigned int maxi) { unsigned int* rowBuffer = pixBuf_->getRowPointer(row); double _cy = ay_ + sy_*((double)row); #if defined(__APPLE__) && defined(__BIG_ENDIAN__) // Pretty speedy (but unoptimized PowerPC FPU code). // I guess with 2x unrolling we'll gain some extra speed. // The PowerPC FPU is nicely implemented. 
// PowerPC kernel: walks the row one pixel at a time. CTR is loaded with maxi
// for the inner loop and the iteration count is recovered afterwards as
// maxi - CTR; a pixel that used all maxi iterations is stored as iter_black.
// NOTE(review): every operand is declared as an input, yet the code writes to
// several of them (%[i], %[x], %[rowBuffer], %[cx], %[zx], %[zy], ...), and
// neither CTR nor memory appears in the clobber list - check against the GCC
// extended-asm rules before relying on this under optimization.
    asm volatile (
        "NxtPix: \n"
        " fmr %[zx], %[cx] \n"
        " fmr %[zy], %[cy] \n"
        " \n"
        " mtctr %[maxi] \n"
        "NxtI: \n"
        " fmul %[zx2], %[zx], %[zx] \n"
        " fmul %[zy2], %[zy], %[zy] \n"
        " fmul %[zy], %[zx], %[zy] \n"
        " fadd %[m2], %[zx2], %[zy2] \n"
        " fsub %[zx], %[zx2], %[zy2] \n"
        " fadd %[zy], %[zy], %[zy] \n"
        " fcmpu cr0, %[m2], %[fourd] \n"
        " bgt- DonePix \n"
        " fadd %[zx], %[zx], %[cx] \n"
        " fadd %[zy], %[zy], %[cy] \n"
        " bdnz+ NxtI \n"
        " \n"
        "DonePix: \n"
        " mfctr %[i] \n"
        " sub %[i], %[maxi], %[i] \n"
        " cmp cr0, %[i], %[maxi] \n"
        " bne+ NotBlack \n"
        " mr %[i], %[iter_black] \n"
        "NotBlack: \n"
        " stwx %[i], 0, %[rowBuffer] \n"
        " fadd %[cx], %[cx], %[sx] \n"
        " add %[rowBuffer], %[rowBuffer], %[four]\n"
        " sub. %[x], %[x], %[one] \n"
        " bne+ NxtPix \n"
        : /* output */
        //"=&r" (oldval), "=&r" (tmp), "=m" (*once_control)
        : /* input */
        [rowBuffer] "r" (rowBuffer),
        [x] "r" (w_),
        [maxi] "r" (maxi),
        [iter_black] "r" (ITER_BLACK),
        [i] "r" (0),
        [one] "r" (1),
        [four] "r" (4),
        [cx] "f" (ax_),
        [cy] "f" (_cy),
        [sx] "f" (sx_),
        [zx] "f" (0.0),
        [zy] "f" (0.0),
        [zx2] "f" (0.0),
        [zy2] "f" (0.0),
        [m2] "f" (0.0),
        [fourd] "f" (4.0)
        : /* clobbered */
        "cr0"
    );
    return true;
#elif defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__))
    // WOULD SOMEONE PLEASE TELL ME WHAT'S WRONG WITH THE CODE BELOW ?
    // I AM GETTING SICK OF THIS %#^&! AT&T SYNTAX.
    // REMOVE THE return false AND TEST IT PLEASE. [Daniele]
    // The early return below makes this path report failure and leaves the
    // assembly that follows unreachable.
    return false;
    // x86 for GCC lame syntax.
    // Register use in the block: esi = output pointer (rowBuffer),
    // edi = maxi, edx = pixels left in the row, ecx = iteration counter.
    __asm __volatile (
        //"finit;"
        "movl %[rowBuffer], %%esi;"
        "fldl %[four];"
        "fldl %[ax_];"
        "fldl %[_cy];"
        "movl %[maxi], %%edi;"
        "movl %[w_], %%edx;"
        "nxtpixF:"
        "fld %%st(1);"
        "fld %%st(1);"
        "xorl %%ecx, %%ecx;"
        "iterloopF:"
        "fld %%st(1);"
        "fmul %%st, %%st;"
        "fld %%st(1);"
        "fmul %%st, %%st;"
        "fxch %%st(1);"
        "fld %%st;"
        "fadd %%st(2), %%st;"
        "fcomp %%st(7);"
        "fnstsw %%ax;"
        "sahf;"
        "fsubrp %%st, %%st(1);"
        "jnc donepixF;"
        "fadd %%st(4), %%st;"
        "cmpl %%edi, %%ecx;"
        "fxch %%st(2);"
        "fadd %%st, %%st;"
        "jz donepixF;"
        "fmulp %%st, %%st(1);"
        "addl $1, %%ecx;"
        "fadd %%st(2), %%st;"
        "jmp iterloopF;"
        "donepixF:"
        "fstp %%st;"
        "fstp %%st;"
        "fstp %%st;"
        "cmpl %%edi, %%ecx;"
        "fldl %[sx_];"
        "jnz notblackF;"
        "movl $0xFFFFFFFF, %%ecx;" // ITER_BLACK constant.
        "notblackF:"
        "movl %%ecx, (%%esi);"
        "faddp %%st, %%st(2);"
        "addl $4, %%esi;"
        "subl $1, %%edx;"
        "jnz nxtpixF;"
        "fstp %%st;"
        "fstp %%st;"
        "fstp %%st;"
        : /* output */
        : /* input */
        [rowBuffer] "m" (rowBuffer),
        [four] "m" (four),
        [ax_] "m" (ax_),
        [_cy] "m" (_cy),
        [maxi] "m" (maxi),
        [w_] "m" (w_),
        [sx_] "m" (sx_)
        : /* clobbered */
        "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
    );
    return true;
#elif defined(WIN32)
    // x86
    // Same register roles as the GCC variant above: esi = output pointer,
    // edi = maxi, edx = pixels left in the row, ecx = iteration counter.
    _asm {
        finit // This is required to avoid VisualC default lower FPU precision.
        mov esi, rowBuffer;
        fld four // STACK: 4
        fld ax_ // STACK: ax 4
        fld _cy // STACK: ay ax 4
        mov edi,maxi // edi = maxiters
        //mov si,600
        mov edx,w_
    nxtpix:
        fld st(1) // STACK: cx ay ax 4
        fld st(1) // STACK: cy cx ay ax 4
        xor ecx,ecx // First iter
    iterloop:
        fld st(1) // STACK: zx cy cx ay ax 4
        fmul st,st // STACK: zy*zx cy cx ay ax 4
        fld st(1) // STACK: zy zx*zx cy cx ay ax 4
        fmul st,st // STACK: zy*zy zx*zx cy cx ay ax 4
        fxch st(1) // STACK: zx*zx zy*zy cy cx ay ax 4
        fld st // STACK: zx*zx zx*zx zy*zy cy cx ay ax 4
        fadd st,st(2) // STACK: zx*zx+zy*zy zx*zx zy*zy cy cx ay ax 4
        fcomp st(7) // STACK: zx*zx zy*zy cy cx ay ax 4
        fnstsw ax
        sahf // Check for modulo^2 > 4
        fsubrp st(1),st // STACK: zx*zx-zy*zy cy cx ay ax 4
        jnc donepix
        fadd st,st(4) // STACK: zx*zx-zy*zy+ay cy cx ay ax 4
        cmp ecx,edi // Check for maxiters
        fxch st(2) // STACK: zx*zx-zy*zy+ay cy cx ay ax 4
        fadd st,st // STACK: 2*cy zx*zx-zy*zy+ay cx ay ax 4
        jz donepix
        fmulp st(1),st
        add ecx, 1
        fadd st,st(2)
        jmp iterloop
    donepix:
        fstp st
        fstp st
        fstp st
        cmp ecx,edi // Check for maxiters
        fld sx_
        jnz notblack
        mov ecx,ITER_BLACK
    notblack:
        mov dword ptr [esi], ecx //;SET PIXEL !
        faddp st(2),st
        add esi,4 ;Change to next pixBuf element
        sub edx,1
        // NOTE(review): the target is spelled NxtPix while the label above is
        // nxtpix - presumably relies on case-insensitive label matching in
        // the inline assembler; confirm.
        jnz NxtPix
        fstp st
        fstp st
        fstp st
    }
    return true;
#elif defined(sgi)
    // IRIX: delegate to the external MIPS assembly routine.
    return calcPixelRow_FPU_ASM_MIPS(rowBuffer, maxi, ITER_BLACK, w_, ax_, _cy, sx_);
#else
    // Unsupported.
return false; #endif } /** * Calculate the given pixel row using ASM_SSE mode * @return true if row calc was ok, false otherwise */ bool calcPixelRow_CPU_SSE(int row, unsigned int maxi) { unsigned int* rowBuffer = pixBuf_->getRowPointer(row); // C row calculation routine // Calc vars //double _cx = ax_; float _cx = (float)ax_; float _cy = (float)(ay_ + sy_*((float)row)); float _sx = (float)sy_; #if defined(__APPLE__) || defined(__linux__) float __attribute__ ((aligned(16))) cxs[] = {(float)_cx, (float)_cx, (float)_cx, (float)_cx}; float __attribute__ ((aligned(16))) cys[] = {(float)_cy, (float)_cy, (float)_cy, (float)_cy}; float __attribute__ ((aligned(16))) coeffs[] = {0, (float)_sx, 2*((float)_sx), 3* ((float)_sx)}; float __attribute__ ((aligned(16))) qsx[] = {4*((float)sx_), 4*((float)sx_), 4* ((float)sx_), 4*((float)sx_)}; float __attribute__ ((aligned(16))) zeros[] = {0.0,0.0,0.0,0.0}; float __attribute__ ((aligned(16))) four[] = {4.0,4.0,4.0,4.0}; unsigned int __attribute__ ((aligned(16))) ones[] = {1,1,1,1}; unsigned int __attribute__ ((aligned(16))) iter_black[] = {ITER_BLACK, ITER_BLACK, ITER_BLACK, ITER_BLACK}; unsigned int __attribute__ ((aligned(16))) iter_max[] = {maxi, maxi, maxi, maxi}; #endif #if defined(__APPLE__) && defined(__BIG_ENDIAN__) // Mac geek, this is what you are looking for. // PLEASE NOTE: This code is preliminary and not yet optimized. // Hope that gcc developers will add support for gcc stile vector registers. // [Daniele] asm volatile ( ".align 8 \n" " vxor v0, v0, v0 \n" " lvx v1, 0, %[cxs] \n" // cx " lvx v2, 0, %[cys] \n" // cy " lvx v3, 0, %[coeffs] \n" " vaddfp v1, v1, v3 \n" " lvx v4, 0, %[four] \n" " lvx v6, 0, %[qsx] \n" " lvx v7, 0, %[iter_black] \n" " lvx v15, 0, %[iter_max] \n" " \n" "NxtPixA: \n" " mtctr %[maxi] \n" " vor v8, v1, v1 \n" // zx " vor v9, v2, v2 \n" // zy " vxor v20, v20, v20 \n" // Quad iters counter. " lvx v21, 0, %[ones] \n" // Quad iters incrementers. 
"NxtIA: \n" " vmaddfp v10, v8, v8, v0 \n" // zx2 " vmaddfp v11, v9, v9, v0 \n" // zy2 " vmaddfp v9, v8, v9, v0 \n" // zy_ = zx * zy " vaddfp v12, v10, v11 \n" // modulo2 = zx2 + zy2 " vsubfp v8, v10, v11 \n" // zx = zx2 - zy2 " vaddfp v9, v9, v9 \n" // zy = 2 * zx * zy " vcmpgtfp. v13, v4, v12 \n" // Set iter mask (if all elements are in bailout then CR6[2] is set and the quad is done). " beq- cr6, DonePixA \n" " vand v21, v21, v13 \n" // Mask the incrementers. " vadduwm v20, v20, v21 \n" // Inc quad iters. " vaddfp v8, v8, v1 \n" // zx = zx + cx " vaddfp v9, v9, v2 \n" // zy = zy + cy " bdnz+ NxtIA \n" " \n" "DonePixA: \n" " vcmpequw v13, v20, v15 \n" " vandc v20, v20, v13 \n" " vand v21, v7, v13 \n" " vor v20, v20, v21 \n" " stvx v20, 0, %[rowBuffer] \n" " vaddfp v1, v1, v6 \n" // Step to next quad pixels. " add %[rowBuffer], %[rowBuffer], %[sixteen]\n" " sub. %[x], %[x], %[one] \n" " bne+ NxtPixA \n" : /* output */ //"=&r" (oldval), "=&r" (tmp), "=m" (*once_control) : /* input */ [cxs] "r" (cxs), [cys] "r" (cys), [coeffs] "r" (coeffs), [qsx] "r" (qsx), [zeros] "r" (zeros), [ones] "r" (ones), [one] "r" (1), [four] "r" (four), [rowBuffer] "r" (rowBuffer), [x] "r" (w_/4), [maxi] "r" (maxi), [iter_black] "r" (iter_black), [iter_max] "r" (iter_max), [sixteen] "r" (16) : /* clobbered */ "cr0", "cr6", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v15", "v20", "v21" ); return true; #endif #if defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__)) // x86 for GCC lame syntax. 
__asm __volatile ( "movl %[rowBuffer], %%esi;" "movl %[maxi], %%edi;" "movl %[w_], %%edx;" "addl $3, %%edx;" "shrl $2, %%edx;" "movss %[_cx], %%xmm6;" "shufps $0, %%xmm6, %%xmm6;" "movaps %[coeffs], %%xmm5;" "addps %%xmm5, %%xmm6;" "movss %[_cy], %%xmm7;" "shufps $0, %%xmm7, %%xmm7;" "movaps %[four], %%xmm5;" "nxtpixS:" "movaps %%xmm6, %%xmm0;" "movaps %%xmm7, %%xmm1;" "xorps %%xmm4, %%xmm4;" "movl %%edi, %%ecx;" "iterloopS:" "movaps %%xmm0, %%xmm2;" "mulps %%xmm0, %%xmm0;" "movaps %%xmm1, %%xmm3;" "addps %%xmm1, %%xmm1;" "mulps %%xmm2, %%xmm1;" "movaps %%xmm0, %%xmm2;" "mulps %%xmm3, %%xmm3;" "addps %%xmm7, %%xmm1;" "subps %%xmm3, %%xmm0;" "addps %%xmm3, %%xmm2;" "cmpleps %%xmm5, %%xmm2;" "addps %%xmm6, %%xmm0;" "movmskps %%xmm2, %%eax;" "testl %%eax, %%eax;" "jz donepixS;" "andps %%xmm5, %%xmm2;" "addps %%xmm2, %%xmm4;" "subl $1, %%ecx;" "jnz iterloopS;" "donepixS:" "cvtss2si %%xmm4, %%ecx;" "movl $0xffffffff, %%eax;" "shrl $2, %%ecx;" "cmpl %%edi, %%ecx;" "cmovel %%eax, %%ecx;" "movl %%ecx, (%%esi);" "shufps $0xe5, %%xmm4, %%xmm4;" "cvtss2si %%xmm4, %%ecx;" "shrl $2, %%ecx;" "cmpl %%edi, %%ecx;" "cmovel %%eax, %%ecx;" "movl %%ecx, 4(%%esi);" "shufps $0xe6, %%xmm4, %%xmm4;" "cvtss2si %%xmm4, %%ecx;" "shrl $2, %%ecx;" "cmpl %%edi, %%ecx;" "cmovel %%eax, %%ecx;" "movl %%ecx, 8(%%esi);" "shufps $0xe7, %%xmm4, %%xmm4;" "cvtss2si %%xmm4, %%ecx;" "shrl $2, %%ecx;" "cmpl %%edi, %%ecx;" "cmovel %%eax, %%ecx;" "movl %%ecx, 12(%%esi);" "addl $16, %%esi;" "subl $1, %%edx;" "addps %[qsx], %%xmm6;" "jnz nxtpixS;" : /* output */ : /* input */ [rowBuffer] "m" (rowBuffer), [maxi] "m" (maxi), [w_] "m" (w_), [_cx] "m" (_cx), [_cy] "m" (_cy), [coeffs] "m" (*coeffs), [four] "m" (*four), [qsx] "m" (*qsx) : /* clobbered */ "%eax", "%ecx", "%edx", "%esi", "%edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); return true; #endif #if defined(WIN32) __declspec(align(64)) float coeffs[] = {0, (float)_sx, 2*((float)_sx), 3* ((float)_sx)}; __declspec(align(64)) float 
qsx[] = {4*((float)sx_), 4*((float)sx_), 4* ((float)sx_), 4*((float)sx_)}; __declspec(align(64)) float four[] = {4,4,4,4}; //__declspec(align(8)) unsigned short masktest[] = {1,2,4,8}; //__declspec(align(8)) unsigned short maskshift[] = {0,1,2,3}; //__declspec(align(64)) float mytest[] = {1,2,3,5}; // xmm0 re // xmm1 im // xmm2 tmp1 // xmm3 tmp2 // xmm4 count // xmm5 zmax // xmm6 _cx // xmm7 _cy _asm { //femms; mov esi, rowBuffer; mov edi, maxi // edi = maxiters mov edx,w_ // edx = num of pixels add edx,3 // pad row to quadpixel boundary (multiple of four) shr edx,2 // edx = num of quadpixels movss xmm6, _cx // xmm6 = ?,?,?,cx shufps xmm6, xmm6, 0 // xmm6 = cx,cx,cx,cx movaps xmm5, dword ptr [coeffs] // xmm5 = 0,sx,2*sx,3*sx addps xmm6, xmm5 // xmm6 = zx0,zx1,zx2,zx3 movss xmm7, _cy // xmm7 = ?,?,?,cy shufps xmm7, xmm7, 0 // xmm7 = cy,cy,cy,cy movaps xmm5, dword ptr [four] // xmm5 = 4,4,4,4 align 16 /* // OLD v3.2.2 routine nxtpix: movaps xmm0, xmm6 // xmm0 = zx0,zx1,zx2,zx3 movaps xmm1, xmm7 // xmm1 = zy0, zy1, zy2, zy3 xorps xmm4, xmm4 // zero quadpixel iters counter mov ecx,edi // ecx = iters counter iterloop: movaps xmm2, xmm0 // xmm2 = zx0,zx1,zx2,zx3 mulps xmm2, xmm1 // xmm2 = zx0*zy0, zx1*zy1, zx2*zy2, zx3*zy3 mulps xmm0, xmm0 // xmm0 = zx0*zx0, zx1*zx1, zx2*zx2, zx3*zx3 mulps xmm1, xmm1 // xmm1 = zy0*zy0, zy1*zy1, zy2*zy2, zy3*zy3 addps xmm2, xmm2 // xmm2 = 2*zx0*zy0, 2*zx1*zy1, 2*zx2*zy2, 2*zx3*zy3 movaps xmm3, xmm0 // xmm3 = zx0*zx0, zx1*zx1, zx2*zx2, zx3*zx3 addps xmm3, xmm1 // xmm3 = zy0*zy0+zx0*zx0, zy1*zy1+zx1*zx1, zy2*zy2+zx2*zx2, zy3*zy3+zx3*zx3 cmpltps xmm3, xmm5 // xmm3 = <4,<4,<4,<4 movmskps eax, xmm3 // test eax, eax // Test for all ready jz donepix subps xmm0, xmm1 // re = re*re-im*im movaps xmm1, xmm2 andps xmm3, xmm5 addps xmm4, xmm3 addps xmm0, xmm6 // re = re + zinitre addps xmm1, xmm7 // im = im + zinitim dec ecx // dec iter counter jne iterloop */ // Peter Kankowski's optimized SSE routine. 
nxtpix: movaps xmm0, xmm6 movaps xmm1, xmm7 xorps xmm4, xmm4 mov ecx, edi iterloop: // xmm0 = zx xmm1 = zy movaps xmm2, xmm0 mulps xmm0, xmm0 movaps xmm3, xmm1 addps xmm1, xmm1 // xmm0 = zx^2 xmm1 = 2 * zy xmm2 = zx xmm3 = zy mulps xmm1, xmm2 movaps xmm2, xmm0 mulps xmm3, xmm3 // xmm0 = zx^2 xmm1 = 2*zy*zx xmm2 = zx^2 xmm3 = zy^2 addps xmm1, xmm7 subps xmm0, xmm3 addps xmm2, xmm3 // xmm0 = zx^2 - zy^2 xmm1=2*zy*zx+py xmm2 = zx^2 + zy^2 xmm3 = zy^2 cmpleps xmm2, xmm5 addps xmm0, xmm6 movmskps eax, xmm2 test eax, eax jz donepix andps xmm2, xmm5 // xmm4 += (xmm2 < 4.0) ? 4.0 : 0.0; addps xmm4, xmm2 sub ecx, 1 jnz iterloop donepix: cvtss2si ecx, xmm4 mov eax, ITER_BLACK shr ecx, 2 cmp ecx, edi cmove ecx, eax mov dword ptr [esi],ecx ;SET PIXEL ! shufps xmm4, xmm4, 0xe5 cvtss2si ecx, xmm4 shr ecx, 2 cmp ecx, edi cmove ecx, eax mov dword ptr [esi+4],ecx ;SET PIXEL ! shufps xmm4, xmm4, 0xe6 cvtss2si ecx, xmm4 shr ecx, 2 cmp ecx, edi cmove ecx, eax mov dword ptr [esi+8],ecx ;SET PIXEL ! shufps xmm4, xmm4, 0xe7 cvtss2si ecx, xmm4 shr ecx, 2 cmp ecx, edi cmove ecx, eax mov dword ptr [esi+12],ecx ;SET PIXEL ! 
add esi,16 //Change to next pixBuf elements sub edx,1 // Decrease number of quadpixels to compute addps xmm6, dword ptr [qsx] // xmm6 = next quadpixels cx[] jnz nxtpix //femms; } // End of calculation routine return true; #endif return false; } /** * Calculate the given pixel row using ASM_SSE2 mode * @return true if row calc was ok, false otherwise */ bool calcPixelRow_CPU_SSE2(int row, unsigned int maxi) { unsigned int* rowBuffer = pixBuf_->getRowPointer(row); // C row calculation routine // Calc vars //double _cx = ax_; double _cx = (double)ax_; double _cy = (double)(ay_ + sy_*((double)row)); double _sx = (double)sy_; #if defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__)) double __attribute__ ((aligned(16))) coeffs[] = {0, (double)_sx}; double __attribute__ ((aligned(16))) dsx[] = {2.0*((double)sx_), 2.0*((double)sx_)}; double __attribute__ ((aligned(16))) four[] = {4.0,4.0}; // x86 for GCC lame syntax. __asm __volatile ( "movl %[rowBuffer], %%esi;" "movl %[maxi], %%edi;" "movl %[w_], %%edx;" "addl $1, %%edx;" "shrl $1, %%edx;" "movsd %[_cx], %%xmm6;" "shufpd $0, %%xmm6, %%xmm6;" "movapd %[coeffs], %%xmm5;" "addpd %%xmm5, %%xmm6;" "movsd %[_cy], %%xmm7;" "shufpd $0, %%xmm7, %%xmm7;" "movapd %[four], %%xmm5;" "nxtpixS2:" "movapd %%xmm6, %%xmm0;" "movapd %%xmm7, %%xmm1;" "xorpd %%xmm4, %%xmm4;" "movl %%edi, %%ecx;" "iterloopS2:" "movapd %%xmm0, %%xmm2;" "mulpd %%xmm0, %%xmm0;" "movapd %%xmm1, %%xmm3;" "addpd %%xmm1, %%xmm1;" "mulpd %%xmm2, %%xmm1;" "movapd %%xmm0, %%xmm2;" "mulpd %%xmm3, %%xmm3;" "addpd %%xmm7, %%xmm1;" "subpd %%xmm3, %%xmm0;" "addpd %%xmm3, %%xmm2;" "cmplepd %%xmm5, %%xmm2;" "addpd %%xmm6, %%xmm0;" "movmskpd %%xmm2, %%eax;" "testl %%eax, %%eax;" "jz donepixS2;" "andpd %%xmm5, %%xmm2;" "addpd %%xmm2, %%xmm4;" "subl $1, %%ecx;" "jnz iterloopS2;" "donepixS2:" "cvtsd2si %%xmm4, %%ecx;" "movl $0xffffffff, %%eax;" "shrl $2, %%ecx;" "cmpl %%edi, %%ecx;" "cmovel %%eax, %%ecx;" "movl %%ecx, (%%esi);" "shufpd $3, %%xmm4, %%xmm4;" 
"cvtsd2si %%xmm4, %%ecx;" "shrl $2, %%ecx;" "cmpl %%edi, %%ecx;" "cmovel %%eax, %%ecx;" "movl %%ecx, 4(%%esi);" "addl $8, %%esi;" "subl $1, %%edx;" "addpd %[dsx], %%xmm6;" "jnz nxtpixS2;" : /* output */ : /* input */ [rowBuffer] "m" (rowBuffer), [maxi] "m" (maxi), [w_] "m" (w_), [_cx] "m" (_cx), [_cy] "m" (_cy), [coeffs] "m" (*coeffs), [four] "m" (*four), [dsx] "m" (*dsx) : /* clobbered */ "%eax", "%ecx", "%edx", "%esi", "%edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); return true; #endif #ifdef WIN32 __declspec(align(64)) double coeffs[] = {0, (double)_sx}; __declspec(align(64)) double dsx[] = {2*((double)sx_), 2*((double)sx_)}; __declspec(align(64)) double four[] = {4,4}; _asm { //femms; mov esi, rowBuffer; mov edi, maxi // edi = maxiters mov edx,w_ // edx = num of pixels add edx,1 // pad row to dualpixel boundary (multiple of four) shr edx,1 // edx = num of dualpixels movsd xmm6, _cx // xmm6 = ?,cx shufpd xmm6, xmm6, 0 // xmm6 = cx,cx movapd xmm5, dword ptr [coeffs] // xmm5 = 0,sx addpd xmm6, xmm5 // xmm6 = zx0,zx1 movsd xmm7, _cy // xmm7 = ?,cy shufpd xmm7, xmm7, 0 // xmm7 = cy,cy movapd xmm5, dword ptr [four] // xmm5 = 4,4 align 16 /* // OLD v3.2.2 routine nxtpix: movapd xmm0, xmm6 // xmm0 = zx0, zx1 movapd xmm1, xmm7 // xmm1 = zy0, zy1 xorpd xmm4, xmm4 // zero dualpixel iters counter mov ecx,edi // ecx = iters counter iterloop: movapd xmm2, xmm0 // xmm2 = zx0,zx1 mulpd xmm2, xmm1 // xmm2 = zx0*zy0, zx1*zy1 mulpd xmm0, xmm0 // xmm0 = zx0*zx0, zx1*zx1 mulpd xmm1, xmm1 // xmm1 = zy0*zy0, zy1*zy1 addpd xmm2, xmm2 // xmm2 = 2*zx0*zy0, 2*zx1*zy1 movapd xmm3, xmm0 // xmm3 = zx0*zx0, zx1*zx1 addpd xmm3, xmm1 // xmm3 = zy0*zy0+zx0*zx0, zy1*zy1+zx1*zx1 cmpltpd xmm3, xmm5 // xmm3 = <4,<4,<4,<4 movmskpd eax, xmm3 test eax, eax // Test for all ready jz donepix subpd xmm0, xmm1 // re = re*re-im*im movapd xmm1, xmm2 andpd xmm3, xmm5 addpd xmm4, xmm3 addpd xmm0, xmm6 // re = re + zinitre addpd xmm1, xmm7 // im = im + zinitim dec ecx // dec iter 
counter jne iterloop */ // Peter Kankowski's optimized SSE routine. nxtpix: movapd xmm0, xmm6 movapd xmm1, xmm7 xorpd xmm4, xmm4 mov ecx, edi iterloop: // xmm0 = zx xmm1 = zy movapd xmm2, xmm0 mulpd xmm0, xmm0 movapd xmm3, xmm1 addpd xmm1, xmm1 // xmm0 = zx^2 xmm1 = 2 * zy xmm2 = zx xmm3 = zy mulpd xmm1, xmm2 movapd xmm2, xmm0 mulpd xmm3, xmm3 // xmm0 = zx^2 xmm1 = 2*zy*zx xmm2 = zx^2 xmm3 = zy^2 addpd xmm1, xmm7 subpd xmm0, xmm3 addpd xmm2, xmm3 // xmm0 = zx^2 - zy^2 xmm1=2*zy*zx+py xmm2 = zx^2 + zy^2 xmm3 = zy^2 cmplepd xmm2, xmm5 addpd xmm0, xmm6 movmskpd eax, xmm2 test eax, eax jz donepix andpd xmm2, xmm5 // xmm4 += (xmm2 < 4.0) ? 4.0 : 0.0; addpd xmm4, xmm2 sub ecx, 1 jnz iterloop donepix: cvtsd2si ecx, xmm4 mov eax, ITER_BLACK shr ecx, 2 cmp ecx, edi cmove ecx, eax mov dword ptr [esi],ecx ;SET PIXEL ! shufpd xmm4, xmm4, 0x3 cvtsd2si ecx, xmm4 shr ecx, 2 cmp ecx, edi cmove ecx, eax mov dword ptr [esi+4],ecx ;SET PIXEL ! add esi,8 //Change to next pixBuf elements sub edx,1 // Decrease number of dualpixels to compute addpd xmm6, dword ptr [dsx] // xmm6 = next dualpixels cx[] jnz nxtpix //femms; } // End of calculation routine return true; #elif defined(sgi) return calcPixelRow_FPU_ASM_MIPSR8000(rowBuffer, maxi, ITER_BLACK, w_, ax_, _cy, sx_); #else // Unsupported. 
return false; #endif } /** * Calculate the given pixel row using CPU_3DNOW mode * Author: Gérard Basler * @return true if row calc was ok, false otherwise */ bool calcPixelRow_CPU_3DNOW(int row, unsigned int maxi) { unsigned int* rowBuffer = pixBuf_->getRowPointer(row); float _cx = (float)ax_; float _cy = (float)(ay_ + sy_*((double)row)); float _sx = (float)sy_; #ifdef _WIN32 __declspec(align(32)) float coeffs[] = {0, (float)_sx}, qsx[] = {2*((float)sx_), 2*((float)sx_)}, zmax[] = {4.0f, 4.0f}, cmax[] = {maxi, maxi}; // eax tmp // ebx tmp // ecx i // edx w_ // esi maxi // edi rowBuffer // mm0 cx (zinitre) // mm1 cy (zinitim) // mm2 zx (re) // mm3 zy (im) // mm4 count // mm5 zmax // mm6 tmp1 // mm7 tmp2 _asm { femms; push ebx; mov edx,w_ // edx = num of pixels add edx,1 // pad row to dualpixel boundary (multiple of four) shr edx,1 // edx = num of dualpixels mov esi, maxi mov edi, rowBuffer pxor mm0, mm0 movd mm0, _cx pfacc mm0, mm0 pfadd mm0, coeffs pxor mm1, mm1 movd mm1, _cy pfacc mm1, mm1 movq mm5, zmax mov ebx, ITER_BLACK // frame pointer destroyed! align 16 nxtpix: movq mm2, mm0 // re = zinitre movq mm3, mm1 // im = zinitim pxor mm4, mm4 // count = 0 mov ecx, esi // i = maxi iterloop: movq mm6, mm2 // tmp1 = re pfmul mm6, mm3 // tmp1 = re*im pfmul mm2, mm2 // re = re*re pfmul mm3, mm3 // im = im*im pfadd mm6, mm6 // tmp1 = 2*re*im // if ((num.real() * num.real() + num.imag() * num.imag()) > 4.0) movq mm7, mm2 // tmp2 = re*re pfadd mm7, mm3 // tmp2 = re*re+im*im // we need more registers, so free one pfsub mm2, mm3 // re = re*re-im*im movq mm3, mm6 // im = 2*re*im, tmp1 is now free movq mm6, mm5 // tmp1 = zmax pfcmpge mm6, mm7 // pcmpgtd mm6, mm7 // danger! integer compare! psubd mm4, mm6 // count = count - (compare) psrlq mm6, 1 // pmovmskb eax, mm6 // only athlon supports this movd eax, mm6 test eax,eax je donepix dec ecx // nCountMax-- //num += zInit pfadd mm2, mm0 // re = re + zinitre pfadd mm3, mm1 // im = im + zinitim // ... 
hope this helps the branch prediction jne iterloop donepix: //pcmpeqd mm6, cmax // short replacement for the code below //por mm4, mm6 // but requires one more memory op //movq [edi], mm4 movd ecx, mm4 cmp ecx, esi //cmove ecx, ebx // falco[SCT]: Conditional moves that ARE NOT supported on the K6. jne no_black0 mov ecx, ebx no_black0: mov dword ptr [edi],ecx // SET PIXEL ! punpckhdq mm4, mm4 // lo = hi movd ecx, mm4 cmp ecx, esi //cmove ecx, ebx // falco[SCT]: Conditional moves that ARE NOT supported on the K6. jne no_black1 mov ecx, ebx no_black1: mov dword ptr [edi+4],ecx // SET PIXEL ! // cx += sx_; pfadd mm0, qsx add edi, 2*4 dec edx jne nxtpix pop ebx; femms; } // End of calculation routine #endif return true; } bool checkSSE() { #if defined(__APPLE__) && defined(__BIG_ENDIAN__) // PowerPC (should work for Intel but I prefer asm code). int mib[2], value = 0; size_t len = sizeof(value); mib[0] = CTL_HW; mib[1] = HW_VECTORUNIT; int rc = sysctl(mib, 2, &value, &len, NULL, 0); if (rc == 0) return (value != 0); else return false; #endif bool ssehw = false; #if defined(WIN32) _asm { // Move the number 1 into eax - this will move the // feature bits into EDX when a CPUID is issued, that // is, EDX will then hold the key to the cpuid mov eax, 1 // Does this processor have SSE support? cpuid // Perform CPUID (puts processor feature info in EDX) // Shift the bits in edx to the right by 26, thus bit 25 // (SSE bit) is now in CF bit in EFLAGS register. shr edx,0x1A // If CF is not set, jump over next instruction jnc nocarryflag // set the return value to 1 if the CF flag is set mov [ssehw], 1 nocarryflag: } #elif defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__)) // x86 for GCC lame syntax. 
__asm __volatile ( "movl $1, %%eax;" "cpuid;" "shrl $0x1a, %%edx;" "jnc nocarryflag;" "movl $1, %0;" "nocarryflag:" : /* output */ "=m" (ssehw) : /* input */ : /* clobbered */ "%eax", "%edx" ); #endif return ssehw; } bool checkSSE2() { bool sse2hw = false; #if defined (WIN32) _asm { // Move the number 1 into eax - this will move the // feature bits into EDX when a CPUID is issued, that // is, EDX will then hold the key to the cpuid mov eax, 1 // Does this processor have SSE support? cpuid // Perform CPUID (puts processor feature info in EDX) // Shift the bits in edx to the right by 27, thus bit 26 // (SSE2 bit) is now in CF bit in EFLAGS register. shr edx,0x1B // If CF is not set, jump over next instruction jnc nocarryflag // set the return value to 1 if the CF flag is set mov [sse2hw], 1 nocarryflag: } #elif defined(__linux__) || (defined(__APPLE__) && defined(__LITTLE_ENDIAN__)) // x86 for GCC lame syntax. __asm __volatile ( "movl $1, %%eax;" "cpuid;" "shrl $0x1b, %%edx;" "jnc .nocarryflag2;" "movl $1, %0;" ".nocarryflag2:" : /* output */ "=m" (sse2hw) : /* input */ : /* clobbered */ "%eax", "%edx" ); #elif defined(sgi) // Check for MIPS processor with dual FPUs. char strCPU[512]; strCPU[0] = '\0'; long slen = sysinfo(_MIPS_SI_PROCESSORS, strCPU, sizeof(strCPU)-1); strCPU[511] = '\0'; // Just in case. printf("Detected MIPS CPUs: %s\n", strCPU); sse2hw = (strstr(strCPU, "R8000") != NULL); if (!sse2hw) sse2hw = (strstr(strCPU, "R10000") != NULL); if (!sse2hw) sse2hw = (strstr(strCPU, "R12000") != NULL); if (!sse2hw) sse2hw = (strstr(strCPU, "R14000") != NULL); if (!sse2hw) sse2hw = (strstr(strCPU, "R16000") != NULL); if (!sse2hw) sse2hw = (strstr(strCPU, "R18000") != NULL); // I can dream :) #endif return sse2hw; } // Check for 3DNow bool check3DNow() { bool b3DNow = false; #ifdef WIN32 _asm { mov eax, 80000000h ; CPUID function: Largest extended value cpuid cmp eax, 80000001h ; We can execute feature #1, right? jl No3dNow ; If not, we are done here. 
mov eax, 80000001h ; CPUID function: Signature + features cpuid shr edx, 31 ; bit 31 indicates 3DNow! support // If CF is not set, jump over next instruction jnc No3dNow // set the return value to 1 if the CF flag is set mov [b3DNow], 1 No3dNow: } #endif return b3DNow; } void prepareWorldSpace() { // Recalc real plane parameters sx_ = (ex_ - ax_) / ((double) w_); //sy_ = (ey_ - ay_) / ((double) h_); sy_ = sx_; // Setup world space depending on render mode if ((mode_ == RENDER_MODE_GPU_VP) || (mode_ == RENDER_MODE_GPU_FP)) { if (vp_ != NULL) vp_->prepareWorldSpace(w_, h_, ax_, ay_, ex_, ey_); } else { glMatrixMode(GL_PROJECTION); glLoadIdentity(); //if (w_ <= h_) gluOrtho2D(0.0f, (GLdouble)w_*(GLdouble)h_/(GLdouble)w_, (GLdouble)h_, 0.0f); //else gluOrtho2D(0.0f, (GLdouble)w_, (GLdouble)h_*(GLdouble)w_/(GLdouble)h_, 0.0f); gluOrtho2D(0, (double)w_, (double)h_, 0); glTranslated(.375, .375, 0); // Fixed bt Richard Rauch as per OpenGL Red Book. } } void prepareColorTable(int numColors, GLubyte startR, GLubyte startG, GLubyte startB) { // Allocate colortable memory if (colorTable_ != NULL) free(colorTable_); colorTable_ = (GLubyte*) malloc(numColors*3*sizeof(GLubyte)); if (colorTable_ == NULL) return; // Fill colortable int addR = 3; int addG = 4; int addB = -2; int r = startR; int g = startG; int b = startB; GLubyte* curCol = colorTable_; for (int i=0; i 240) { addR = -addR; r += addR; } if (g < 0) { g = -g; addG = -addG; } else if (g > 240) { addG = -addG; g += addG; } if (b < 0) { b = -b; addB = -addB; } else if (b > 240) { addB = -addB; b += addB; } } } void updateWinTitle() { char agent[256]; switch(mode_) { case RENDER_MODE_FPU_C: sprintf(agent, "%dxCPU FPU C", numCPU_); break; case RENDER_MODE_FPU_ASM: sprintf(agent, "%dxCPU FPU ASM", numCPU_); break; case RENDER_MODE_CPU_SSE: #if defined(__APPLE__) && defined(__BIG_ENDIAN__) // PowerPC sprintf(agent, "%dxCPU AltiVec ASM", numCPU_); #else // Intel x86 sprintf(agent, "%dxCPU SSE", numCPU_); #endif break; case 
RENDER_MODE_CPU_SSE2: #if defined (sgi) // MIPS sprintf(agent, "%dxCPU R8000 opt dual FPU ASM", numCPU_); #else sprintf(agent, "%dxCPU SSE2", numCPU_); #endif break; case RENDER_MODE_CPU_3DNOW: sprintf(agent, "%dxCPU 3DNow!", numCPU_); break; case RENDER_MODE_GPU_VP: sprintf(agent, "GPU VertP %s", glGetString(GL_RENDERER)); break; case RENDER_MODE_GPU_FP: sprintf(agent, "GPU FragP %s", glGetString(GL_RENDERER)); break; default: sprintf(agent, "UNKNOWN!"); } if (w_ == 0 && h_ == 0) { // initizl settings w_ = 500; h_ = 500; } sprintf(winTitle_, "%s %d*%d i=%d %s", progname, w_, h_, maxi_, agent); glutSetWindowTitle(winTitle_); } /** * A slave just calculates a slice of the image. * There is one slice for any CPU * Nothing more... we dont want a slave learning too many things ! :) */ #if defined(__APPLE__) || defined(sgi) || defined(__linux__) // UNIX void* slaveThreadCode(void* lpParameter) { #else // WIN32 DWORD WINAPI slaveThreadCode(LPVOID lpParameter) { #endif int slice = 1 + *((int*)lpParameter); int endRow; printf("Thread %d says: \"I'm a slave, I'm alive.\"\n", slice); while (true) { // Wait until sync > 0. 
while (slavesWorking_ == 0) {
            // Poll roughly once per millisecond until the master flags work.
#ifdef _WIN32
            Sleep(1);
#else
            usleep(1000);
#endif
        }
        // if this thread must calculate
        if (slice >= h_) {
            // More slices than rows: nothing to do for this slice, keep idling.
#ifdef _WIN32
            Sleep(1);
#else
            usleep(1000);
#endif
            continue;
        }
        // Calculate
        // Rows are split evenly between slices; the last slice also takes the
        // remainder, and with at least as many CPUs as rows each slice gets one row.
        int startRow = (h_/numCPU_)*slice;
        if (numCPU_ >= h_) {
            startRow = slice;
            endRow = startRow + 1;
        } else if (slice == (numCPU_-1)) endRow = h_;
        else endRow = (h_/numCPU_)*(slice+1);
        // NOTE(review): text is missing from this copy of the file starting inside
        // the loop header below (after "y"): the rest of slaveThreadCode and the
        // beginning of the benchmark routine are lost. Tokens are kept exactly as found.
        for (int y=startRow; yclearBuffer();
#if defined(__APPLE__) && __BIG_ENDIAN__ // PowerPC
    printf(" [4f] AltiVec benchmark:\n");
#elif defined(sgi) // IRIX
    printf(" [4f] Vector benchmark:\n");
#else // Intel x86
    printf(" [4f] SSE benchmark:\n");
#endif
    if (avail_SSE) {
        QueryPerformanceCounter(&timeTmp0);
        // NOTE(review): text is missing after "y" in the loop header below (timed
        // loop body and the start of the result printout). Same damage recurs in
        // each of the following benchmark sections.
        for (y=0; ygetTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
    } else {
        printf(" Not supported.\n");
    }
    // CPU SSE2
    pixBuf_->clearBuffer();
#if defined(__APPLE__) && __BIG_ENDIAN__ // PowerPC
    printf(" [2d] PowerPC970 dual FPU benchmark:\n");
#elif defined(sgi) // IRIX
    printf(" [2d] R8000 dual FPU benchmark:\n");
#else // Intel x86
    printf(" [2d] SSE2 benchmark:\n");
#endif
    if (avail_SSE2) {
        QueryPerformanceCounter(&timeTmp0);
        for (y=0; ygetTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
    } else {
        printf(" Not supported.\n");
    }
    // CPU 3DNOW!
    pixBuf_->clearBuffer();
    printf(" [2f] 3DNow! benchmark:\n");
    if (avail_3DNow) {
        QueryPerformanceCounter(&timeTmp0);
        for (y=0; ygetTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
    } else {
        printf(" Not supported.\n");
    }
    // FPU ASM
    pixBuf_->clearBuffer();
    printf(" [1d] FPU ASM benchmark:\n");
    QueryPerformanceCounter(&timeTmp0);
    for (y=0; ygetTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
    // FPU C
    pixBuf_->clearBuffer();
    printf(" [1d] FPU C benchmark:\n");
    QueryPerformanceCounter(&timeTmp0);
    for (y=0; ygetTotalIters(maxi_, ITER_BLACK))/timingf/1000000.0f);
    // GPU VP
    printf(" [4?] GPU VertexProgram benchmark (beta! maxiters=10) on %s:\n", glGetString(GL_RENDERER));
    pixBuf_->clearBuffer();
    maxi_ = 10;
    // Drop any existing GPU program object before creating the one under test.
    if (vp_ != NULL) {
        delete vp_;
        vp_ = NULL;
    }
    // Outside Apple/sgi builds the NV vertex program is tried first and the ATI
    // one only if it is not valid; on Apple/sgi the ATI one is always used.
#if !defined(__APPLE__) && !defined(sgi)
    vp_ = new VertexProgramNV(maxi_, w_, h_, ax_, ay_, ex_, ey_);
    if (!vp_->isValid()) {
        delete vp_;
#else
    if (true) {
#endif
        // Try with ATI VP
        vp_ = new VertexProgramATI(maxi_, w_, h_, ax_, ay_, ex_, ey_);
    }
    if (vp_->isValid()) {
        mode_ = RENDER_MODE_GPU_VP;
        QueryPerformanceCounter(&timeTmp0);
        int repeats = 200;
        // NOTE(review): text is missing after "i" in the loop header below (rest of
        // the vertex-program benchmark and the start of the fragment-program one).
        for (int i=0; iclearBuffer();
    maxi_ = 20;
    if (vp_ != NULL) {
        delete vp_;
        vp_ = NULL;
    }
    vp_ = new FragmentProgramARB10(maxi_, w_, h_, ax_, ay_, ex_, ey_);
    if (vp_->isValid()) {
        mode_ = RENDER_MODE_GPU_FP;
        QueryPerformanceCounter(&timeTmp0);
        int repeats = 200;
        // NOTE(review): text is missing after "i" in the loop header below: the end
        // of the benchmark routine and the beginning of the following code, which
        // shifts colorTable_ by one RGB entry and redraws, are lost.
        for (int i=0; i 0) {
        GLubyte* src = colorTable_;
        GLubyte* dst = colorTable_+3;
        // NOTE(review): 9998 is hard-coded here; presumably the table is expected
        // to hold 10000 entries - verify against the prepareColorTable caller.
        tmpR = *(dst+9998*3);
        tmpG = *(dst+9998*3+1);
        tmpB = *(dst+9998*3+2);
        memmove(dst, src, copySize);
        *(src) = tmpR;
        *(src+1) = tmpG;
        *(src+2) = tmpB;
    }
    // Redraw
    // NOTE(review): text is missing after "y" in the loop header below.
    for (int y=0; yisRowCalculated(y)) renderPixelRow(y, mode_);
    }
    if (doublebuf_) glutSwapBuffers();
}

/** Ask GLUT to redraw the current window. */
void postRedisplay(void) {
    glutPostRedisplay();
}

/**
 * Print the keyboard help to stdout. The list of CPU vector modes shown
 * depends on the build platform.
 */
void printHelp() {
    printf("\nKeys:\n");
    printf(" 1: Lame FPU computation, C code.\n");
    printf(" 2: Fast FPU computation, 100%% machine code.\n");
#if defined(__APPLE__) && __BIG_ENDIAN__ // PowerPC
    printf(" 3: Quadfast AltiVec computation, 100%% machine code.\n");
#elif defined (sgi) // MIPS
    printf(" 4: Dualfast (R8000 opt) computation, 100%% machine code.\n");
#else // Intel x86
    printf(" 3: Quadfast SSE computation, 100%% machine code.\n");
    printf(" 4: Dualfast SSE2 computation, 100%% machine code.\n");
    printf(" 5: Dualfast 3DNow computation, 100%% machine code.\n");
#endif
    printf(" 9: Experimental GPU Fragment Program computation (OpenGL 1.3 ARB only)!\n");
    printf(" 0: Experimental GPU Vertex Program computation (nVidia or ATI cards only)!\n");
    printf(" d: Toggle double/single buffer (may not work on some cards).\n");
    printf(" +,-: Inc/Dec max iters (press shift for +/-20).\n");
    printf(" /,*: Rotate palette (press 'd' if this does not work).\n");
    printf(" h: Shows this help.\n");
    printf(" o: Draw orbits (single buffered mode only).\n");
    printf(" r: Reset zoom position.\n");
    printf(" b: Speed benchmark in current mode (resets max iters to 40).\n");
    printf(" See result in the console.\n");
}