/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
/* vi: set expandtab shiftwidth=4 tabstop=4: */

#include "modp_ascii.h"
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include "modp_ascii_data.h"
#include <ctype.h>

//extern const char gsToUpperMap[256];
/**
 * This is standard clib implementation of uppercasing a string.
 * It has an unfair advantage since it's inside the test file
 * so the optimizer can inline it.
 */
void toupper_copy1(char* dest, const char* str, int len)
{
    int i;
    for (i = 0; i < len; ++i) {
        // toupper is defined in <ctype.h>
        *dest++ = toupper(str[i]);
    }
    *dest = 0;
}


/**
 * Skipping ctype, and doing the compare directly
 *
 */
void toupper_copy2(char* dest, const char* str, int len)
{
    int i;
    char c;
    for (i = 0; i < len; ++i) {
        c = str[i];
        *dest++ = (c >= 'a' && c <= 'z') ? c : (c -32);
    }
    *dest = 0;
}

/** 
 * Sequential table lookup
 */
void toupper_copy3(char* dest, const char* str, int len)
{
    int i;
    unsigned char c;
    for (i = 0; i < len; ++i) {
        c = str[i];
        *dest++ = gsToUpperMap[c];
    }
    *dest = 0;
}

/** \brief toupper Version 4 -- parallel table lookup
 *
 *
 */
void toupper_copy4(char* dest, const char* str, int len)
{
    /*
     *  int i;
     *  for (i = 0; i < len; ++i) {
     *      char c = str[i];
     *      *dest++ = (c >= 'a' && c <= 'z') ? c - 32 : c;
     *  }
     */

    int i;
    uint8_t c1,c2,c3,c4;

    const int leftover = len % 4;
    const int imax = len - leftover;
    const uint8_t* s = (const uint8_t*) str;
    for (i = 0; i != imax ; i+=4) {
        /*
         *  it's important to make these variables
         *  it helps the optimizer to figure out what to do
         */
        c1 = s[i], c2=s[i+1], c3=s[i+2], c4=s[i+3];
        dest[0] = gsToUpperMap[c1];
        dest[1] = gsToUpperMap[c2];
        dest[2] = gsToUpperMap[c3];
        dest[3] = gsToUpperMap[c4];
        dest += 4;
    }

    switch (leftover) {
        case 3: *dest++ = gsToUpperMap[s[i++]];
        case 2: *dest++ = gsToUpperMap[s[i++]];
        case 1: *dest++ = gsToUpperMap[s[i]];
        case 0: *dest = '\0';
    }
}

/** \brief toupper Versions 5 -- hsieh alternate
 * Based code from Paul Hsieh
 * http://www.azillionmonkeys.com/qed/asmexample.html
 *
 * This was his "improved" version, but it appears to either run just
 * as fast, or a bit slower than his original version
 */
void toupper_copy5(char* dest, const char* str, int len)
{
    int i;
    uint32_t eax,ebx,ecx,edx;
    const uint8_t* ustr = (const uint8_t*) str;
    const int leftover = len % 4;
    const int imax = len / 4;
    const uint32_t* s = (const uint32_t*) str;
    uint32_t* d = (uint32_t*) dest;
    for (i = 0; i != imax; ++i) {
        eax = s[i];
        ebx = 0x80808080ul | eax;
        ecx = ebx - 0x61616161ul;
        edx = ~(ebx - 0x7b7b7b7bul);
        ebx = (ecx & edx) & (~eax & 0x80808080ul);
        *d++ = eax - (ebx >> 2);
    }

    i = imax*4;
    dest = (char*) d;
    switch (leftover) {
        case 3: *dest++ = gsToUpperMap[ustr[i++]];
        case 2: *dest++ = gsToUpperMap[ustr[i++]];
        case 1: *dest++ = gsToUpperMap[ustr[i]];
        case 0: *dest = '\0';
    }
}

/** \brief ToUpper Version 6 -- Hsieh original, ASM style
 * Based code from Paul Hsieh
 * http://www.azillionmonkeys.com/qed/asmexample.html
 *
 * This is almost a direct port of the original ASM code, on some
 * platforms/compilers it does run faster then the "de-asm'ed" version
 * used in the modp library.
 *
 */
void toupper_copy6(char* dest, const char* str, int len)
{
    int i=0;
    uint32_t eax,ebx,ecx,edx;
    const uint8_t* ustr = (const uint8_t*) str;
    const int leftover = len % 4;
    const int imax = len / 4;
    const uint32_t* s = (const uint32_t*) str;
    uint32_t* d = (uint32_t*) dest;
    for (i = 0; i != imax; ++i) {
#if 1
        /*
         * as close to original asm code as possible
         */
        eax = s[i];
        ebx = 0x7f7f7f7f;
        edx = 0x7f7f7f7f;
        ebx = ebx & eax;
        ebx = ebx + 0x05050505;
        ecx = eax;
        ecx = ~ecx;
        ebx = ebx & edx;
        ebx = ebx + 0x1a1a1a1a;
        ebx = ebx & ecx;
        ebx = ebx >> 2;
        ebx = ebx & 0x20202020;
        eax = eax - ebx;
        *d++ = eax;
#else
        /*
         * "de-asm'ed" version, this is what is used in the modp library
         */
        eax = s[i];
        ebx = (0x7f7f7f7ful & eax) + 0x05050505ul;
        ebx = (0x7f7f7f7ful & ebx) + 0x1a1a1a1aul;
        ebx = ((ebx & ~eax) >> 2 ) & 0x20202020ul;
        *d++ = eax - ebx;
#endif
    }

    i = imax*4;
    dest = (char*) d;
    switch (leftover) {
        case 3: *dest++ = gsToUpperMap[ustr[i++]];
        case 2: *dest++ = gsToUpperMap[ustr[i++]];
        case 1: *dest++ = gsToUpperMap[ustr[i]];
        case 0: *dest = '\0';
    }
}

void modp_toupper_copy_a2(char* dest, const char* str, int len)
{
    int i=0;
    uint32_t eax, ebx;
    const uint8_t* ustr = (const uint8_t*) str;
    const int leftover = len % 4;
    const int imax = len / 4;
    const uint32_t* s = (const uint32_t*) str;
    uint32_t* d = (uint32_t*) dest;
    for (i = 0; i != imax; ++i) {
        eax = s[i];

        ebx = (0x7f7f7f7ful & eax) + 0x05050505ul;
        ebx = (0x7f7f7f7ful & ebx) + 0x1a1a1a1aul;
        ebx = ((ebx & ~eax) >> 2 ) & 0x20202020ul;
        *d++ = eax - ebx;
    }

    i = imax*4;
    dest = (char*) d;
    switch (leftover) {
        case 3: *dest++ = gsToUpperMap[ustr[i++]];
        case 2: *dest++ = gsToUpperMap[ustr[i++]];
        case 1: *dest++ = gsToUpperMap[ustr[i]];
        case 0: *dest = '\0';
    }
}

int main()
{
    double last = 0.0;
    size_t i = 0;
    char buf[256];
    char obuf[300];

    for (i = 0; i < 256; ++i) {
        buf[i] = (char)i;
    }

    uint32_t max = 1000000;
    clock_t t0, t1;
    printf("%s", "type\tclib\tdirect\tmap\tpara\thsieh1\thsieh2\tFinal\timprovement\n");

    printf("toupper\t");
    fflush(stdout);

    /**
     ** V1
     **/
    t0 = clock();
    for (i = 0; i < max; ++i) {
        toupper_copy1(obuf, buf, sizeof(buf));
    }
    t1 = clock();
    last = t1 -t0;
    printf("%lu\t", (unsigned long)(t1-t0));
    fflush(stdout);

    /**
     ** V2
     **/
    t0 = clock();
    for (i = 0; i < max; ++i) {
        toupper_copy2(obuf, buf, sizeof(buf));
    }
    t1 = clock();
    printf("%lu\t", (unsigned long)(t1-t0));
    fflush(stdout);

    /**
     ** V3
     **/
    t0 = clock();
    for (i = 0; i < max; ++i) {
        toupper_copy3(obuf, buf, sizeof(buf));
    }
    t1 = clock();
    printf("%lu\t", (unsigned long)(t1-t0));
    fflush(stdout);

    /**
     ** V4 -- Parallel Table Lookup
     **/
    t0 = clock();
    for (i = 0; i < max; ++i) {
        toupper_copy4(obuf, buf, sizeof(buf));
    }
    t1 = clock();
    printf("%lu\t", (unsigned long)(t1-t0));
    fflush(stdout);


    /**
     ** V5 -- Hsieh Alternate
     **/
    t0 = clock();
    for (i = 0; i < max; ++i) {
        toupper_copy5(obuf, buf, sizeof(buf));
    }
    t1 = clock();
    printf("%lu\t", (unsigned long)(t1-t0));
    fflush(stdout);

    /**
     ** HSEIH -- asm style
     **/
    t0 = clock();
    for (i = 0; i < max; ++i) {
        toupper_copy6(obuf, buf, sizeof(buf));
    }
    t1 = clock();
    printf("%lu\t", (unsigned long)(t1-t0));
    fflush(stdout);

    /**
     ** MODP FINAL
     **/
    t0 = clock();
    for (i = 0; i < max; ++i) {
        modp_toupper_copy(obuf, buf, sizeof(buf));
    }
    t1 = clock();

    printf("%lu\t", (unsigned long)(t1-t0));
    fflush(stdout);

    printf("%.1fx\n", last/(t1-t0));
    fflush(stdout);

    return 0;
}


syntax highlighted by Code2HTML, v. 0.9.1