/*
 * Copyright (c) 1994 David I. Bell
 * Permission is granted to use, distribute, or modify this source,
 * provided that this copyright notice remains intact.
 *
 * Faster than usual multiplying and squaring routines.
 * The algorithm used is the reasonably simple one from Knuth, volume 2,
 * section 4.3.3.  These recursive routines are of speed O(N^1.585)
 * instead of O(N^2).  The usual multiplication and (almost usual) squaring
 * algorithms are used for small numbers.  On a 386 with its compiler, the
 * two algorithms are equal in speed at about 100 decimal digits.
 */

#include <tcl.h>
#include "zmath.h"


LEN _mul2_ = MUL_ALG2;		/* size of number to use multiply algorithm 2 */
LEN _sq2_ = SQ_ALG2;		/* size of number to use square algorithm 2 */


static HALF *tempbuf;		/* temporary buffer for multiply and square */

static LEN domul MATH_PROTO((HALF *v1, LEN size1, HALF *v2, LEN size2, HALF *ans));
static LEN dosquare MATH_PROTO((HALF *vp, LEN size, HALF *ans));


/*
 * Multiply two numbers using the following formula recursively:
 *	(A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
 * where S is a power of 2^16, and so multiplies by it are shifts, and
 * A,B,C,D are the left and right halfs of the numbers to be multiplied.
 */
void
zmul(z1, z2, res)
	ZVALUE z1, z2;		/* numbers to multiply */
	ZVALUE *res;		/* result of multiplication */
{
	LEN len;		/* size of array */

	if (ziszero(z1) || ziszero(z2)) {
		*res = _zero_;
		return;
	}
	if (zisunit(z1)) {
		zcopy(z2, res);
		res->sign = (z1.sign != z2.sign);
		return;
	}
	if (zisunit(z2)) {
		zcopy(z1, res);
		res->sign = (z1.sign != z2.sign);
		return;
	}

	/*
	 * Allocate a temporary buffer for the recursion levels to use.
	 * An array needs to be allocated large enough for all of the
	 * temporary results to fit in.  This size is about twice the size
	 * of the largest original number, since each recursion level uses
	 * the size of its given number, and whose size is 1/2 the size of
	 * the previous level.  The sum of the infinite series is 2.
	 * Add some extra words because of rounding when dividing by 2
	 * and also because of the extra word that each multiply needs.
	 */
	len = z1.len;
	if (len < z2.len)
		len = z2.len;
	len = 2 * len + 64;
	tempbuf = zalloctemp(len);

	res->sign = (z1.sign != z2.sign);
	res->v = alloc(z1.len + z2.len + 1);
	res->len = domul(z1.v, z1.len, z2.v, z2.len, res->v);
}


/*
 * Recursive routine to multiply two numbers by splitting them up into
 * two numbers of half the size, and using the results of multiplying the
 * subpieces.  The result is placed in the indicated array, which must be
 * large enough for the result plus one extra word (size1 + size2 + 1).
 * Returns the actual size of the result with leading zeroes stripped.
 * This also uses a temporary array which must be twice as large as
 * one more than the size of the number at the top level recursive call.
 */
static LEN
domul(v1, size1, v2, size2, ans)
	HALF *v1;		/* first number */
	LEN size1;		/* size of first number */
	HALF *v2;		/* second number */
	LEN size2;		/* size of second number */
	HALF *ans;		/* location for result */
{
	LEN shift;		/* amount numbers are shifted by */
	LEN sizeA;		/* size of left half of first number */
	LEN sizeB;		/* size of right half of first number */
	LEN sizeC;		/* size of left half of second number */
	LEN sizeD;		/* size of right half of second number */
	LEN sizeAB;		/* size of subtraction of A and B */
	LEN sizeDC;		/* size of subtraction of D and C */
	LEN sizeABDC;		/* size of product of above two results */
	LEN subsize;		/* size of difference of halfs */
	LEN copysize;		/* size of number left to copy */
	LEN sizetotal;		/* total size of product */
	LEN len;		/* temporary length */
	HALF *baseA;		/* base of left half of first number */
	HALF *baseB;		/* base of right half of first number */
	HALF *baseC;		/* base of left half of second number */
	HALF *baseD;		/* base of right half of second number */
	HALF *baseAB;		/* base of result of subtraction of A and B */
	HALF *baseDC;		/* base of result of subtraction of D and C */
	HALF *baseABDC;		/* base of product of above two results */
	HALF *baseAC;		/* base of product of A and C */
	HALF *baseBD;		/* base of product of B and D */
	FULL carry;		/* carry digit for small multiplications */
	FULL carryACBD;		/* carry from addition of A*C and B*D */
	FULL digit;		/* single digit multiplying by */
	HALF *temp;		/* base for temporary calculations */
	BOOL neg;		/* whether imtermediate term is negative */
	register HALF *hd, *h1=NULL, *h2=NULL;	/* for inner loops */
	SIUNION sival;		/* for addition of digits */

	/*
	 * Trim the numbers of leading zeroes and initialize the
	 * estimated size of the result.
	 */
	hd = &v1[size1 - 1];
	while ((*hd == 0) && (size1 > 1)) {
		hd--;
		size1--;
	}
	hd = &v2[size2 - 1];
	while ((*hd == 0) && (size2 > 1)) {
		hd--;
		size2--;
	}
	sizetotal = size1 + size2;

	/*
	 * First check for zero answer.
	 */
	if (((size1 == 1) && (*v1 == 0)) || ((size2 == 1) && (*v2 == 0))) {
		*ans = 0;
		return 1;
	}

	/*
	 * Exchange the two numbers if necessary to make the number of
	 * digits of the first number be greater than or equal to the
	 * second number.
	 */
	if (size1 < size2) {
		len = size1; size1 = size2; size2 = len;
		hd = v1; v1 = v2; v2 = hd;
	}

	/*
	 * If the smaller number has only a few digits, then calculate
	 * the result in the normal manner in order to avoid the overhead
	 * of the recursion for small numbers.  The number of digits where
	 * the algorithm changes is settable from 2 to maxint.
	 */
	if (size2 < _mul2_) {
		/*
		 * First clear the top part of the result, and then multiply
		 * by the lowest digit to get the first partial sum.  Later
		 * products will then add into this result.
		 */
		hd = &ans[size1];
		len = size2;
		while (len--)
			*hd++ = 0;

		digit = *v2++;
		h1 = v1;
		hd = ans;
		carry = 0;
		len = size1;
		while (len >= 4) {	/* expand the loop some */
			len -= 4;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (len--) {
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		*hd = (HALF)carry;

		/*
		 * Now multiply by the remaining digits of the second number,
		 * adding each product into the final result.
		 */
		h2 = ans;
		while (--size2 > 0) {
			digit = *v2++;
			h1 = v1;
			hd = ++h2;
			if (digit == 0)
				continue;
			carry = 0;
			len = size1;
			while (len >= 4) {	/* expand the loop some */
				len -= 4;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
			}
			while (len--) {
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
			}
			while (carry) {
				sival.ivalue = ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
			}
		}

		/*
		 * Now return the true size of the number.
		 */
		len = sizetotal;
		hd = &ans[len - 1];
		while ((*hd == 0) && (len > 1)) {
			hd--;
			len--;
		}
		return len;
	}

	/*
	 * Need to multiply by a large number.
	 * Allocate temporary space for calculations, and calculate the
	 * value for the shift.  The shift value is 1/2 the size of the
	 * larger (first) number (rounded up).  The amount of temporary
	 * space needed is twice the size of the shift, plus one more word
	 * for the multiply to use.
	 */
	shift = (size1 + 1) / 2;
	temp = tempbuf;
	tempbuf += (2 * shift) + 1;

	/*
	 * Determine the sizes and locations of all the numbers.
	 * The value of sizeC can be negative, and this is checked later.
	 * The value of sizeD is limited by the full size of the number.
	 */
	baseA = v1 + shift;
	baseB = v1;
	/* 
	 * XXX - Saber-C Version 3.1 says:
	 *
	 *    W#26, Storing a bad pointer into auto variable dmul`baseC.
	 *
	 * This warning is issued during the regression test #026
	 * (read cryrand).
	 *
	 * Saver-C claims that v2+shift is past the end of allocated
	 * memory for v2.  When this warning is first issued, shift
	 * has the value 51.
	 *
	 * The call stack is:
	 *
	 *   domul(0x165ca0, 101, 0x160998, 40, 0x16d0a8) at "zmul.c":315 
	 *   zmul(0x2ddf88, 0x2ddf8c, 0x2ddc98) at "zmul.c":73 
	 *   qsqrt(0x38defc, 0x38ea94) at "qfunc.c":248 
	 *   qln(0x38defc, 0x38de70) at "qtrans.c":589 
	 *   qpower(0x38eacc, 0x38a398, 0x38a018) at "qtrans.c":657 
	 *   powervalue(0x1740f8,0x174118,0x195234,0x19523c) at "value.c":1009 
	 *   f_power(2, 0x533278) at "func.c":1188 
	 *   builtinfunc(117, 2, 0x174118) at "func.c":354 
	 *   o_call(0x5328d8, 117, 2) at "opcodes.c":2094 
	 *   calculate(0x5328d8, 1) at "opcodes.c":288 
	 *   o_usercall(0x5328d8, 54, 1) at "opcodes.c":2082 
	 *   calculate(0x48ffa8, 4) at "opcodes.c":288 
	 *   o_usercall(0x48ffa8, 53, 1) at "opcodes.c":2082 
	 *   calculate(0x1652a0, 1) at "opcodes.c":288 
	 *   o_usercall(0x1652a0, 57, 1) at "opcodes.c":2082 
	 *   calculate(0x16cca8, 0) at "opcodes.c":288 
	 *   evaluate(0) at "codegen.c":167 
	 *   getcommands(0) at "codegen.c":106 
	 *   getcommands(0) at "codegen.c":76 
	 *   getcommands(1) at "codegen.c":76 
	 *   main(-1, 0x181f8c) at "calc.c":155 
	 *
	 * Purify does not complain about this code.  Who is right?
	 */
	baseC = v2 + shift;
	baseD = v2;
	baseAB = ans;
	baseDC = ans + shift;
	baseAC = ans + shift * 2;
	baseBD = ans;

	sizeA = size1 - shift;
	sizeC = size2 - shift;

	sizeB = shift;
	hd = &baseB[shift - 1];
	while ((*hd == 0) && (sizeB > 1)) {
		hd--;
		sizeB--;
	}

	sizeD = shift;
	if (sizeD > size2)
		sizeD = size2;
	hd = &baseD[sizeD - 1];
	while ((*hd == 0) && (sizeD > 1)) {
		hd--;
		sizeD--;
	}

	/*
	 * If the smaller number has a high half of zero, then calculate
	 * the result by breaking up the first number into two numbers
	 * and combining the results using the obvious formula:
	 *	(A*S+B) * D = (A*D)*S + B*D
	 */
	if (sizeC <= 0) {
		len = domul(baseB, sizeB, baseD, sizeD, ans);
		hd = &ans[len];
		len = sizetotal - len;
		while (len--)
			*hd++ = 0;

		/*
		 * Add the second number into the first number, shifted
		 * over at the correct position.
		 */
		len = domul(baseA, sizeA, baseD, sizeD, temp);
		h1 = temp;
		hd = ans + shift;
		carry = 0;
		while (len--) {
			sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}

		/*
		 * Determine the final size of the number and return it.
		 */
		len = sizetotal;
		hd = &ans[len - 1];
		while ((*hd == 0) && (len > 1)) {
			hd--;
			len--;
		}
		tempbuf = temp;
		return len;
	}

	/*
	 * Now we know that the high halfs of the numbers are nonzero,
	 * so we can use the complete formula.
	 *	(A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D.
	 * The steps are done in the following order:
	 *	A-B
	 *	D-C
	 *	(A-B)*(D-C)
	 *	S^2*A*C + B*D
	 *	(S^2+S)*A*C + (S+1)*B*D				(*)
	 *	(S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
	 *
	 * Note: step (*) above can produce a result which is larger than
	 * the final product will be, and this is where the extra word
	 * needed in the product comes from.  After the final subtraction is
	 * done, the result fits in the expected size.  Using the extra word
	 * is easier than suppressing the carries and borrows everywhere.
	 *
	 * Begin by forming the product (A-B)*(D-C) into a temporary
	 * location that we save until the final step.  Do each subtraction
	 * at positions 0 and S.  Be very careful about the relative sizes
	 * of the numbers since this result can be negative.  For the first
	 * step calculate the absolute difference of A and B into a temporary
	 * location at position 0 of the result.  Negate the sign if A is
	 * smaller than B.
	 */
	neg = FALSE;
	if (sizeA == sizeB) {
		len = sizeA;
		h1 = &baseA[len - 1];
		h2 = &baseB[len - 1];
		while ((len > 1) && (*h1 == *h2)) {
			len--;
			h1--;
			h2--;
		}
	}
	if ((sizeA > sizeB) || ((sizeA == sizeB) && h1 && h2 && (*h1 > *h2))) {
		h1 = baseA;
		h2 = baseB;
		sizeAB = sizeA;
		subsize = sizeB;
	} else {
		neg = !neg;
		h1 = baseB;
		h2 = baseA;
		sizeAB = sizeB;
		subsize = sizeA;
	}
	copysize = sizeAB - subsize;

	hd = baseAB;
	carry = 0;
	while (subsize--) {
		sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}

	hd = &baseAB[sizeAB - 1];
	while ((*hd == 0) && (sizeAB > 1)) {
		hd--;
		sizeAB--;
	}

	/*
	 * This completes the calculation of abs(A-B).  For the next step
	 * calculate the absolute difference of D and C into a temporary
	 * location at position S of the result.  Negate the sign if C is
	 * larger than D.
	 */
	if (sizeC == sizeD) {
		len = sizeC;
		h1 = &baseC[len - 1];
		h2 = &baseD[len - 1];
		while ((len > 1) && (*h1 == *h2)) {
			len--;
			h1--;
			h2--;
		}
	}
	if ((sizeC > sizeD) || ((sizeC == sizeD) && (*h1 > *h2)))
	{
		neg = !neg;
		h1 = baseC;
		h2 = baseD;
		sizeDC = sizeC;
		subsize = sizeD;
	} else {
		h1 = baseD;
		h2 = baseC;
		sizeDC = sizeD;
		subsize = sizeC;
	}
	copysize = sizeDC - subsize;

	hd = baseDC;
	carry = 0;
	while (subsize--) {
		sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}
	hd = &baseDC[sizeDC - 1];
	while ((*hd == 0) && (sizeDC > 1)) {
		hd--;
		sizeDC--;
	}

	/*
	 * This completes the calculation of abs(D-C).  Now multiply
	 * together abs(A-B) and abs(D-C) into a temporary location,
	 * which is preserved until the final steps.
	 */
	baseABDC = temp;
	sizeABDC = domul(baseAB, sizeAB, baseDC, sizeDC, baseABDC);

	/*
	 * Now calculate B*D and A*C into one of their two final locations.
	 * Make sure the high order digits of the products are zeroed since
	 * this initializes the final result.  Be careful about this zeroing
	 * since the size of the high order words might be smaller than
	 * the shift size.  Do B*D first since the multiplies use one more
	 * word than the size of the product.  Also zero the final extra
	 * word in the result for possible carries to use.
	 */
	len = domul(baseB, sizeB, baseD, sizeD, baseBD);
	hd = &baseBD[len];
	len = shift * 2 - len;
	while (len--)
		*hd++ = 0;

	len = domul(baseA, sizeA, baseC, sizeC, baseAC);
	hd = &baseAC[len];
	len = sizetotal - shift * 2 - len + 1;
	while (len--)
		*hd++ = 0;

	/*
	 * Now add in A*C and B*D into themselves at the other shifted
	 * position that they need.  This addition is tricky in order to
	 * make sure that the two additions cannot interfere with each other.
	 * Therefore we first add in the top half of B*D and the lower half
	 * of A*C.  The sources and destinations of these two additions
	 * overlap, and so the same answer results from the two additions,
	 * thus only two pointers suffice for both additions.  Keep the
	 * final carry from these additions for later use since we cannot
	 * afford to change the top half of A*C yet.
	 */
	h1 = baseBD + shift;
	h2 = baseAC;
	carryACBD = 0;
	len = shift;
	while (len--) {
		sival.ivalue = ((FULL) *h1) + ((FULL) *h2) + carryACBD;
		*h1++ = sival.silow;
		*h2++ = sival.silow;
		carryACBD = sival.sihigh;
	}

	/*
	 * Now add in the bottom half of B*D and the top half of A*C.
	 * These additions are straightforward, except that A*C should
	 * be done first because of possible carries from B*D, and the
	 * top half of A*C might not exist.  Add in one of the carries
	 * from the previous addition while we are at it.
	 */
	h1 = baseAC + shift;
	hd = baseAC;
	carry = carryACBD;
	len = sizetotal - 3 * shift;
	while (len--) {
		sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (carry) {
		sival.ivalue = ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}

	h1 = baseBD;
	hd = baseBD + shift;
	carry = 0;
	len = shift;
	while (len--) {
		sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (carry) {
		sival.ivalue = ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}

	/*
	 * Now finally add in the other delayed carry from the
	 * above addition.
	 */
	hd = baseAC + shift;
	while (carryACBD) {
		sival.ivalue = ((FULL) *hd) + carryACBD;
		*hd++ = sival.silow;
		carryACBD = sival.sihigh;
	}

	/*
	 * Now finally add or subtract (A-B)*(D-C) into the final result at
	 * the correct position (S), according to whether it is positive or
	 * negative.  When subtracting, the answer cannot go negative.
	 */
	h1 = baseABDC;
	hd = ans + shift;
	carry = 0;
	len = sizeABDC;
	if (neg) {
		while (len--) {
			sival.ivalue = BASE1 - ((FULL) *hd) +
				((FULL) *h1++) + carry;
			*hd++ = BASE1 - sival.silow;
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = BASE1 - ((FULL) *hd) + carry;
			*hd++ = BASE1 - sival.silow;
			carry = sival.sihigh;
		}
	} else {
		while (len--) {
			sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
	}

	/*
	 * Finally determine the size of the final result and return that.
	 */
	len = sizetotal;
	hd = &ans[len - 1];
	while ((*hd == 0) && (len > 1)) {
		hd--;
		len--;
	}
	tempbuf = temp;
	return len;
}


/*
 * Square a number by using the following formula recursively:
 *	(A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2
 * where S is a power of 2^16, and so multiplies by it are shifts,
 * and A and B are the left and right halfs of the number to square.
 */
void
zsquare(z, res)
	ZVALUE z, *res;
{
	LEN len;

	if (ziszero(z)) {
		*res = _zero_;
		return;
	}
	if (zisunit(z)) {
		*res = _one_;
		return;
	}

	/*
	 * Allocate a temporary array if necessary for the recursion to use.
	 * The array needs to be allocated large enough for all of the
	 * temporary results to fit in.  This size is about 3 times the
	 * size of the original number, since each recursion level uses 3/2
	 * of the size of its given number, and whose size is 1/2 the size
	 * of the previous level.  The sum of the infinite series is 3.
	 * Allocate some extra words for rounding up the sizes.
	 */
	len = 3 * z.len + 32;
	tempbuf = zalloctemp(len);

	res->sign = 0;
	res->v = alloc((z.len+1) * 2);
	res->len = dosquare(z.v, z.len, res->v);
}


/*
 * Recursive routine to square a number by splitting it up into two numbers
 * of half the size, and using the results of squaring the subpieces.
 * The result is placed in the indicated array, which must be large
 * enough for the result (size * 2).  Returns the size of the result.
 * This uses a temporary array which must be 3 times as large as the
 * size of the number at the top level recursive call.
 */
static LEN
dosquare(vp, size, ans)
	HALF *vp;		/* value to be squared */
	LEN size;		/* length of value to square */
	HALF *ans;		/* location for result */
{
	LEN shift;		/* amount numbers are shifted by */
	LEN sizeA;		/* size of left half of number to square */
	LEN sizeB;		/* size of right half of number to square */
	LEN sizeAA;		/* size of square of left half */
	LEN sizeBB;		/* size of square of right half */
	LEN sizeAABB;		/* size of sum of squares of A and B */
	LEN sizeAB;		/* size of difference of A and B */
	LEN sizeABAB;		/* size of square of difference of A and B */
	LEN subsize;		/* size of difference of halfs */
	LEN copysize;		/* size of number left to copy */
	LEN sumsize;		/* size of sum */
	LEN sizetotal;		/* total size of square */
	LEN len;		/* temporary length */
	LEN len1;		/* another temporary length */
	FULL carry;		/* carry digit for small multiplications */
	FULL digit;		/* single digit multiplying by */
	HALF *temp;		/* base for temporary calculations */
	HALF *baseA;		/* base of left half of number */
	HALF *baseB;		/* base of right half of number */
	HALF *baseAA;		/* base of square of left half of number */
	HALF *baseBB;		/* base of square of right half of number */
	HALF *baseAABB;		/* base of sum of squares of A and B */
	HALF *baseAB;		/* base of difference of A and B */
	HALF *baseABAB;		/* base of square of difference of A and B */
	register HALF *hd, *h1, *h2, *h3;	/* for inner loops */
	SIUNION sival;		/* for addition of digits */

	/*
	 * First trim the number of leading zeroes.
	 */
	hd = &vp[size - 1];
	while ((*hd == 0) && (size > 1)) {
		size--;
		hd--;
	}
	sizetotal = size + size;

	/*
	 * If the number has only a small number of digits, then use the
	 * (almost) normal multiplication method.  Multiply each halfword
	 * only by those halfwards further on in the number.  Missed terms
	 * will then be the same pairs of products repeated, and the squares
	 * of each halfword.  The first case is handled by doubling the
	 * result.  The second case is handled explicitly.  The number of
	 * digits where the algorithm changes is settable from 2 to maxint.
	 */
	if (size < _sq2_) {
		hd = ans;
		len = sizetotal;
		while (len--)
			*hd++ = 0;

		h2 = vp;
		hd = ans + 1;
		for (len = size; len--; hd += 2) {
			digit = (FULL) *h2++;
			if (digit == 0)
				continue;
			h3 = h2;
			h1 = hd;
			carry = 0;
			len1 = len;
			while (len1 >= 4) {	/* expand the loop some */
				len1 -= 4;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + carry;
				*h1++ = sival.silow;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + ((FULL) sival.sihigh);
				*h1++ = sival.silow;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + ((FULL) sival.sihigh);
				*h1++ = sival.silow;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + ((FULL) sival.sihigh);
				*h1++ = sival.silow;
				carry = sival.sihigh;
			}
			while (len1--) {
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + carry;
				*h1++ = sival.silow;
				carry = sival.sihigh;
			}
			while (carry) {
				sival.ivalue = ((FULL) *h1) + carry;
				*h1++ = sival.silow;
				carry = sival.sihigh;
			}
		}

		/*
		 * Now double the result.
		 * There is no final carry to worry about because we
		 * handle all digits of the result which must fit.
		 */
		carry = 0;
		hd = ans;
		len = sizetotal;
		while (len--) {
			digit = ((FULL) *hd);
			sival.ivalue = digit + digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}

		/*
		 * Now add in the squares of each halfword.
		 */
		carry = 0;
		hd = ans;
		h3 = vp;
		len = size;
		while (len--) {
			digit = ((FULL) *h3++);
			sival.ivalue = digit * digit + ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}

		/*
		 * Finally return the size of the result.
		 */
		len = sizetotal;
		hd = &ans[len - 1];
		while ((*hd == 0) && (len > 1)) {
			len--;
			hd--;
		}
		return len;
	}

	/*
	 * The number to be squared is large.
	 * Allocate temporary space and determine the sizes and
	 * positions of the values to be calculated.
	 */
	temp = tempbuf;
	tempbuf += (3 * (size + 1) / 2);

	sizeA = size / 2;
	sizeB = size - sizeA;
	shift = sizeB;
	baseA = vp + sizeB;
	baseB = vp;
	baseAA = &ans[shift * 2];
	baseBB = ans;
	baseAABB = temp;
	baseAB = temp;
	baseABAB = &temp[shift];

	/*
	 * Trim the second number of leading zeroes.
	 */
	hd = &baseB[sizeB - 1];
	while ((*hd == 0) && (sizeB > 1)) {
		sizeB--;
		hd--;
	}

	/*
	 * Now to proceed to calculate the result using the formula.
	 *	(A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
	 * The steps are done in the following order:
	 *	S^2*A^2 + B^2
	 *	A^2 + B^2
	 *	(S^2+S)*A^2 + (S+1)*B^2
	 *	(A-B)^2
	 *	(S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
	 *
	 * Begin by forming the squares of two the halfs concatenated
	 * together in the final result location.  Make sure that the
	 * highest words of the results are zero.
	 */
	sizeBB = dosquare(baseB, sizeB, baseBB);
	hd = &baseBB[sizeBB];
	len = shift * 2 - sizeBB;
	while (len--)
		*hd++ = 0;

	sizeAA = dosquare(baseA, sizeA, baseAA);
	hd = &baseAA[sizeAA];
	len = sizetotal - shift * 2 - sizeAA;
	while (len--)
		*hd++ = 0;

	/*
	 * Sum the two squares into a temporary location.
	 */
	if (sizeAA >= sizeBB) {
		h1 = baseAA;
		h2 = baseBB;
		sizeAABB = sizeAA;
		sumsize = sizeBB;
	} else {
		h1 = baseBB;
		h2 = baseAA;
		sizeAABB = sizeBB;
		sumsize = sizeAA;
	}
	copysize = sizeAABB - sumsize;

	hd = baseAABB;
	carry = 0;
	while (sumsize--) {
		sival.ivalue = ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = ((FULL) *h1++) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	if (carry) {
		*hd = (HALF)carry;
		sizeAABB++;
	}

	/*
	 * Add the sum back into the previously calculated squares
	 * shifted over to the proper location.
	 */
	h1 = baseAABB;
	hd = ans + shift;
	carry = 0;
	len = sizeAABB;
	while (len--) {
		sival.ivalue = ((FULL) *hd) + ((FULL) *h1++) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (carry) {
		/*
		 * XXX - Saber-C Version 3.1 says:
		 *
		 *    W#113, Using allocated data what has not been set.
		 *
		 * This warning is issued during the regression test #622
		 * (algcheck 1).
		 *
		 * Saber-C claims that *hd has been allocated but not set.
		 * When this warning was first issued, hd == 0x19547c
		 * which is 12 bytes off from ans.  The value of carry
		 * was 1, the value of shift was 2, len was -1 and sizeAABB
		 * was 4, sizeBB as 4, sizeAA was 2.  The value stored
		 * in *hd was 0xbfbf.
		 *
		 * The call stack is:
		 *
		 *   dosquare(0x183e88, 3, 0x195470) at "zmul.c":981 
		 *   zsquare(0x1952b0, 0x4c1348) at "zmul.c":687 
		 *   zpowermod(0x5d1d58,0x5d1d5c,0x5d1d60,0x390218) at "zmod.c":679 
		 *   qpowermod(0x38a088, 0x3901a8, 0x38a964) at "qfunc.c":76 
		 *   builtinfunc(113, 3, 0x1740f0) at "func.c":386 
		 *   o_call(0x3b5de0, 113, 3) at "opcodes.c":2094 
		 *   calculate(0x3b5de0, 3) at "opcodes.c":288 
		 *   o_usercall(0x3b5de0, 13, 3) at "opcodes.c":2082 
		 *   calculate(0x47dfa8, 0) at "opcodes.c":288 
		 *   o_usercall(0x47dfa8, 14, 0) at "opcodes.c":2082 
		 *   calculate(0x16cca8, 0) at "opcodes.c":288 
		 *   evaluate(0) at "codegen.c":167 
		 *   getcommands(0) at "codegen.c":106 
		 *   getcommands(1) at "codegen.c":76 
		 *   main(-1, 0x181f8c) at "calc.c":155 
		 *
		 * Purify does not report this.  Who is right?
		 */
		sival.ivalue = ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}

	/*
	 * Calculate the absolute value of the difference of the two halfs
	 * into a temporary location.
	 */
	if (sizeA == sizeB) {
		len = sizeA;
		h1 = &baseA[len - 1];
		h2 = &baseB[len - 1];
		while ((len > 1) && (*h1 == *h2)) {
			len--;
			h1--;
			h2--;
		}
	}
	if ((sizeA > sizeB) || ((sizeA == sizeB) && (*h1 > *h2)))
	{
		h1 = baseA;
		h2 = baseB;
		sizeAB = sizeA;
		subsize = sizeB;
	} else {
		h1 = baseB;
		h2 = baseA;
		sizeAB = sizeB;
		subsize = sizeA;
	}
	copysize = sizeAB - subsize;

	hd = baseAB;
	carry = 0;
	while (subsize--) {
		sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}

	hd = &baseAB[sizeAB - 1];
	while ((*hd == 0) && (sizeAB > 1)) {
		sizeAB--;
		hd--;
	}

	/*
	 * Now square the number into another temporary location,
	 * and subtract that from the final result.
	 */
	sizeABAB = dosquare(baseAB, sizeAB, baseABAB);

	h1 = baseABAB;
	hd = ans + shift;
	carry = 0;
	while (sizeABAB--) {
		sival.ivalue = BASE1 - ((FULL) *hd) + ((FULL) *h1++) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}
	while (carry) {
		sival.ivalue = BASE1 - ((FULL) *hd) + carry;
		*hd++ = BASE1 - sival.silow;
		carry = sival.sihigh;
	}

	/*
	 * Return the size of the result.
	 */
	len = sizetotal;
	hd = &ans[len - 1];
	while ((*hd == 0) && (len > 1)) {
		len--;
		hd--;
	}
	tempbuf = temp;
	return len;
}


/*
 * Return a pointer to a buffer to be used for holding a temporary number.
 * The buffer will be at least as large as the specified number of HALFs,
 * and remains valid until the next call to this routine.  The buffer cannot
 * be freed by the caller.  There is only one temporary buffer, and so as to
 * avoid possible conflicts this is only used by the lowest level routines
 * such as divide, multiply, and square.
 */
HALF *
zalloctemp(len)
	LEN len;		/* required number of HALFs in buffer */
{
	HALF *hp;
	static LEN buflen;	/* current length of temp buffer */
	static HALF *bufptr;	/* pointer to current temp buffer */

	if (len <= buflen)
		return bufptr;

	/*
	 * We need to grow the temporary buffer.
	 * First free any existing buffer, and then allocate the new one.
	 * While we are at it, make the new buffer bigger than necessary
	 * in order to reduce the number of reallocations.
	 */
	len += 100;
	if (buflen) {
		buflen = 0;
		ckfree(bufptr);
	}
	hp = (HALF *) ckalloc(len * sizeof(HALF));
	if (hp == NULL)
		math_error("No memory for temp buffer");
	bufptr = hp;
	buflen = len;
	return hp;
}

/* END CODE */


syntax highlighted by Code2HTML, v. 0.9.1