/*-------------------------------------------------------------------- This source distribution is placed in the public domain by its author, Jason Papadopoulos. You may use it for any purpose, free of charge, without having to notify anyone. I disclaim any responsibility for any errors. Optionally, please be nice and tell me if you find this source to be useful. Again optionally, if you add to the functionality present here please consider making those additions public too, so that others may benefit from your work. --jasonp@boo.net 6/3/07 --------------------------------------------------------------------*/ #include #include "mpqs.h" int sort_ascending(const void *x, const void *y); /*--------------------------------------------------------------------*/ void poly_init(sieve_conf_t *conf, uint32 sieve_size) { uint32 i, j; uint32 start_bits; uint32 num_factors, rem; uint32 num_derived_poly; mp_t t0, t1; msieve_obj *obj = conf->obj; /* compute the optimal A value for MPQS polynomials. This is sqrt(2*n)/sieve_size, and the closer a generated A value gets to this number the smaller the average sieve value will be */ mp_add(conf->n, conf->n, &t0); mp_isqrt(&t0, &t1); mp_add_1(&t1, 1, &t1); mp_divrem_1(&t1, sieve_size, &t1); mp_copy(&t1, &conf->target_a); conf->a_bits = mp_bits(&t1); /* Determine the allowable range for factors of the 'a' coefficient in MPQS polynomials. We simplify the factor selection code later by marking the offsets where factor base primes increase in size by one bit. factor_bounds[i] gives the index in the factor base of the first i-bit prime. If i-bit primes really are present in the factor base, then factor_bounds[i] and factor_bounds[i+1] must both be nonzero. Otherwise the factor base *ends* in an i-bit prime. Note that any factor selected for use in 'a' values is not sieved. However, for primes above conf->sieve_large_fb_start the sieve code uses a hashtable that doesn't know this. Hence the pool of primes that SIQS can use is limited to factor base offsets less than sieve_large_fb_start. This is desirable anyway, since these primes are small and we get to choose more of them, reusing 'a' values for longer */ i = 10; memset(conf->factor_bounds, 0, sizeof(conf->factor_bounds)); memset(conf->factor_bits, 0, sizeof(conf->factor_bits)); memset(conf->poly_factors, 0, sizeof(conf->poly_factors)); memset(conf->poly_tmp_b, 0, sizeof(conf->poly_tmp_b)); start_bits = conf->factor_base[i].logprime; conf->factor_bounds[start_bits] = i; while (i < conf->fb_size) { if (i >= conf->sieve_large_fb_start || conf->factor_base[i].logprime > 15) break; if (conf->factor_base[i].logprime > start_bits) { start_bits = conf->factor_base[i].logprime; conf->factor_bounds[start_bits] = i; } i++; } conf->factor_bounds[start_bits+1] = i; /* Determine the number of factors that each 'a' value will have, and also the number of bits in each factor. Factors can have 7-14 bits, favoring a bit size that produces many polynomials but not *too* many */ if (conf->a_bits > 210) start_bits = 15; else if (conf->a_bits > 210) start_bits = 14; else if (conf->a_bits > 190) start_bits = 13; else if (conf->a_bits > 180) start_bits = 12; else start_bits = 11; for (i = start_bits, num_factors = rem = 0; i >= 7; i--) { num_factors = conf->a_bits / i; rem = conf->a_bits % i; /* stop searching for factor sizes if the factor base can handle the current set */ if (conf->factor_bounds[i] == 0 || num_factors == 1) continue; if (rem == 0) { /* the number of bits in the combined list is exactly the right size */ if (num_factors > 2 && conf->factor_bounds[i+1] > 0) break; } else if (rem <= num_factors) { /* the number of bits in the combined list is not large enough, but is close enough so that some primes can be one bit larger to compensate. This will work if the pool of available primes has candidates in the i and i+1 bin */ if (num_factors > 2 && conf->factor_bounds[i+1] > 0 && conf->factor_bounds[i+2] > 0) break; } else if ((i - rem) <= num_factors) { /* the number of bits in the combined list is too small by more than the number of primes. Add an extra prime to the list and make several primes one bit smaller. This will work if the pool of available primes has candidates in the i and i-1 bin */ if (conf->factor_bounds[i+1] > 0 && conf->factor_bounds[i-1] > 0) break; } } if (i < 7 || num_factors < 2 || num_factors > MAX_POLY_FACTORS) { logprintf(obj, "fatal error: poly selection failed\n"); exit(-1); } /* explicitly list out the number of bits each prime in the list will have. Some primes may be one bit too large or too small */ for (j = 0; j < num_factors; j++) conf->factor_bits[j] = i; if (rem <= num_factors) { for (j = 0; j < rem; j++) conf->factor_bits[j]++; } else { conf->factor_bits[j] = i; num_factors++; for (j = 0; j < (i - rem); j++) conf->factor_bits[j]--; } conf->num_poly_factors = num_factors; /* verify the collection adds up to what is expected */ for (i = j = 0; j < num_factors; j++) { i += conf->factor_bits[j]; } if (i != conf->a_bits) { logprintf(obj, "failure assigning bits of poly factors\n"); exit(-1); } /* for big factorizations, where there are a lot of primes, it becomes really important to generate A values that are as close as possible to the 'optimal' A value. In this case, we make a few of the primes selected deliberately too small, so that the poly generation code has room to pick just the right factors */ if (num_factors >= 8 && num_factors < 15) { if (conf->factor_bits[0] > conf->factor_bits[num_factors - 1]) { switch (num_factors) { default: conf->factor_bits[3]--; conf->factor_bits[2]--; case 9: conf->factor_bits[1]--; } conf->factor_bits[0]--; } else { switch (num_factors) { default: conf->factor_bits[num_factors-4]--; conf->factor_bits[num_factors-3]--; case 9: conf->factor_bits[num_factors-2]--; } conf->factor_bits[num_factors-1]--; } } /* allocate scratch structures */ num_derived_poly = 1 << (num_factors - 1); conf->root_correction = (uint32 *)malloc(num_derived_poly * sizeof(uint32)); conf->next_poly_action = (uint8 *)malloc(num_derived_poly * sizeof(uint8)); conf->curr_b = (mp_t *)malloc(num_derived_poly * sizeof(mp_t)); conf->poly_b_small[0] = (uint32 *)malloc(conf->sieve_large_fb_start * num_factors * sizeof(uint32)); for (i = 1; i < num_factors; i++) { conf->poly_b_small[i] = conf->poly_b_small[i-1] + conf->sieve_large_fb_start; } conf->poly_b_array = NULL; if (conf->fb_size > conf->sieve_large_fb_start) { conf->poly_b_array = (uint32 *)malloc( num_factors * sizeof(uint32) * (conf->fb_size - conf->sieve_large_fb_start)); } logprintf(obj, "polynomial 'A' values have %u factors\n", num_factors); } /*--------------------------------------------------------------------*/ void poly_free(sieve_conf_t *conf) { free(conf->root_correction); free(conf->next_poly_action); free(conf->poly_b_array); free(conf->poly_b_small[0]); free(conf->curr_b); } /*--------------------------------------------------------------------*/ void build_derived_poly(sieve_conf_t *conf) { /* Given a config struct with conf->curr_a and conf->poly_factors already filled in, calculate all of the 'b' values corresponding to this 'a' and also the two peices of scratch information associated with each 'b'. */ uint32 i; mp_t tmp; mp_t *a = &conf->curr_a; mp_t *b = conf->curr_b; mp_t *poly_tmp_b = conf->poly_tmp_b; uint32 num_factors = conf->num_poly_factors; uint32 *poly_factors = conf->poly_factors; fb_t *factor_base = conf->factor_base; uint32 correction; uint32 num_derived_poly; uint32 *modsqrt_array = conf->modsqrt_array; /* compute the intermediate values that will be used to form the 'b' coefficient of this and the next 1<<(num_factors-1) polynomials */ for (i = 0; i < num_factors; i++) { mp_t adivp; uint32 rem; fb_t *fbptr = factor_base + poly_factors[i]; uint32 prime = fbptr->prime; uint32 modsqrt = modsqrt_array[poly_factors[i]]; mp_divrem_1(a, prime, &adivp); rem = mp_mod_1(&adivp, prime); rem = mp_modinv_1(rem, prime); rem = mp_modmul_1(rem, modsqrt, prime); if (rem > prime / 2) rem = prime - rem; mp_mul_1(&adivp, rem, poly_tmp_b + i); } /* The first 'b' coefficient is the sum of all the temporary b values. Also double each temporary value. */ mp_clear(b); for (i = 0; i < num_factors; i++) { mp_add(b, poly_tmp_b + i, b); mp_add(poly_tmp_b + i, poly_tmp_b + i, poly_tmp_b + i); } /* There's a lot about self-initializing MPQS that is not mentioned in Contini's paper. First, ordinary MPQS reduces every 'b' value mod a, but we can't do that here. That's because the precomputations below need to work with un- reduced numbers, since (b mod a) mod p != b mod p in general. But this also means that b can exceed a, and can be negative. My homebrew multiple precision library cannot deal with negative numbers, so the rest of the code wants b to always be positive. We work around the problem by cheating. The whole idea here is to find a and b such that we can compute the values x for which (a*x+b)^2-n is divisible by small primes. If b is between a and 2*a, and we know the corresponding set of x values (which the precomputing below will find), then we can implicitly work with (a*(x+1) + (b-a))^2 - n. This forms a new b value that is smaller than a, and we can find the set of x values by just adding 1 to each x value that we had for the original b. Negative values of b are treated similarly. So we need to store a correction, equal to the integer quotient (current b)/a, that is added to all the x values computed by later polynomials. Every time a new 'b' value is computed, we update the correction and then reduce b mod a. In this way, the rest of the code gets the 'b' values it wants, and we still initialize polynomials fast. The first b value is always slightly larger than a, so actual division is not needed to find the first correction */ correction = 0; while (mp_cmp(b, a) >= 0) { mp_sub(b, a, b); correction++; } conf->root_correction[0] = correction; /* Now derive all of the 'b' values that will use the same 'a' value */ num_derived_poly = 1 << (num_factors - 1); for (i = 1; i < num_derived_poly; i++) { /* Determine which temporary value will be used to compute the next b, and whether it will be added or subtracted. See Contini's SIQS paper for details on why this works. Also modify the correction to account for the new b value. When generating the roots for later polynomials, we have to remember which temporary value was used; its identity, along with a bit to choose addition or subtraction, goes into conf->next_poly_action[] */ uint32 bitpos = 0; b = conf->curr_b + i; while ((i & (1 << bitpos)) == 0) bitpos++; conf->next_poly_action[i] = bitpos; if (i & (1 << (bitpos+1)) ) { mp_add(b-1, poly_tmp_b + bitpos, b); if (mp_cmp(b, a) >= 0) { mp_sub(b, a, b); correction++; } } else { mp_copy(b-1, &tmp); if (mp_cmp(poly_tmp_b + bitpos, &tmp) >= 0) { mp_add(&tmp, a, &tmp); correction--; } mp_sub(&tmp, poly_tmp_b + bitpos, b); conf->next_poly_action[i] |= 0x80; } conf->root_correction[i] = correction; } } /*--------------------------------------------------------------------*/ void build_base_poly(sieve_conf_t *conf) { /* Build the next MPQS polynomial and prepare the factor base for using it */ msieve_obj *obj = conf->obj; char buf[256]; uint32 i, j, k; mp_t *a = &conf->curr_a; mp_t *b = conf->curr_b; mp_t tmp; fb_t *factor_base = conf->factor_base; uint32 *poly_factors = conf->poly_factors; uint32 num_factors = conf->num_poly_factors; uint8 *factor_bits = conf->factor_bits; uint32 *factor_bounds = conf->factor_bounds; mp_t *poly_tmp_b; uint32 curr_poly_factor; uint32 *poly_b_array; uint32 **poly_b_small; uint32 sieve_size = conf->num_sieve_blocks * conf->sieve_block_size / 2; uint32 *modsqrt_array = conf->modsqrt_array; /* compute the factors for this polynomial; filter out duplicate factors. Note that we store factor base offsets and not the primes themselves Also compute 'a', the product of these factors */ mp_clear(a); a->nwords = a->val[0] = 1; i = 0; while (i < num_factors - 1) { uint8 bits = factor_bits[i]; uint32 range = factor_bounds[bits+1] - factor_bounds[bits]; poly_factors[i] = factor_bounds[bits] + get_rand(&obj->seed1, &obj->seed2) % range; for (j = 0; j < i; j++) { if (poly_factors[j] == poly_factors[i]) break; } if (j == i) { /* factor is accepted */ uint32 prime = factor_base[poly_factors[i]].prime; mp_mul_1(a, prime, a); i++; } } /* the last factor is chosen to bring 'a' as close as possible to the value of conf->target_a. The 'yield' of the polynomial (in terms of number of relations to expect) depends critically on 'a' being neither too large nor too small. The sieve code will only work if this factor is one of the first sieve_large_fb_start factor base primes */ mp_div(&conf->target_a, a, &tmp); i = tmp.val[0]; /* first bracket the desired factor */ for (j = MIN_FB_OFFSET+1; j < conf->sieve_large_fb_start - 3; j++) { uint32 left = conf->factor_base[j].prime; uint32 right = conf->factor_base[j+1].prime; if (i >= left && i <= right) break; } for (; j < conf->sieve_large_fb_start - 2; j++) { uint32 left = conf->factor_base[j].prime; uint32 right = conf->factor_base[j+1].prime; uint32 prime, fb_offset; /* pick the factor base prime closest to the ideal value (stored in 'i') */ if (abs((int32)(i - right)) < abs((int32)(i - left))) { fb_offset = j + 1; prime = right; } else { fb_offset = j; prime = left; } /* keep it if it's not a duplicate */ for (k = 0; k < num_factors - 1; k++) { if (poly_factors[k] == fb_offset) break; } if (k == num_factors - 1) { poly_factors[k] = fb_offset; mp_mul_1(a, prime, a); break; } } qsort(poly_factors, (size_t)num_factors, sizeof(uint32), sort_ascending); /* Build all of the 'b' values corresponding to this 'a' */ build_derived_poly(conf); /* Initialize the factor base */ curr_poly_factor = 0; poly_b_array = conf->poly_b_array; poly_b_small = conf->poly_b_small; poly_tmp_b = conf->poly_tmp_b; for (i = MIN_FB_OFFSET + 1; i < conf->fb_size; i++) { fb_t *fbptr = factor_base + i; uint32 root1, root2; uint32 prime = fbptr->prime; if (prime < 100 && conf->multiplier % prime == 0) { /* prime is part of the multiplier. Don't sieve with it */ root1 = root2 = INVALID_ROOT; } else if (curr_poly_factor < num_factors && i == poly_factors[curr_poly_factor]) { /* the factor base prime divides 'a'. Skip it (we can compute the necessary roots here but it's harder for later polynomials) */ root1 = root2 = INVALID_ROOT; curr_poly_factor++; } else { /* compute the two polynomial-specific square roots for each factor base prime. Note that the roots represent an offset from -sieve_size, *not* from zero. Also, if B is the value of b that ordinary SIQS must compute, we are actually finding the roots of (a*(x+k) + (B-k*a))^2, for whatever value of k makes the second term less than a. Doing this right would require subtracting k (actually conf->root_correction[0]) from the roots generated below, but we can cheat and do that during the sieving when it's cheaper */ uint32 a_modp = mp_mod_1(a, prime); uint32 b_modp = prime - mp_mod_1(b, prime); uint32 modsqrt = modsqrt_array[i]; root1 = b_modp + modsqrt; root2 = b_modp + prime - modsqrt; j = sieve_size % prime; a_modp = mp_modinv_1(a_modp, prime); root1 = mp_modmul_1(root1, a_modp, prime); root2 = mp_modmul_1(root2, a_modp, prime); root1 = mp_modadd_1(root1, j, prime); root2 = mp_modadd_1(root2, j, prime); /* also perform the precomputations for the set of future polynomials that will reuse this 'a' value. */ if (i >= conf->sieve_large_fb_start) { for (j = 0; j < num_factors; j++) { b_modp = mp_mod_1(poly_tmp_b + j, prime); poly_b_array[j] = mp_modmul_1(a_modp, b_modp, prime); } poly_b_array += num_factors; } else { for (j = 0; j < num_factors; j++) { b_modp = mp_mod_1(poly_tmp_b + j, prime); poly_b_small[j][i] = mp_modmul_1( a_modp, b_modp, prime); } } } /* The sieving code uses shortcuts that depend on root2 >= root1 at all times */ if (root1 < root2) { fbptr->root1 = root1; fbptr->root2 = root2; } else { fbptr->root1 = root2; fbptr->root2 = root1; } } /* dump the 'a' value to disk */ i = sprintf(buf, "A"); for (j = 0; j < conf->num_poly_factors; j++) i += sprintf(buf + i, " %x", conf->poly_factors[j]); i += sprintf(buf + i, "\n"); qs_print_to_savefile(obj, buf); } /*--------------------------------------------------------------------*/ int sort_ascending(const void *x, const void *y) { uint32 *xx = (uint32 *)x; uint32 *yy = (uint32 *)y; return xx[0] - yy[0]; }