/*--------------------------------------------------------------------
This source distribution is placed in the public domain by its author,
Jason Papadopoulos. You may use it for any purpose, free of charge,
without having to notify anyone. I disclaim any responsibility for any
errors.
Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may
benefit from your work.
--jasonp@boo.net 6/3/07
--------------------------------------------------------------------*/
#include <common.h>
#include "mpqs.h"
static void collect_relations(sieve_conf_t *conf,
uint32 target_relations,
qs_core_sieve_fcn core_sieve_fcn);
static uint32 do_sieving_internal(sieve_conf_t *conf,
uint32 max_relations,
qs_core_sieve_fcn core_sieve_fcn);
#ifdef SIEVE_TIMING
#define PRINT_TIME(var) printf(#var ": %lf (%4.1f%%)\n", \
(double)(var) / 1.86e9, \
100.0 * (double)(var) / (double)total_time);
uint64 total_time;
uint64 base_poly_time;
uint64 next_poly_small_time;
uint64 next_poly_large_time;
uint64 plus_init_time;
uint64 bucket_time;
uint64 sieve_small_time;
uint64 sieve_large_time;
uint64 tf_plus_scan_time;
uint64 tf_total_time;
uint64 tf_small_time;
uint64 tf_large_time;
#else
#define PRINT_TIME(var) /* nothing */
#endif
/*--------------------------------------------------------------------*/
void do_sieving(msieve_obj *obj, mp_t *n,
mp_t **poly_a_list, poly_t **poly_list,
fb_t *factor_base, uint32 *modsqrt_array,
sieve_param_t *params, uint32 multiplier,
relation_t **relation_list, uint32 *num_relations,
la_col_t **cycle_list, uint32 *num_cycles) {
sieve_conf_t conf;
uint32 bound;
uint32 i;
uint32 bits;
uint32 fb_size = params->fb_size;
uint32 sieve_size = params->sieve_size;
uint32 num_sieve_blocks;
uint32 error_bits;
uint32 max_relations, relations_found;
uint32 sieve_block_size;
uint32 recip_cutoff;
qs_core_sieve_fcn core_sieve_fcn;
/* fill in initial sieve parameters */
memset(&conf, 0, sizeof(conf));
conf.obj = obj;
conf.n = n;
conf.multiplier = multiplier;
conf.factor_base = factor_base;
conf.modsqrt_array = modsqrt_array;
conf.fb_size = params->fb_size;
bits = mp_bits(conf.n);
/* decide on the size of one sieve block. If the L1
cache size is 32kB then this is also the sieve block
size, otherwise it is 64kB. The latter rule is needed
because Pentium 4 CPUs have very small L1 sizes, and
are designed to work out of L2 cache most of the
time anyway */
if (obj->cache_size1 == 32768)
conf.sieve_block_size = sieve_block_size = 32768;
else
conf.sieve_block_size = sieve_block_size = 65536;
conf.sieve_array = (uint8 *)aligned_malloc(sieve_block_size, 64);
/* decide on the core sieving routine to use */
core_sieve_fcn = NULL;
#if defined(__GNUC__)
#if defined(__i386__)
if (obj->cpu == cpu_pentium2 && sieve_block_size == 65536) {
logprintf(obj, "using 64kb Pentium 2 sieve core\n");
core_sieve_fcn = qs_core_sieve_p2_64k;
}
else if (obj->cpu == cpu_pentium3 && sieve_block_size == 65536) {
logprintf(obj, "using 64kb Pentium 3 sieve core\n");
core_sieve_fcn = qs_core_sieve_p3_64k;
}
else if (obj->cpu == cpu_pentium4 && sieve_block_size == 65536) {
logprintf(obj, "using 64kb Pentium 4 sieve core\n");
core_sieve_fcn = qs_core_sieve_p4_64k;
}
else if (obj->cpu == cpu_pentium_m && sieve_block_size == 32768) {
logprintf(obj, "using 32kb Pentium M sieve core\n");
core_sieve_fcn = qs_core_sieve_pm_32k;
}
else if (obj->cpu == cpu_core && sieve_block_size == 32768) {
logprintf(obj, "using 32kb Intel Core sieve core\n");
core_sieve_fcn = qs_core_sieve_core_32k;
}
else if (obj->cpu == cpu_athlon && sieve_block_size == 65536) {
logprintf(obj, "using 64kb Athlon sieve core\n");
core_sieve_fcn = qs_core_sieve_k7_64k;
}
else if (obj->cpu == cpu_athlon_xp && sieve_block_size == 65536) {
logprintf(obj, "using 64kb Athlon XP sieve core\n");
core_sieve_fcn = qs_core_sieve_k7xp_64k;
}
else if (obj->cpu == cpu_opteron && sieve_block_size == 65536) {
logprintf(obj, "using 64kb Opteron sieve core\n");
core_sieve_fcn = qs_core_sieve_k8_64k;
}
#elif defined(__x86_64__)
if (obj->cpu == cpu_pentium4 && sieve_block_size == 65536) {
logprintf(obj, "using 32kb Pentium 4 sieve core\n");
core_sieve_fcn = qs_core_sieve_p4_64k;
}
else if (obj->cpu == cpu_core && sieve_block_size == 32768) {
logprintf(obj, "using 32kb Intel Core sieve core\n");
core_sieve_fcn = qs_core_sieve_core_32k;
}
else if (obj->cpu == cpu_opteron && sieve_block_size == 65536) {
logprintf(obj, "using 64kb Opteron sieve core\n");
core_sieve_fcn = qs_core_sieve_k8_64k;
}
#endif
#endif
#if defined(_MSC_VER) && (_MSC_VER >= 1400)
if (sieve_block_size == 32768) {
logprintf(obj, "using VC8 32kb sieve core\n");
core_sieve_fcn = qs_core_sieve_vc8_32k;
}
else {
logprintf(obj, "using VC8 64kb sieve core\n");
core_sieve_fcn = qs_core_sieve_vc8_64k;
}
#else
if (core_sieve_fcn == NULL) {
if (sieve_block_size == 32768) {
logprintf(obj, "using generic 32kb sieve core\n");
core_sieve_fcn = qs_core_sieve_generic_32k;
}
else {
logprintf(obj, "using generic 64kb sieve core\n");
core_sieve_fcn = qs_core_sieve_generic_64k;
}
}
#endif
/* round the size of the sieve interval (positive plus
negative parts combined) up to an integral number
of sieve blocks */
num_sieve_blocks = (2 * sieve_size + sieve_block_size - 1) /
sieve_block_size;
logprintf(obj, "sieve interval: %u blocks of size %u\n",
num_sieve_blocks, sieve_block_size);
/* choose the largest factor base prime that is not
sieved, and that uses small reciprocals during
trial factoring. The small prime variation becomes
less important as the input size increases, and
we reduce its influence in that case */
if (bits > 350)
bound = 15;
else if (bits > 290)
bound = 20;
else if (bits > 230)
bound = 22;
else if (bits > 200)
bound = 22;
else
bound = 50;
bound = MIN(bound, fb_size);
/* to avoid using explicit remainder operations
when trial factoring by factor base primes,
we multiply by precomputed reciprocals of those
primes. See Agner Fog's brilliant optimization
manuals for why this works (www.agner.org)
The reciprocals must all fit in 32 bits, and be
given inputs not exceeding 2^r for r a compile-time
constant. Each reciprocal value for prime p is either
2^r / p or 2^r / p + 1, so there is a range of factor
base entries that have r = 32 (for small p) and
another range that has r = 40 (for larger p).
range 1: p < 256 or small prime bound, r = 32 */
recip_cutoff = ((uint64)1 << 32) / (
(uint64)num_sieve_blocks * sieve_block_size);
if (recip_cutoff < 256) {
logprintf(obj, "error1: factor base and/or sieve interval "
"is too large\n");
exit(-1);
}
for (i = MIN_FB_OFFSET + 1; i < bound; i++) {
fb_t *fbptr = conf.factor_base + i;
uint32 prime = fbptr->prime;
if (prime >= 1000 || prime >= recip_cutoff)
break;
fbptr->recip = ((uint64)1 << 32) / (uint64)prime;
if (floor(MP_RADIX / (double)prime + 0.5) ==
(double)fbptr->recip) {
fbptr->rcorrect = 1;
}
else {
fbptr->rcorrect = 0;
fbptr->recip++;
}
}
conf.tf_small_recip1_cutoff = i;
/* range 2: p < (cutoff for unsieved factor base entries), r = 40.
This range is probably empty */
for (; i < bound; i++) {
fb_t *fbptr = conf.factor_base + i;
uint32 prime = fbptr->prime;
if (prime >= 1000)
break;
fbptr->recip = ((uint64)1 << 40) / (uint64)prime;
if (floor(256 * MP_RADIX / (double)prime + 0.5) ==
(double)fbptr->recip) {
fbptr->rcorrect = 1;
}
else {
fbptr->rcorrect = 0;
fbptr->recip++;
}
}
conf.tf_small_recip2_cutoff = i;
conf.sieve_small_fb_start = i;
/* range 3: p < block size, r = 32 */
for (; i < fb_size; i++) {
fb_t *fbptr = conf.factor_base + i;
uint32 prime = fbptr->prime;
if (prime >= sieve_block_size || prime >= recip_cutoff)
break;
fbptr->recip = ((uint64)1 << 32) / (uint64)prime;
if (floor(MP_RADIX / (double)prime + 0.5) ==
(double)fbptr->recip) {
fbptr->rcorrect = 1;
}
else {
fbptr->rcorrect = 0;
fbptr->recip++;
}
}
conf.tf_med_recip1_cutoff = i;
/* repeat, but for large reciprocals. An implicit
limit here is that the trial division code will not
work if the product of the largest such factor base
prime and the largest sieve offset exceeds 2^40
range 4: p < block size, r = 40 */
recip_cutoff = ((uint64)1 << 40) / (
(uint64)num_sieve_blocks * sieve_block_size);
if (recip_cutoff < sieve_block_size) {
logprintf(obj, "error: factor base and/or sieve interval "
"is too large\n");
exit(-1);
}
for (; i < fb_size; i++) {
fb_t *fbptr = conf.factor_base + i;
uint32 prime = fbptr->prime;
if (prime >= sieve_block_size)
break;
fbptr->recip = ((uint64)1 << 40) / (uint64)prime;
if (floor(256 * MP_RADIX / (double)prime + 0.5) ==
(double)fbptr->recip) {
fbptr->rcorrect = 1;
}
else {
fbptr->rcorrect = 0;
fbptr->recip++;
}
}
conf.tf_med_recip2_cutoff = i;
/* choose the largest factor base prime that does not
use hashtables or reciprocals. We could make this
range empty, but it's better to start using hashtable
with primes somewhat larger than the block size.
This greatly reduces the amount of memory used by
the hashtables, and trial factoring by primes in this
range is extremely cheap */
for (; i < fb_size && conf.factor_base[i].prime <
3 * sieve_block_size; i++) {
/* nothing */
}
conf.tf_large_cutoff = i;
conf.sieve_large_fb_start = i;
conf.packed_fb = (packed_fb_t *)malloc(i * sizeof(packed_fb_t));
/* The sieve code is optimized for sieving intervals that are
extremely small. To reduce the overhead of using a large
factor base, cache blocking is used for the sieve interval
*and* the factor base; this requires that sieving takes place
for several polynomials simultaneously. */
/* the number of polynomials that are simultaneously
sieved depends on how long each sieve interval is;
more polynomials are more efficient but consume
more memory */
conf.poly_block = (uint32)((double)65536 / sieve_block_size *
100 / num_sieve_blocks + 1);
logprintf(obj, "processing polynomials in batches of %u\n",
conf.poly_block);
conf.fb_block = 200;
bound = conf.factor_base[conf.fb_size - 1].prime;
logprintf(obj, "using a sieve bound of %u (%u primes)\n",
bound, conf.fb_size);
/* The trial factoring code can only handle 32-bit
factors, so the single large prime bound must
fit in 32 bits. Note that the SQUFOF code can
only manage 31-bit factors, so that single large
prime relations can have a 32-bit factor but
double large prime relations are limited to 31-bit
factors */
if ((uint32)(-1) / bound <= params->large_mult)
bound = (uint32)(-1);
else
bound *= params->large_mult;
logprintf(obj, "using large prime bound of %u (%d bits)\n",
bound, (int32)(log((double)bound) / M_LN2));
/* compute max_fb2, the square of the largest factor base prime */
i = conf.factor_base[fb_size - 1].prime;
mp_clear(&conf.max_fb2);
conf.max_fb2.nwords = 1;
conf.max_fb2.val[0] = i;
mp_mul_1(&conf.max_fb2, i, &conf.max_fb2);
mp_clear(&conf.large_prime_max2);
/* fill in sieving cutoffs that leverage the small
prime variation. Do not skip small primes if the
factor base is too small, and skip fewer small
primes when the factor base is large (i.e. when it's
more important to increase the relation rate) */
error_bits = (uint32)(log((double)bound) / M_LN2 + 0.5);
if (fb_size < 800) {
conf.cutoff2 = (uint32)(1.5 * error_bits);
conf.cutoff1 = conf.cutoff2;
conf.sieve_small_fb_start = MIN_FB_OFFSET + 1;
/* do not change the trial factoring ranges, since
they depend on the choice of reciprocal used */
}
else {
/* Turn on double large primes if n is 85 digits or more */
if (bits >= 282) {
mp_t *large_max2 = &conf.large_prime_max2;
/* the double-large-prime cutoff is equal to
the single-large-prime cutoff raised to the 1.8
power. We don't square the single cutoff because
relations with two really big large primes are
very rare, and will almost never survive the
filtering step anyway */
mp_clear(large_max2);
large_max2->nwords = 1;
large_max2->val[0] = bound;
mp_mul_1(large_max2,
(uint32)pow((double)bound, 0.8),
large_max2);
conf.cutoff2 = mp_bits(large_max2);
logprintf(obj, "using double large prime bound of "
"%s (%u-%u bits)\n",
mp_sprintf(large_max2, 10, obj->mp_sprintf_buf),
mp_bits(&conf.max_fb2),
conf.cutoff2);
}
else {
conf.cutoff2 = error_bits;
}
conf.cutoff1 = conf.cutoff2 + 30;
logprintf(obj, "using trial factoring cutoff "
"of %u bits\n", conf.cutoff2);
}
/* set up the hashtable if it's going to be used; otherwise
the rest of the sieve code will silently ignore it.
There is one hash bin for every sieve block in the sieve
interval; there are also twice as many such bins because
we collect positive and negative sieve offsets into separate
hashtables. Finally, we sieve over up to POLY_BLOCK_SIZE
polynomials at the same time, so the number of hash bins
is similarly multiplied.
Needless to say, that's a *lot* of hash bins! The sieve
interval must be extremely small to keep the hashtable
size down; that's okay, very small sieve intervals are
actually more likely to contain smooth relations */
conf.hashtable = (hashtable_t *)calloc((size_t)(conf.poly_block *
num_sieve_blocks),
sizeof(hashtable_t));
if (fb_size > conf.sieve_large_fb_start) {
for (i = 0; i < conf.poly_block * num_sieve_blocks; i++) {
conf.hashtable[i].num_alloc = 1000;
conf.hashtable[i].list = (hash_entry_t *)
malloc(1000 * sizeof(hash_entry_t));
}
}
/* fill in miscellaneous parameters */
conf.num_sieve_blocks = num_sieve_blocks;
conf.large_prime_max = bound;
/* initialize the polynomial generation code. Note that
we do *not* use the sieve size that is input to specify
the sizing of polynomials, but instead use the rounded value
derived from it */
poly_init(&conf, num_sieve_blocks * sieve_block_size / 2);
/* initialize the bookkeeping for tracking partial relations */
conf.components = 0;
conf.vertices = 0;
if (!(obj->flags & MSIEVE_FLAG_SKIP_QS_CYCLES)) {
conf.cycle_hashtable = (uint32 *)calloc(
(size_t)(1 << LOG2_CYCLE_HASH),
sizeof(uint32));
conf.cycle_table_size = 1;
conf.cycle_table_alloc = 10000;
conf.cycle_table = (cycle_t *)malloc(conf.cycle_table_alloc *
sizeof(cycle_t));
}
/* stop when this many relations are found */
max_relations = obj->max_relations;
if (obj->max_relations == 0)
max_relations = fb_size + 3 * NUM_EXTRA_RELATIONS / 2;
/* do all the sieving, and save the relations
that were generated */
obj->flags |= MSIEVE_FLAG_SIEVING_IN_PROGRESS;
TIME1(total_time)
relations_found = do_sieving_internal(&conf, max_relations,
core_sieve_fcn);
TIME2(total_time)
PRINT_TIME(total_time);
PRINT_TIME(base_poly_time);
PRINT_TIME(next_poly_small_time);
PRINT_TIME(next_poly_large_time);
PRINT_TIME(plus_init_time);
PRINT_TIME(bucket_time);
PRINT_TIME(sieve_small_time);
PRINT_TIME(sieve_large_time);
PRINT_TIME(tf_plus_scan_time);
PRINT_TIME(tf_total_time);
PRINT_TIME(tf_small_time);
PRINT_TIME(tf_large_time);
/* free all of the sieving structures first, to leave
more memory for the postprocessing step. Do *not* free
the polynomial subsystem yet. Sieving is only considered
finished when the savefile has received all pending output */
qs_flush_savefile(obj);
fclose(obj->savefile);
obj->savefile = NULL;
obj->flags &= ~MSIEVE_FLAG_SIEVING_IN_PROGRESS;
for (i = 0; i < conf.poly_block * num_sieve_blocks; i++) {
free(conf.hashtable[i].list);
}
free(conf.hashtable);
free(conf.packed_fb);
aligned_free(conf.sieve_array);
/* if enough relations are available, do the postprocessing
and save the results where the rest of the program can
find them. Don't run filter_relations() if the cycle-
counting structures have not been initialized */
if (relations_found >= max_relations &&
!(obj->flags & MSIEVE_FLAG_SKIP_QS_CYCLES)) {
if(obj->flags & (MSIEVE_FLAG_USE_LOGFILE |
MSIEVE_FLAG_LOG_TO_STDOUT)) {
fprintf(stderr, "sieving complete, "
"commencing postprocessing\n");
}
qs_filter_relations(&conf);
*relation_list = conf.relation_list;
*num_relations = conf.num_relations;
*cycle_list = conf.cycle_list;
*num_cycles = conf.num_cycles;
*poly_a_list = conf.poly_a_list;
*poly_list = conf.poly_list;
}
poly_free(&conf);
free(conf.cycle_table);
free(conf.cycle_hashtable);
}
/*--------------------------------------------------------------------*/
static uint32 do_sieving_internal(sieve_conf_t *conf,
uint32 max_relations,
qs_core_sieve_fcn core_sieve_fcn) {
uint32 num_relations = 0;
uint32 update;
msieve_obj *obj = conf->obj;
/* open the savefile; if the file already
exists and the first line contains n in base 16,
then read in the large primes of relations that have
already been found */
obj->savefile = fopen(obj->savefile_name, "a+");
if (obj->savefile == NULL) {
logprintf(obj, "cannot open savefile\n");
exit(-1);
}
fseek(obj->savefile, (long)0, SEEK_END);
if (ftell(obj->savefile) != 0) {
char buf[256];
mp_t t0;
fseek(obj->savefile, (long)0, SEEK_SET);
fgets(buf, (int)sizeof(buf), obj->savefile);
mp_str2mp(buf, &t0, 16);
if (mp_cmp(conf->n, &t0) != 0) {
/* the savefile was for a different n. Truncate
the file and write the present n. I hope you
backed up savefiles you wanted! */
fclose(obj->savefile);
obj->savefile = fopen(obj->savefile_name, "w");
qs_print_to_savefile(obj, mp_sprintf(conf->n, 16,
obj->mp_sprintf_buf));
qs_print_to_savefile(obj, "\n");
}
else {
/* Read in the large primes for all the relations
in the savefile, and count the number of
cycles that can be formed. Note that no check
for duplicate or corrupted relations is made here;
the cycle finder will rebuild everything from
scratch when sieving finishes, and does all the
verification at that point */
fgets(buf, (int)sizeof(buf), obj->savefile);
while (!feof(obj->savefile)) {
uint32 prime1, prime2;
char *tmp = strchr(buf, 'L');
if (buf[0] != 'R' || tmp == NULL) {
fgets(buf, (int)sizeof(buf),
obj->savefile);
continue;
}
read_large_primes(tmp, &prime1, &prime2);
if (prime1 == prime2) {
conf->num_relations++;
}
else {
add_to_cycles(conf, prime1, prime2);
conf->num_cycles++;
}
fgets(buf, (int)sizeof(buf),
obj->savefile);
}
fseek(obj->savefile, (long)0, SEEK_END);
logprintf(obj, "restarting with %u full and %u "
"partial relations\n",
conf->num_relations,
conf->num_cycles);
num_relations = conf->num_relations +
conf->num_cycles +
conf->components - conf->vertices;
}
}
else {
/* start a new savefile; write n to it */
qs_print_to_savefile(obj, mp_sprintf(conf->n, 16,
obj->mp_sprintf_buf));
qs_print_to_savefile(obj, "\n");
}
/* choose how many full relations to collect before
printing a progress update */
if (max_relations >= 62000)
update = 5;
else if (max_relations >= 50000)
update = 25;
else if (max_relations >= 10000)
update = 100;
else
update = 200;
update = MIN(update, max_relations / 10);
if (num_relations < max_relations &&
(obj->flags & (MSIEVE_FLAG_USE_LOGFILE |
MSIEVE_FLAG_LOG_TO_STDOUT))) {
fprintf(stderr, "\nsieving in progress "
"(press Ctrl-C to pause)\n");
}
/* sieve until at least that many relations have
been found, then update the number of fulls and
partials. This way we can declare sieving to be
finished the moment enough relations are available */
while (!(obj->flags & MSIEVE_FLAG_STOP_SIEVING) &&
num_relations < max_relations) {
collect_relations(conf, update, core_sieve_fcn);
num_relations = conf->num_relations +
conf->num_cycles +
conf->components - conf->vertices;
if (obj->flags & (MSIEVE_FLAG_USE_LOGFILE |
MSIEVE_FLAG_LOG_TO_STDOUT)) {
fprintf(stderr, "%u relations (%u full + "
"%u combined from %u partial), need %u\r",
num_relations,
conf->num_relations,
conf->num_cycles +
conf->components - conf->vertices,
conf->num_cycles,
max_relations);
fflush(stderr);
}
}
if (obj->flags & (MSIEVE_FLAG_USE_LOGFILE |
MSIEVE_FLAG_LOG_TO_STDOUT))
fprintf(stderr, "\n");
logprintf(obj, "%u relations (%u full + %u combined from "
"%u partial), need %u\n",
num_relations, conf->num_relations,
conf->num_cycles +
conf->components - conf->vertices,
conf->num_cycles,
max_relations);
return num_relations;
}
/*--------------------------------------------------------------------*/
static void collect_relations(sieve_conf_t *conf,
uint32 target_relations,
qs_core_sieve_fcn core_sieve_fcn) {
uint32 i;
uint32 relations_found = 0;
uint32 num_poly = 1 << (conf->num_poly_factors - 1);
uint32 poly_block = MIN(num_poly, conf->poly_block);
/* top-level sieving loop: keep building polynomials
and sieving with them until at least target_relations
relations have been found */
while (relations_found < target_relations) {
/* build the next batch of polynomials. For
big factorizations there may be thousands
of them */
TIME1(base_poly_time)
build_base_poly(conf);
TIME2(base_poly_time)
/* Do the sieving for all polynomials, handling
batches of polynomials at a time. */
i = 0;
while (i < num_poly) {
uint32 curr_num_poly = MIN(poly_block, num_poly - i);
relations_found += core_sieve_fcn(conf, i,
curr_num_poly);
i += curr_num_poly;
/* see if anybody wants sieving to stop. If num_poly
is small we bail out after sieving for the entire
current batch of polynomials has finished. If
num_poly is large we wait until a significant
number of polynomials have been sieved. Basically
you're entitled to a big pile of polynomials
whenever the loop runs, and you should get your
money's worth without having to wait a really
long time to finish up */
if ((conf->obj->flags & MSIEVE_FLAG_STOP_SIEVING) &&
((num_poly <= 1024 && i == num_poly) ||
(num_poly > 1024 && i > 2000))) {
return;
}
}
}
}
/*--------------------------------------------------------------------*/
uint32 check_sieve_val(sieve_conf_t *conf, int32 sieve_offset,
uint32 bits, mp_t *a, mp_t *b, mp_t *c,
uint32 poly_index, hashtable_t *hash_bucket) {
/* check a single sieve value for smoothness. This
routine is called quite rarely but is very compu-
tationally intensive. Returns 1 if the input sieve
value is completely factored, 0 otherwise.
There are a lot of things in this routine that are
not explained very well in references. The quadratic
sieve uses a polynomial p(x) = (sqrt(n) + x), and trial
division is performed on p(x)^2 - n. MPQS instead uses
a polynomial p(x) = a * x + b, where a and b are several
digits smaller than sqrt(n).
Thus, for MPQS p(x)^2 - n is usually much larger than
sqrt(n). However, the way a and b were computed, p(x)^2 - n
is divisible by a. Rather than compute p(x)^2 - n and divide
manually by a, you can instead compute a * x^2 + 2 * b * x + c,
where c is precomputed to be (b*b-n)/a. This quadratic
polynomial happens to be (p(x)^2-n)/a, and its value is a
little less than sqrt(n) so you can trial divide like in QS.
c and 2*b were both precomputed by calling code, although
c is not used after the sieving phase and need not be saved */
uint32 i, j;
uint32 num_factors = 0;
uint32 sign_of_index;
uint32 fb_offsets[32 * MAX_MP_WORDS / 4];
mp_t prod, res;
fb_t *factor_base = conf->factor_base;
packed_fb_t *packed_factor_base = conf->packed_fb;
uint32 tf_small_recip1_cutoff = conf->tf_small_recip1_cutoff;
uint32 tf_small_recip2_cutoff = conf->tf_small_recip2_cutoff;
uint32 tf_med_recip1_cutoff = conf->tf_med_recip1_cutoff;
uint32 tf_med_recip2_cutoff = conf->tf_med_recip2_cutoff;
uint32 tf_large_cutoff = conf->tf_large_cutoff;
hash_entry_t *list;
uint32 lowbits;
mp_t base, exponent, ans;
uint32 cutoff2;
uint32 abs_offset;
uint32 tf_offset;
uint32 index;
uint32 sieve_block_size = conf->sieve_block_size;
int32 sieve_size = conf->num_sieve_blocks * sieve_block_size;
/* Compute the polynomial index. We work with two
such numbers, one to use in the trial factoring and
the other to compute the value of the sieving
polynomial. The numbers are different because of
a subtle issue in polynomial initialization (see
the full description in poly.c) */
tf_offset = sieve_offset + sieve_size / 2;
index = tf_offset & (sieve_block_size - 1);
sieve_offset += (int32)(conf->root_correction[poly_index] -
conf->root_correction[0]);
abs_offset = (uint32)abs(sieve_offset);
sign_of_index = POSITIVE;
if (sieve_offset < 0)
sign_of_index = NEGATIVE;
/* if the index is positive, compute a*x^2 + 2*b*x + c, or
using Horner's rule, (a*x+2*b)*x + c.
if the index is negative, compute a*x^2 - 2*b*x + c, or
using Horner's rule, (a*x-2*b)*x + c. */
mp_mul_1(a, abs_offset, &prod);
if (sign_of_index == POSITIVE) {
mp_add(&prod, b, &prod);
}
else {
/* An extremely subtle special case: if sieve_offset
is -1, then the polynomial value is a-2b+c; if this
was ordinary MPQS we could guarantee that 2b < a,
but with self-initialization this may not be the case. */
if (abs_offset == 1 && mp_cmp(a, b) < 0)
return 0;
mp_sub(&prod, b, &prod);
}
mp_mul_1(&prod, abs_offset, &res);
/* handle the '+c' part. Note that because c is negative,
the final polynomial value can have the OPPOSITE SIGN
of sieve_offset. The linear algebra code cares about
the sign of 'res', but the MPQS square root phase cares
about the sign of sieve_offset. Thus, you have to track
both these values for every relation.
The literature considers this fact too low-level to mention */
if (mp_cmp(&res, c) >= 0) { /* poly value is positive */
mp_sub(&res, c, &res);
}
else { /* poly value is negative */
mp_sub(c, &res, &res);
fb_offsets[num_factors++] = 0;
}
/* now that the sieve value has been computed, calculate
the exact number of bits that would be needed to indicate
trial factoring is worthwhile. If sieve_offset is near
a real root of the polynomial then res can be several digits
smaller than calling code expects, and we don't want to
throw it away by accident (especially since the odds are
better that it will be smooth) */
cutoff2 = mp_bits(&res);
if (cutoff2 >= conf->cutoff2)
cutoff2 -= conf->cutoff2;
else
cutoff2 = 0;
/* Do trial division; factor out all primes in
the factor base from 'res'. First pull out
factors of two */
lowbits = mp_rjustify(&res, &res);
bits += lowbits;
for (i = 0; i < lowbits; i++)
fb_offsets[num_factors++] = MIN_FB_OFFSET;
/* Now deal with all the rest of the factor base
primes. Rather than perform multiple-precision
divides, this code uses the fact that if 'prime'
divides 'res', then 'sieve_offset' lies on one
of two arithmetic progressions. In other words,
sieve_offset = root1 + i*prime or root2 + j*prime
for some i or j, if prime divides res. Because
sieve_offset is always less than 32 bits, and
root1 and root2 are available, a single remainder
operation is enough to determine divisibility.
The first step is a small amount of trial division
before comparison of log2(sieve_value) to cutoff2.
Begin with factor base primes whose reciprocal assumes
a numerator up to 2^32 */
for (i = MIN_FB_OFFSET + 1; i < tf_small_recip1_cutoff; i++) {
fb_t *fbptr = factor_base + i;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
uint32 logprime = fbptr->logprime;
uint32 recip = fbptr->recip;
uint32 rcorrect = fbptr->rcorrect;
/* if the roots have not been computed, do
a multiple precision mod. The only value for
which this is true (in this loop) should be
the small multiplier */
if (root1 == INVALID_ROOT) {
if (mp_mod_1(&res, prime) == 0) {
do {
bits += logprime;
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
continue;
}
j = (uint32)(((uint64)(tf_offset + rcorrect) *
(uint64)recip) >> 32);
j = tf_offset - j * prime;
if (j == root1 || j == root2) {
do {
bits += logprime;
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
}
/* continue with primes whose reciprocal assumes 2^40.
This range is probably empty */
for (; i < tf_small_recip2_cutoff; i++) {
fb_t *fbptr = factor_base + i;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
uint32 logprime = fbptr->logprime;
uint32 recip = fbptr->recip;
uint32 rcorrect = fbptr->rcorrect;
if (root1 == INVALID_ROOT) {
if (mp_mod_1(&res, prime) == 0) {
do {
bits += logprime;
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
continue;
}
j = (uint32)(((uint64)(tf_offset + rcorrect) *
(uint64)recip) >> 40);
j = tf_offset - j * prime;
if (j == root1 || j == root2) {
do {
bits += logprime;
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
}
if (bits <= cutoff2)
return 0;
/* Now perform trial division for the rest of the
"small" factor base primes. Begin with those whose
reciprocal assumes numerators up to 2^32 */
TIME1(tf_small_time)
for (; i < tf_med_recip1_cutoff; i++) {
fb_t *fbptr = factor_base + i;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
uint32 recip = fbptr->recip;
uint32 rcorrect = fbptr->rcorrect;
#ifdef MANUAL_PREFETCH
if (i % 4 == 0)
PREFETCH(fbptr + 4);
#endif
/* if the roots have not been computed, do
a multiple precision mod. The only values for
which this is true (in this loop) are the
factors of the polynomial 'a' value */
if (root1 == INVALID_ROOT) {
if (mp_mod_1(&res, prime) == 0) {
do {
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
continue;
}
j = (uint32)(((uint64)(tf_offset + rcorrect) *
(uint64)recip) >> 32);
j = tf_offset - j * prime;
if (j == root1 || j == root2) {
do {
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
}
/* repeat for numerators up to 2^40 */
for (; i < tf_med_recip2_cutoff; i++) {
fb_t *fbptr = factor_base + i;
uint32 prime = fbptr->prime;
uint32 root1 = fbptr->root1;
uint32 root2 = fbptr->root2;
uint32 recip = fbptr->recip;
uint32 rcorrect = fbptr->rcorrect;
#ifdef MANUAL_PREFETCH
if (i % 4 == 0)
PREFETCH(fbptr + 4);
#endif
if (root1 == INVALID_ROOT) {
if (mp_mod_1(&res, prime) == 0) {
do {
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
continue;
}
j = (uint32)(((uint64)(tf_offset + rcorrect) *
(uint64)recip) >> 40);
j = tf_offset - j * prime;
if (j == root1 || j == root2) {
do {
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
}
/* handle the largest factor base primes that are
not hashed. Since by design all such primes p exceed
the size of the sieve block, at most two offsets
per sieve block are divisible by p. We have the
offsets of the *next* sieve values that are divisible
by p, so if the current offset is p away from either
of these then it is divisible by p as well. This test
is multiply-and-divide-free, and so is extremely fast */
for (; i < tf_large_cutoff; i++) {
packed_fb_t *pfbptr = packed_factor_base + i;
uint32 prime = pfbptr->prime;
uint32 root1 = pfbptr->next_loc1;
uint32 root2 = pfbptr->next_loc2;
#ifdef MANUAL_PREFETCH
if (i % 4 == 0)
PREFETCH(pfbptr + 4);
#endif
if (root1 == INVALID_ROOT) {
if (mp_mod_1(&res, prime) == 0) {
do {
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
continue;
}
j = index + prime - sieve_block_size;
if (j == root1 || j == root2) {
do {
fb_offsets[num_factors++] = i;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
}
TIME2(tf_small_time)
list = hash_bucket->list;
/* This hashtable entry contains all the primes that
divide sieve offsets in this range, along with the
sieve offsets those primes divide. Hence trial
division for the entire factor base above small_fb_size
requires only iterating through this hashtable entry
and comparing sieve offsets
Not only does this not require any divisions, but
the number of entries in list[] is much smaller
than the full factor base (5-10x smaller) */
TIME1(tf_large_time)
for (i = 0; i < hash_bucket->num_used; i++) {
#ifdef MANUAL_PREFETCH
if (i % 8 == 0)
PREFETCH(list + i + 16);
#endif
if (list[i].sieve_offset == index) {
uint32 prime_index = list[i].prime_index;
uint32 prime = factor_base[prime_index].prime;
do {
fb_offsets[num_factors++] = prime_index;
mp_divrem_1(&res, prime, &res);
j = mp_mod_1(&res, prime);
} while (j == 0);
}
}
TIME2(tf_large_time)
/* encode the sign of sieve_offset into its top bit */
abs_offset |= sign_of_index << 31;
/* If 'res' has been completely factored, save a full relation */
if (res.nwords == 1 && res.val[0] == 1) {
save_relation(conf, abs_offset, fb_offsets,
num_factors, poly_index,
1, 1);
return 1;
}
/* if 'res' is smaller than the bound for partial
relations, save a 1LP-relation */
if (res.nwords == 1 && res.val[0] < conf->large_prime_max) {
save_relation(conf, abs_offset, fb_offsets,
num_factors, poly_index,
1, res.val[0]);
return 0;
}
/* if 'res' is smaller than the square of the
largest factor base prime, then 'res' is
itself prime and is useless for our purposes.
Note that single large prime relations will
always fail at this point */
if (mp_cmp(&res, &conf->max_fb2) < 0)
return 0;
/* 'res' is not too small; see if it's too big */
if (mp_cmp(&res, &conf->large_prime_max2) > 0)
return 0;
/* perform a base-2 pseudoprime test to make sure
'res' is composite */
mp_sub_1(&res, 1, &exponent);
mp_clear(&base);
base.nwords = 1; base.val[0] = 2;
mp_expo(&base, &exponent, &res, &ans);
if (mp_is_one(&ans))
return 0;
/* *finally* attempt to factor 'res'; if successful,
and both factors are smaller than the single
large prime bound, save 'res' as a partial-partial
relation */
i = squfof(&res);
if (i > 1) {
mp_divrem_1(&res, i, &res);
if (i < conf->large_prime_max && res.nwords == 1 &&
res.val[0] < conf->large_prime_max) {
if (i == res.val[0]) {
/* if 'res' is a perfect square, then this
is actually a full relation! */
save_relation(conf, abs_offset, fb_offsets,
num_factors, poly_index,
i, res.val[0]);
return 1;
}
else {
save_relation(conf, abs_offset, fb_offsets,
num_factors, poly_index,
i, res.val[0]);
}
}
}
return 0;
}
syntax highlighted by Code2HTML, v. 0.9.1