OpenXM_contrib/gmp/tune/tuneup.c - diff

Return to tuneup.c CVS log

Up to [local] / OpenXM_contrib / gmp / tune

Diff for /OpenXM_contrib/gmp/tune/Attic/tuneup.c between version 1.1.1.1 and 1.1.1.2

-version 1.1.1.1, 2000/09/09 14:13:19
+version 1.1.1.2, 2003/08/25 16:06:38
 Line 1
 Line 1
 Line 1
- /* Create tuned thresholds for various algorithms. */
+ /* Create tuned thresholds for various algorithms.
- /*
+ Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
- Copyright (C) 1999, 2000 Free Software Foundation, Inc.
  This file is part of the GNU MP Library.
-Line 18  License for more details.
+Line 17  License for more details.
 Line 18  License for more details.
 Line 17  License for more details.
  You should have received a copy of the GNU Lesser General Public License
  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- MA 02111-1307, USA.
+ MA 02111-1307, USA. */
- */
  /* Usage: tune [-t] [-t] [-p precision]
     -t turns on some diagnostic traces, a second -t turns on more traces.
-Line 52  MA 02111-1307, USA.
+Line 51  MA 02111-1307, USA.
 Line 52  MA 02111-1307, USA.
 Line 51  MA 02111-1307, USA.
     to the final speed of the relevant routines, but nothing has been done to
     check that carefully.
+    Remarks:
+    The code here isn't a vision of loveliness, mainly because it's subject
+    to ongoing modifications according to new things wanting to be tuned and
+    practical requirements of systems tested.
+    The way parts of the library are recompiled to insinuate the tuning
+    variables is a bit subtle, but unavoidable since of course the main
+    library has fixed thresholds compiled-in but we want to vary them here.
+    Most of the nonsense for this can be found in tune/Makefile.am and under
+    TUNE_PROGRAM_BUILD in gmp-impl.h.
+    The dirty hack which is the "second_start_min" feature could perhaps be done
+    more generally, so if say karatsuba is never better than toom3 then it
+    can be detected and omitted.  Currently we're hoping very hard that this
+    doesn't arise in practice, and if it does then it indicates something
+    badly sub-optimal in the karatsuba implementation.
     Limitations:
     The FFTs aren't subject to the same badness rule as the other thresholds,
     so each k is probably being brought on a touch early.  This isn't likely
     to make a difference, and the simpler probing means fewer tests.
  */
- #define TUNE_PROGRAM_BUILD  1
+ #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
+ #include "config.h"
  #include <math.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
+ #if HAVE_UNISTD_H
  #include <unistd.h>
+ #endif
  #include "gmp.h"
  #include "gmp-impl.h"
+ #include "longlong.h"
+ #include "tests.h"
  #include "speed.h"
- #include "sqr_basecase.h"
  #if !HAVE_DECL_OPTARG
  extern char *optarg;
-Line 80  extern int optind, opterr;
+Line 102  extern int optind, opterr;
 Line 80  extern int optind, opterr;
 Line 102  extern int optind, opterr;
  #endif
- #define MAX_SIZE        1000  /* limbs */
+ #define DEFAULT_MAX_SIZE   1000  /* limbs */
- #define STEP_FACTOR     0.01  /* how much to step sizes by (rounded down) */
+ #define MAX_TABLE             5  /* entries */
- #define MAX_TABLE       2     /* threshold entries */
  #if WANT_FFT
  mp_size_t  option_fft_max_size = 50000;  /* limbs */
  #else
-Line 103  int  allocdat = 0;
+Line 123  int  allocdat = 0;
 Line 103  int  allocdat = 0;
 Line 123  int  allocdat = 0;
  /* Each "_threshold" array must be 1 bigger than the number of thresholds
-    being tuned in a set, because one() stores an value in the entry above
+    being tuned in a set, because one() stores a value in the entry above
     the one it's determining. */
  mp_size_t  mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
- mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
  mp_size_t  sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
- mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
+ mp_size_t  sb_preinv_threshold[2] = { MP_SIZE_T_MAX };
- mp_size_t  bz_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  dc_threshold[2] = { MP_SIZE_T_MAX };
- mp_size_t  fib_threshold[2] = { MP_SIZE_T_MAX };
  mp_size_t  powm_threshold[2] = { MP_SIZE_T_MAX };
  mp_size_t  gcd_accel_threshold[2] = { MP_SIZE_T_MAX };
  mp_size_t  gcdext_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  divexact_1_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  divrem_1_norm_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  divrem_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  divrem_2_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  mod_1_norm_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  mod_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  modexact_1_odd_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  get_str_basecase_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  get_str_precompute_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  set_str_threshold[2] = { MP_SIZE_T_MAX };
+ mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
+ mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
- #ifndef KARATSUBA_SQR_MAX
+ #ifndef TUNE_SQR_KARATSUBA_MAX
- #define KARATSUBA_SQR_MAX  0 /* meaning no limit */
+ #define TUNE_SQR_KARATSUBA_MAX  0 /* meaning no limit */
  #endif
  struct param_t {
-   const char  *name[MAX_TABLE];
+   const char        *name[MAX_TABLE];
-   int         stop_since_change;
+   speed_function_t  function;
-   mp_size_t   min_size;
+   speed_function_t  function2;
-   mp_size_t   max_size[MAX_TABLE];
+   double            step_factor;    /* how much to step sizes (rounded down) */
+   double            function_fudge; /* multiplier for "function" speeds */
+   int               stop_since_change;
+   double            stop_factor;
+   mp_size_t         min_size[MAX_TABLE];
+   int               min_is_always;
+   int               second_start_min;
+   mp_size_t         max_size[MAX_TABLE];
+   mp_size_t         check_size;
+   mp_size_t         size_extra;
+ #define DATA_HIGH_LT_R  1
+ #define DATA_HIGH_GE_R  2
+   int               data_high;
+   int               noprint;
  };
+ #ifndef UDIV_PREINV_ALWAYS
+ #define UDIV_PREINV_ALWAYS 0
+ #endif
+ mp_limb_t
+ randlimb_norm (void)
+ {
+   mp_limb_t  n;
+   mpn_random (&n, 1);
+   n |= GMP_LIMB_HIGHBIT;
+   return n;
+ }
+ #define MP_LIMB_T_HALFMASK  ((CNST_LIMB(1) << (BITS_PER_MP_LIMB/2)) - 1)
+ mp_limb_t
+ randlimb_half (void)
+ {
+   mp_limb_t  n;
+   mpn_random (&n, 1);
+   n &= MP_LIMB_T_HALFMASK;
+   n += (n==0);
+   return n;
+ }
  /* Add an entry to the end of the dat[] array, reallocing to make it bigger
     if necessary.  */
  void
-Line 140  add_dat (mp_size_t size, double d)
+Line 211  add_dat (mp_size_t size, double d)
 Line 140  add_dat (mp_size_t size, double d)
 Line 211  add_dat (mp_size_t size, double d)
    if (ndat == allocdat)
      {
-       dat = (struct dat_t *) _mp_allocate_or_reallocate
+       dat = (struct dat_t *) __gmp_allocate_or_reallocate
          (dat, allocdat * sizeof(dat[0]),
           (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
        allocdat += ALLOCDAT_STEP;
-Line 191  analyze_dat (int i, int final)
+Line 262  analyze_dat (int i, int final)
 Line 191  analyze_dat (int i, int final)
 Line 262  analyze_dat (int i, int final)
            min_j = j;
          }
      }
    return min_j;
  }
+ /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
+ mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
+                                     mp_srcptr ap, mp_size_t size,
+                                     mp_limb_t d));
+ mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
  double
- tuneup_measure (speed_function_t fun, struct speed_params *s)
+ speed_mpn_mod_1_tune (struct speed_params *s)
  {
-   static mp_ptr  xp, yp;
+   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
+ }
+ double
+ speed_mpn_divrem_1_tune (struct speed_params *s)
+ {
+   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
+ }
+ double
+ tuneup_measure (speed_function_t fun,
+                 const struct param_t *param,
+                 struct speed_params *s)
+ {
+   static struct param_t  dummy;
    double   t;
    TMP_DECL (marker);
+   if (! param)
+     param = &dummy;
+   s->size += param->size_extra;
    TMP_MARK (marker);
    s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
    s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
-Line 210  tuneup_measure (speed_function_t fun, struct speed_par
+Line 307  tuneup_measure (speed_function_t fun, struct speed_par
 Line 210  tuneup_measure (speed_function_t fun, struct speed_par
 Line 307  tuneup_measure (speed_function_t fun, struct speed_par
    mpn_random (s->xp, s->size);
    mpn_random (s->yp, s->size);
+   switch (param->data_high) {
+   case DATA_HIGH_LT_R:
+     s->xp[s->size-1] %= s->r;
+     s->yp[s->size-1] %= s->r;
+     break;
+   case DATA_HIGH_GE_R:
+     s->xp[s->size-1] |= s->r;
+     s->yp[s->size-1] |= s->r;
+     break;
+   }
    t = speed_measure (fun, s);
+   s->size -= param->size_extra;
    TMP_FREE (marker);
    return t;
  }
+ #define PRINT_WIDTH  28
  void
- print_define (const char *name, mp_size_t value)
+ print_define_start (const char *name)
  {
-   printf ("#ifndef %s\n", name);
+   printf ("#define %-*s  ", PRINT_WIDTH, name);
-   printf ("#define %-23s  ", name);
+   if (option_trace)
+     printf ("...\n");
+ }
+ void
+ print_define_end_remark (const char *name, mp_size_t value, const char *remark)
+ {
+   if (option_trace)
+     printf ("#define %-*s  ", PRINT_WIDTH, name);
    if (value == MP_SIZE_T_MAX)
-     printf ("MP_SIZE_T_MAX\n");
+     printf ("MP_SIZE_T_MAX");
    else
-     printf ("%5ld\n", value);
+     printf ("%5ld", value);
-   printf ("#endif\n");
+   if (remark != NULL)
+     printf ("  /* %s */", remark);
+   printf ("\n");
  }
+ void
+ print_define_end (const char *name, mp_size_t value)
+ {
+   const char  *remark;
+   if (value == MP_SIZE_T_MAX)
+     remark = "never";
+   else if (value == 0)
+     remark = "always";
+   else
+     remark = NULL;
+   print_define_end_remark (name, value, remark);
+ }
+ void
+ print_define (const char *name, mp_size_t value)
+ {
+   print_define_start (name);
+   print_define_end (name, value);
+ }
+ void
+ print_define_remark (const char *name, mp_size_t value, const char *remark)
+ {
+   print_define_start (name);
+   print_define_end_remark (name, value, remark);
+ }
  /* table[i+1] needs to be set to a sensible value when testing method i+1
-    because mpn_mul_n uses TOOM3_MUL_THRESHOLD to size the temporary
+    because mpn_mul_n uses MUL_TOOM3_THRESHOLD to size the temporary
     workspace for mpn_kara_mul_n. */
  void
- one (speed_function_t function, mp_size_t table[], size_t max_table,
+ one (mp_size_t table[], size_t max_table, struct param_t *param)
-      struct param_t *param)
  {
-   static struct param_t  dummy;
+   mp_size_t  table_save0 = 0;
+   int  since_positive, since_thresh_change;
+   int  thresh_idx, new_thresh_idx;
    int  i;
-   if (param == NULL)  param = &dummy;
+   ASSERT_ALWAYS (max_table <= MAX_TABLE);
- #define DEFAULT(x,n)  if (param->x == 0)  param->x = (n);
+ #define DEFAULT(x,n)  if (! (param->x))  param->x = (n);
+   DEFAULT (function_fudge, 1.0);
+   DEFAULT (function2, param->function);
+   DEFAULT (step_factor, 0.01);  /* small steps by default */
    DEFAULT (stop_since_change, 80);
-   DEFAULT (min_size, 10);
+   DEFAULT (stop_factor, 1.2);
-   for (i = 0; i < numberof (param->max_size); i++)
+   for (i = 0; i < max_table; i++)
-     DEFAULT (max_size[i], MAX_SIZE);
+     DEFAULT (min_size[i], 10);
+   for (i = 0; i < max_table; i++)
+     DEFAULT (max_size[i], DEFAULT_MAX_SIZE);
-   s.size = param->min_size;
+   if (param->check_size != 0)
+     {
+       double   t1, t2;
+       s.size = param->check_size;
-   for (i = 0; i < max_table && s.size < MAX_SIZE; i++)
+       table[0] = s.size+1;
+       table[1] = param->max_size[0];
+       t1 = tuneup_measure (param->function, param, &s);
+       table[0] = s.size;
+       table[1] = s.size+1;
+       t2 = tuneup_measure (param->function2, param, &s);
+       if (t1 == -1.0 || t2 == -1.0)
+         {
+           printf ("Oops, can't run both functions at size %ld\n", s.size);
+           abort ();
+         }
+       t1 *= param->function_fudge;
+       /* ask that t2 is at least 4% below t1 */
+       if (t1 < t2*1.04)
+         {
+           if (option_trace)
+             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
+           table[0] = MP_SIZE_T_MAX;
+           if (! param->noprint)
+             print_define (param->name[0], table[0]);
+           return;
+         }
+       if (option_trace >= 2)
+         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
+                 s.size, t1, t2);
+     }
+   for (i = 0, s.size = 1; i < max_table && s.size < param->max_size[i]; i++)
      {
-       int  since_positive, since_thresh_change;
+       if (i == 1 && param->second_start_min)
-       int  thresh_idx, new_thresh_idx;
+         s.size = 1;
+       if (s.size < param->min_size[i])
+         s.size = param->min_size[i];
+       if (! (param->noprint || (i == 1 && param->second_start_min)))
+         print_define_start (param->name[i]);
        ndat = 0;
        since_positive = 0;
        since_thresh_change = 0;
-Line 268  one (speed_function_t function, mp_size_t table[], siz
+Line 464  one (speed_function_t function, mp_size_t table[], siz
 Line 268  one (speed_function_t function, mp_size_t table[], siz
 Line 464  one (speed_function_t function, mp_size_t table[], siz
            printf ("              (seconds)    (seconds)    diff    thresh\n");
          }
-       for ( ; s.size < MAX_SIZE;
+       for (;
-             s.size += MAX ((mp_size_t) floor (s.size * STEP_FACTOR), 1))
+            s.size < param->max_size[i];
+            s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
          {
            double   ti, tiplus1, d;
-Line 287  one (speed_function_t function, mp_size_t table[], siz
+Line 484  one (speed_function_t function, mp_size_t table[], siz
 Line 287  one (speed_function_t function, mp_size_t table[], siz
 Line 484  one (speed_function_t function, mp_size_t table[], siz
            /*
              FIXME: check minimum size requirements are met, possibly by just
              checking for the -1 returns from the speed functions.
-             if (s.size < MPN_TOOM_TABLE_TO_MINSIZE (i))
-             continue;
            */
+           /* under this hack, don't let method 0 get used at s.size */
+           if (i == 1 && param->second_start_min)
+             table[0] = MIN (s.size-1, table_save0);
            /* using method i at this size */
            table[i] = s.size+1;
-           table[i+1] = MAX_SIZE;
+           table[i+1] = param->max_size[i];
-           ti = tuneup_measure (function, &s);
+           ti = tuneup_measure (param->function, param, &s);
            if (ti == -1.0)
              abort ();
+           ti *= param->function_fudge;
            /* using method i+1 at this size */
            table[i] = s.size;
            table[i+1] = s.size+1;
-           tiplus1 = tuneup_measure (function, &s);
+           tiplus1 = tuneup_measure (param->function2, param, &s);
            if (tiplus1 == -1.0)
              abort ();
-Line 318  one (speed_function_t function, mp_size_t table[], siz
+Line 518  one (speed_function_t function, mp_size_t table[], siz
 Line 318  one (speed_function_t function, mp_size_t table[], siz
 Line 518  one (speed_function_t function, mp_size_t table[], siz
            if (option_trace >= 2)
-             printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %d\n",
+             printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
                      i, s.size, ti, tiplus1, d,
                      ti > tiplus1 ? '#' : ' ',
                      dat[new_thresh_idx].size);
-Line 327  one (speed_function_t function, mp_size_t table[], siz
+Line 527  one (speed_function_t function, mp_size_t table[], siz
 Line 327  one (speed_function_t function, mp_size_t table[], siz
 Line 527  one (speed_function_t function, mp_size_t table[], siz
               certain number of measurements ago.  */
  #define STOP_SINCE_POSITIVE  200
            if (d >= 0)
              since_positive = 0;
            else
              if (++since_positive > STOP_SINCE_POSITIVE)
                {
-Line 338  one (speed_function_t function, mp_size_t table[], siz
+Line 538  one (speed_function_t function, mp_size_t table[], siz
 Line 338  one (speed_function_t function, mp_size_t table[], siz
 Line 538  one (speed_function_t function, mp_size_t table[], siz
                }
            /* Stop if method i has become slower by a certain factor. */
- #define STOP_FACTOR   1.2
+           if (ti >= tiplus1 * param->stop_factor)
-           if (ti >= tiplus1 * STOP_FACTOR)
              {
                if (option_trace >= 1)
                  printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n",
-                         i, STOP_FACTOR);
+                         i, param->stop_factor);
                break;
              }
-Line 374  one (speed_function_t function, mp_size_t table[], siz
+Line 573  one (speed_function_t function, mp_size_t table[], siz
 Line 374  one (speed_function_t function, mp_size_t table[], siz
 Line 573  one (speed_function_t function, mp_size_t table[], siz
          }
        /* Stop when the size limit is reached before the end of the
-          crossover, without a specified param->max_size[i]. */
+          crossover, but only show this as an error for >= the default max
-       if (s.size >= MAX_SIZE)
+          size.  FIXME: Maybe should make it a param choice whether this is
+          an error.  */
+       if (s.size >= param->max_size[i]
+           && param->max_size[i] >= DEFAULT_MAX_SIZE)
          {
            fprintf (stderr, "%s\n", param->name[i]);
            fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n",
-Line 393  one (speed_function_t function, mp_size_t table[], siz
+Line 595  one (speed_function_t function, mp_size_t table[], siz
 Line 393  one (speed_function_t function, mp_size_t table[], siz
 Line 595  one (speed_function_t function, mp_size_t table[], siz
        table[i] = dat[analyze_dat (i, 1)].size;
-       print_define (param->name[i], table[i]);
+       /* fudge here, let min_is_always apply only to i==0, that's what the
+          sqr_n thresholds want */
+       if (i == 0 && param->min_is_always && table[i] == param->min_size[i])
+         table[i] = 0;
-       /* Look for the next threshold starting from the current one, but back
+       /* under the second_start_min fudge, if the second threshold turns out
-          a bit. */
+          to be lower than the first, then the second method is unwanted, we
+          should go straight from algorithm 1 to algorithm 3.  */
+       if (param->second_start_min)
+         {
+           if (i == 0)
+             {
+               table_save0 = table[0];
+               table[0] = 0;
+             }
+           else if (i == 1)
+             {
+               table[0] = table_save0;
+               if (table[1] <= table[0])
+                 {
+                   table[0] = table[1];
+                   table[1] = 0;
+                 }
+             }
+           s.size = MAX (table[0], table[1]) + 1;
+         }
+       if (! (param->noprint || (i == 0 && param->second_start_min)))
+         {
+           if (i == 1 && param->second_start_min)
+             {
+               print_define_end (param->name[0], table[0]);
+               print_define_start (param->name[1]);
+             }
+           print_define_end (param->name[i], table[i]);
+         }
+       /* Look for the next threshold starting from the current one. */
        s.size = table[i]+1;
-     }
+       /* Take a MAX of all to allow for second_start_min producing a 0. */
+       {
+         int  j;
+         for (j = 0; j < i; j++)
+           s.size = MAX (s.size, table[j]+1);
+       }
+     }
  }
-Line 426  struct fft_param_t {
+Line 670  struct fft_param_t {
 Line 426  struct fft_param_t {
 Line 670  struct fft_param_t {
    mp_size_t         sqr;
  };
  /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
     N=pl*BITS_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
     of 2^(k-1) bits. */
-Line 433  struct fft_param_t {
+Line 678  struct fft_param_t {
 Line 433  struct fft_param_t {
 Line 678  struct fft_param_t {
  mp_size_t
  fft_step_size (int k)
  {
-   if (2*k-1 > BITS_PER_INT)
+   mp_size_t  step;
+   step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB;
+   step *= (mp_size_t) 1 << k;
+   if (step <= 0)
      {
        printf ("Can't handle k=%d\n", k);
        abort ();
      }
-   return (1<<k) * (MAX (1<<(k-1), BITS_PER_MP_LIMB)) / BITS_PER_MP_LIMB;
+   return step;
  }
  mp_size_t
-Line 469  fft (struct fft_param_t *p)
+Line 720  fft (struct fft_param_t *p)
 Line 469  fft (struct fft_param_t *p)
 Line 720  fft (struct fft_param_t *p)
    option_trace = MAX (option_trace, option_fft_trace);
-   printf ("#ifndef %s\n", p->table_name);
    printf ("#define %s  {", p->table_name);
    if (option_trace >= 2)
      printf ("\n");
-Line 487  fft (struct fft_param_t *p)
+Line 737  fft (struct fft_param_t *p)
 Line 487  fft (struct fft_param_t *p)
 Line 737  fft (struct fft_param_t *p)
        if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))
          break;
-       usleep(10000);
        /* compare k to k+1 in the middle of the current k+1 step */
        s.size = size + fft_step_size (k+1) / 2;
        s.r = k;
-       tk = tuneup_measure (p->function, &s);
+       tk = tuneup_measure (p->function, NULL, &s);
        if (tk == -1.0)
          abort ();
-       usleep(10000);
        s.r = k+1;
-       tk1 = tuneup_measure (p->function, &s);
+       tk1 = tuneup_measure (p->function, NULL, &s);
        if (tk1 == -1.0)
          abort ();
        if (option_trace >= 2)
-         printf ("at %ld   size=%ld  k=%d  %.9lf   k=%d %.9lf\n",
+         printf ("at %ld   size=%ld  k=%d  %.9f   k=%d %.9f\n",
                  size, s.size, k, tk, k+1, tk1);
        /* declare the k+1 threshold as soon as it's faster at its midpoint */
-Line 519  fft (struct fft_param_t *p)
+Line 765  fft (struct fft_param_t *p)
 Line 519  fft (struct fft_param_t *p)
 Line 765  fft (struct fft_param_t *p)
    mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;
    printf (" 0 }\n");
-   printf ("#endif\n");
    size = p->first_size;
    /* Declare an FFT faster than a plain toom3 etc multiplication found as
       soon as one faster measurement obtained.  A multiplication in the
       middle of the FFT step is tested.  */
-Line 548  fft (struct fft_param_t *p)
+Line 793  fft (struct fft_param_t *p)
 Line 548  fft (struct fft_param_t *p)
 Line 793  fft (struct fft_param_t *p)
        if (size >= p->max_size)
          break;
-       usleep(10000);
        s.size = size + fft_step_size (k) / 2;
        s.r = k;
-       tk = tuneup_measure (p->function, &s);
+       tk = tuneup_measure (p->function, NULL, &s);
        if (tk == -1.0)
          abort ();
-       usleep(10000);
        if (!modf)  s.size /= 2;
-       tm = tuneup_measure (p->mul_function, &s);
+       tm = tuneup_measure (p->mul_function, NULL, &s);
        if (tm == -1.0)
          abort ();
        if (option_trace >= 2)
-         printf ("at %ld   size=%ld   k=%d  %.9lf   size=%ld %s mul %.9lf\n",
+         printf ("at %ld   size=%ld   k=%d  %.9f   size=%ld %s mul %.9f\n",
                  size,
                  size + fft_step_size (k) / 2, k, tk,
                  s.size, modf ? "modf" : "full", tm);
-Line 588  fft (struct fft_param_t *p)
+Line 829  fft (struct fft_param_t *p)
 Line 588  fft (struct fft_param_t *p)
 Line 829  fft (struct fft_param_t *p)
  }
+ /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
+    giving wrong results.  */
  void
- all (void)
+ tune_mul (void)
  {
-   TMP_DECL (marker);
+   static struct param_t  param;
+   param.name[0] = "MUL_KARATSUBA_THRESHOLD";
+   param.name[1] = "MUL_TOOM3_THRESHOLD";
+   param.function = speed_mpn_mul_n;
+   param.min_size[0] = MAX (4, MPN_KARA_MUL_N_MINSIZE);
+   param.max_size[0] = MUL_TOOM3_THRESHOLD_LIMIT-1;
+   param.max_size[1] = MUL_TOOM3_THRESHOLD_LIMIT-1;
+   one (mul_threshold, 2, &param);
-   TMP_MARK (marker);
+   /* disabled until tuned */
-   s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
+   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
-   s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
+ }
-   speed_time_init ();
-   fprintf (stderr, "speed_precision %d, speed_unittime %.2e\n",
-            speed_precision, speed_unittime);
-   fprintf (stderr, "MAX_SIZE %ld, fft_max_size %ld, STEP_FACTOR %.3f\n",
-            MAX_SIZE, option_fft_max_size, STEP_FACTOR);
-   fprintf (stderr, "\n");
-   {
+ /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
-     struct tm  *tp;
+    is faster only at size==2 then we don't want to bother with extra code
-     time_t     t;
+    just for that.  Start karatsuba from 4 same as MUL above.  */
-     time (&t);
+ void
-     tp = localtime (&t);
+ tune_sqr (void)
-     printf ("/* Generated by tuneup.c, %d-%02d-%02d. */\n\n",
+ {
-             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
+   static struct param_t  param;
+   param.name[0] = "SQR_BASECASE_THRESHOLD";
+   param.name[1] = "SQR_KARATSUBA_THRESHOLD";
+   param.name[2] = "SQR_TOOM3_THRESHOLD";
+   param.function = speed_mpn_sqr_n;
+   param.min_is_always = 1;
+   param.second_start_min = 1;
+   param.min_size[0] = 3;
+   param.min_size[1] = MAX (4, MPN_KARA_SQR_N_MINSIZE);
+   param.min_size[2] = MPN_TOOM3_SQR_N_MINSIZE;
+   param.max_size[0] = TUNE_SQR_KARATSUBA_MAX;
+   param.max_size[1] = TUNE_SQR_KARATSUBA_MAX;
+   one (sqr_threshold, 3, &param);
+   /* disabled until tuned */
+   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
+ }
+ void
+ tune_sb_preinv (void)
+ {
+   static struct param_t  param;
+   if (UDIV_PREINV_ALWAYS)
+     {
+       print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always");
+       return;
+     }
+   param.check_size = 256;
+   param.min_size[0] = 3;
+   param.min_is_always = 1;
+   param.size_extra = 3;
+   param.stop_factor = 2.0;
+   param.name[0] = "DIV_SB_PREINV_THRESHOLD";
+   param.function = speed_mpn_sb_divrem_m3;
+   one (sb_preinv_threshold, 1, &param);
+ }
+ void
+ tune_dc (void)
+ {
+   static struct param_t  param;
+   param.name[0] = "DIV_DC_THRESHOLD";
+   param.function = speed_mpn_dc_tdiv_qr;
+   one (dc_threshold, 1, &param);
+ }
+ /* This is an indirect determination, based on a comparison between redc and
+    mpz_mod.  A fudge factor of 1.04 is applied to redc, to represent
+    additional overheads it gets in mpz_powm.
+    stop_factor is 1.1 to hopefully help cray vector systems, where otherwise
+    currently it hits the 1000 limb limit with only a factor of about 1.18
+    (threshold should be around 650).  */
+ void
+ tune_powm (void)
+ {
+   static struct param_t  param;
+   param.name[0] = "POWM_THRESHOLD";
+   param.function = speed_redc;
+   param.function2 = speed_mpz_mod;
+   param.step_factor = 0.03;
+   param.stop_factor = 1.1;
+   param.function_fudge = 1.04;
+   one (powm_threshold, 1, &param);
+ }
+ void
+ tune_gcd_accel (void)
+ {
+   static struct param_t  param;
+   param.name[0] = "GCD_ACCEL_THRESHOLD";
+   param.function = speed_mpn_gcd;
+   param.min_size[0] = 1;
+   one (gcd_accel_threshold, 1, &param);
+ }
+ /* A comparison between the speed of a single limb step and a double limb
+    step is made.  On a 32-bit limb the ratio is about 2.2 single steps to
+    equal a double step, or on a 64-bit limb about 2.09.  (These were found
+    from counting the steps on a 10000 limb gcdext.)  */
+ void
+ tune_gcdext (void)
+ {
+   static struct param_t  param;
+   param.name[0] = "GCDEXT_THRESHOLD";
+   param.function = speed_mpn_gcdext_one_single;
+   param.function2 = speed_mpn_gcdext_one_double;
+   switch (BITS_PER_MP_LIMB) {
+   case 32: param.function_fudge = 2.2; break;
+   case 64: param.function_fudge = 2.09; break;
+   default:
+     printf ("Don't know GCDEXT_THERSHOLD factor for BITS_PER_MP_LIMB == %d\n",
+             BITS_PER_MP_LIMB);
+     abort ();
    }
+   param.min_size[0] = 5;
+   param.min_is_always = 1;
+   param.max_size[0] = 300;
+   param.check_size = 300;
+   one (gcdext_threshold, 1, &param);
+ }
+ /* size_extra==1 reflects the fact that with high<divisor one division is
+    always skipped.  Forcing high<divisor while testing ensures consistency
+    while stepping through sizes, ie. that size-1 divides will be done each
+    time.
+    min_size==2 and min_is_always are used so that if plain division is only
+    better at size==1 then don't bother including that code just for that
+    case, instead go with preinv always and get a size saving.  */
+ #define DIV_1_PARAMS                    \
+   param.check_size = 256;               \
+   param.min_size[0] = 2;                   \
+   param.min_is_always = 1;              \
+   param.data_high = DATA_HIGH_LT_R;     \
+   param.size_extra = 1;                 \
+   param.stop_factor = 2.0;
+ double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
+ void
+ tune_divrem_1 (void)
+ {
+   /* plain version by default */
+   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
+ #ifndef HAVE_NATIVE_mpn_divrem_1
+ #define HAVE_NATIVE_mpn_divrem_1 0
+ #endif
+   /* No support for tuning native assembler code, do that by hand and put
+      the results in the .asm file, there's no need for such thresholds to
+      appear in gmp-mparam.h.  */
+   if (HAVE_NATIVE_mpn_divrem_1)
+     return;
+   if (UDIV_PREINV_ALWAYS)
+     {
+       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
+       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
+       return;
+     }
+   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
+   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
+      a bit out for the fractional part, but that's too bad, the integer part
+      is more important. */
    {
      static struct param_t  param;
-     param.name[0] = "KARATSUBA_MUL_THRESHOLD";
+     param.name[0] = "DIVREM_1_NORM_THRESHOLD";
-     param.name[1] = "TOOM3_MUL_THRESHOLD";
+     DIV_1_PARAMS;
-     param.max_size[1] = TOOM3_MUL_THRESHOLD_LIMIT;
+     s.r = randlimb_norm ();
-     one (speed_mpn_mul_n, mul_threshold, numberof(mul_threshold)-1, &param);
+     param.function = speed_mpn_divrem_1_tune;
+     one (divrem_1_norm_threshold, 1, &param);
    }
-   printf("\n");
    {
      static struct param_t  param;
-     param.name[0] = "KARATSUBA_SQR_THRESHOLD";
+     param.name[0] = "DIVREM_1_UNNORM_THRESHOLD";
-     param.name[1] = "TOOM3_SQR_THRESHOLD";
+     DIV_1_PARAMS;
-     param.max_size[0] = KARATSUBA_SQR_MAX;
+     s.r = randlimb_half ();
-     one (speed_mpn_sqr_n, sqr_threshold, numberof(sqr_threshold)-1, &param);
+     param.function = speed_mpn_divrem_1_tune;
+     one (divrem_1_unnorm_threshold, 1, &param);
    }
-   printf("\n");
+ }
+ double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
+ void
+ tune_mod_1 (void)
+ {
+   /* plain version by default */
+   tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
+ #ifndef HAVE_NATIVE_mpn_mod_1
+ #define HAVE_NATIVE_mpn_mod_1 0
+ #endif
+   /* No support for tuning native assembler code, do that by hand and put
+      the results in the .asm file, there's no need for such thresholds to
+      appear in gmp-mparam.h.  */
+   if (HAVE_NATIVE_mpn_mod_1)
+     return;
+   if (UDIV_PREINV_ALWAYS)
+     {
+       print_define ("MOD_1_NORM_THRESHOLD", 0L);
+       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
+       return;
+     }
+   tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
    {
      static struct param_t  param;
-     param.name[0] = "BZ_THRESHOLD";
+     param.name[0] = "MOD_1_NORM_THRESHOLD";
-     one (speed_mpn_bz_tdiv_qr, bz_threshold, 1, &param);
+     DIV_1_PARAMS;
+     s.r = randlimb_norm ();
+     param.function = speed_mpn_mod_1_tune;
+     one (mod_1_norm_threshold, 1, &param);
    }
-   printf("\n");
    {
      static struct param_t  param;
-     param.name[0] = "FIB_THRESHOLD";
+     param.name[0] = "MOD_1_UNNORM_THRESHOLD";
-     one (speed_mpz_fib_ui, fib_threshold, 1, &param);
+     DIV_1_PARAMS;
+     s.r = randlimb_half ();
+     param.function = speed_mpn_mod_1_tune;
+     one (mod_1_unnorm_threshold, 1, &param);
    }
-   printf("\n");
+ }
-   /* mpz_powm becomes slow before long, so stop soon after the determined
-      threshold stops changing. */
+ /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
+    imply that udiv_qrnnd_preinv is worth using, but it seems most
+    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
+    directly.  */
+ void
+ tune_preinv_divrem_1 (void)
+ {
+   static struct param_t  param;
+   speed_function_t  divrem_1;
+   const char        *divrem_1_name;
+   double            t1, t2;
+ #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
+ #define HAVE_NATIVE_mpn_preinv_divrem_1 0
+ #endif
+   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
+      it's faster than mpn_divrem_1.  */
+   if (HAVE_NATIVE_mpn_preinv_divrem_1)
+     {
+       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
+       return;
+     }
+   /* If udiv_qrnnd_preinv is the only division method then of course
+      mpn_preinv_divrem_1 should be used.  */
+   if (UDIV_PREINV_ALWAYS)
+     {
+       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
+       return;
+     }
+   /* If we've got an assembler version of mpn_divrem_1, then compare against
+      that, not the mpn_divrem_1_div generic C.  */
+   if (HAVE_NATIVE_mpn_divrem_1)
+     {
+       divrem_1 = speed_mpn_divrem_1;
+       divrem_1_name = "mpn_divrem_1";
+     }
+   else
+     {
+       divrem_1 = speed_mpn_divrem_1_div;
+       divrem_1_name = "mpn_divrem_1_div";
+     }
+   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
+   s.size = 200;                     /* generous but not too big */
+   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
+      since in general that's probably most common, though in fact for a
+      64-bit limb mp_bases[10].big_base is normalized.  */
+   s.r = urandom() & (MP_LIMB_T_MAX >> 4);
+   if (s.r == 0) s.r = 123;
+   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
+   t2 = tuneup_measure (divrem_1, &param, &s);
+   if (t1 == -1.0 || t2 == -1.0)
+     {
+       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
+               divrem_1_name, s.size);
+       abort ();
+     }
+   if (option_trace >= 1)
+     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
+             s.size, t1, divrem_1_name, t2);
+   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
+ }
+ /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
+    that udiv_qrnnd_preinv is worth using, but it seems most straightforward
+    to compare mpn_preinv_mod_1 and mpn_mod_1_div directly.  */
+ void
+ tune_preinv_mod_1 (void)
+ {
+   static struct param_t  param;
+   speed_function_t  mod_1;
+   const char        *mod_1_name;
+   double            t1, t2;
+ #ifndef HAVE_NATIVE_mpn_preinv_mod_1
+ #define HAVE_NATIVE_mpn_preinv_mod_1 0
+ #endif
+   /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
+      faster than mpn_mod_1.  */
+   if (HAVE_NATIVE_mpn_preinv_mod_1)
+     {
+       print_define_remark ("USE_PREINV_MOD_1", 1, "native");
+       return;
+     }
+   /* If udiv_qrnnd_preinv is the only division method then of course
+      mpn_preinv_mod_1 should be used.  */
+   if (UDIV_PREINV_ALWAYS)
+     {
+       print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
+       return;
+     }
+   /* If we've got an assembler version of mpn_mod_1, then compare against
+      that, not the mpn_mod_1_div generic C.  */
+   if (HAVE_NATIVE_mpn_mod_1)
+     {
+       mod_1 = speed_mpn_mod_1;
+       mod_1_name = "mpn_mod_1";
+     }
+   else
+     {
+       mod_1 = speed_mpn_mod_1_div;
+       mod_1_name = "mpn_mod_1_div";
+     }
+   param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
+   s.size = 200;                     /* generous but not too big */
+   s.r = randlimb_norm();            /* divisor */
+   t1 = tuneup_measure (speed_mpn_preinv_mod_1, &param, &s);
+   t2 = tuneup_measure (mod_1, &param, &s);
+   if (t1 == -1.0 || t2 == -1.0)
+     {
+       printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
+               mod_1_name, s.size);
+       abort ();
+     }
+   if (option_trace >= 1)
+     printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
+             s.size, t1, mod_1_name, t2);
+   print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
+ }
+ void
+ tune_divrem_2 (void)
+ {
+   static struct param_t  param;
+ #ifndef HAVE_NATIVE_mpn_divrem_2
+ #define HAVE_NATIVE_mpn_divrem_2 0
+ #endif
+   /* No support for tuning native assembler code, do that by hand and put
+      the results in the .asm file, and there's no need for such thresholds
+      to appear in gmp-mparam.h.  */
+   if (HAVE_NATIVE_mpn_divrem_2)
+     return;
+   if (UDIV_PREINV_ALWAYS)
+     {
+       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
+       return;
+     }
+   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
+      a bit out for the fractional part, but that's too bad, the integer part
+      is more important.
+      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
+      code space if plain division is better only at size==2 or size==3. */
+   param.name[0] = "DIVREM_2_THRESHOLD";
+   param.check_size = 256;
+   param.min_size[0] = 4;
+   param.min_is_always = 1;
+   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
+   param.stop_factor = 2.0;
+   s.r = randlimb_norm ();
+   param.function = speed_mpn_divrem_2;
+   one (divrem_2_threshold, 1, &param);
+ }
+ /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
+    tune for that.  Its speed can differ on odd or even divisor, so take an
+    average threshold for the two.
+    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
+    might not vary that way, but don't test this since high<divisor isn't
+    expected to occur often with small divisors.  */
+ void
+ tune_divexact_1 (void)
+ {
+   static struct param_t  param;
+   mp_size_t  thresh[2], average;
+   int        low, i;
+   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
+   param.name[0] = "DIVEXACT_1_THRESHOLD";
+   param.data_high = DATA_HIGH_GE_R;
+   param.check_size = 256;
+   param.min_size[0] = 2;
+   param.stop_factor = 1.5;
+   param.function  = tuned_speed_mpn_divrem_1;
+   param.function2 = speed_mpn_divexact_1;
+   param.noprint = 1;
+   print_define_start (param.name[0]);
+   for (low = 0; low <= 1; low++)
+     {
+       s.r = randlimb_half();
+       if (low == 0)
+         s.r |= 1;
+       else
+         s.r &= ~CNST_LIMB(7);
+       one (divexact_1_threshold, 1, &param);
+       if (option_trace)
+         printf ("low=%d thresh %ld\n", low, divexact_1_threshold[0]);
+       if (divexact_1_threshold[0] == MP_SIZE_T_MAX)
+         {
+           average = MP_SIZE_T_MAX;
+           goto divexact_1_done;
+         }
+       thresh[low] = divexact_1_threshold[0];
+     }
+   if (option_trace)
+     {
+       printf ("average of:");
+       for (i = 0; i < numberof(thresh); i++)
+         printf (" %ld", thresh[i]);
+       printf ("\n");
+     }
+   average = 0;
+   for (i = 0; i < numberof(thresh); i++)
+     average += thresh[i];
+   average /= numberof(thresh);
+   /* If divexact turns out to be better as early as 3 limbs, then use it
+      always, so as to reduce code size and conditional jumps.  */
+   if (average <= 3)
+     average = 0;
+  divexact_1_done:
+   print_define_end (param.name[0], average);
+ }
+ /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
+    same as mpn_mod_1, but this might not be true of an assembler
+    implementation.  The threshold used is an average based on data where a
+    divide can be skipped and where it can't.
+    If modexact turns out to be better as early as 3 limbs, then use it
+    always, so as to reduce code size and conditional jumps.  */
+ void
+ tune_modexact_1_odd (void)
+ {
+   static struct param_t  param;
+   mp_size_t  thresh_lt;
+   ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
+   param.name[0] = "MODEXACT_1_ODD_THRESHOLD";
+   param.check_size = 256;
+   param.min_size[0] = 2;
+   param.stop_factor = 1.5;
+   param.function  = tuned_speed_mpn_mod_1;
+   param.function2 = speed_mpn_modexact_1c_odd;
+   param.noprint = 1;
+   s.r = randlimb_half () | 1;
+   print_define_start (param.name[0]);
+   param.data_high = DATA_HIGH_LT_R;
+   one (modexact_1_odd_threshold, 1, &param);
+   if (option_trace)
+     printf ("lt thresh %ld\n", modexact_1_odd_threshold[0]);
+   thresh_lt = modexact_1_odd_threshold[0];
+   if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
+     {
+       param.data_high = DATA_HIGH_GE_R;
+       one (modexact_1_odd_threshold, 1, &param);
+       if (option_trace)
+         printf ("ge thresh %ld\n", modexact_1_odd_threshold[0]);
+       if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
+         {
+           modexact_1_odd_threshold[0]
+             = (modexact_1_odd_threshold[0] + thresh_lt) / 2;
+           if (modexact_1_odd_threshold[0] <= 3)
+             modexact_1_odd_threshold[0] = 0;
+         }
+     }
+   print_define_end (param.name[0], modexact_1_odd_threshold[0]);
+ }
+ void
+ tune_jacobi_base (void)
+ {
+   static struct param_t  param;
+   double   t1, t2, t3;
+   int      method;
+   s.size = BITS_PER_MP_LIMB * 3 / 4;
+   t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
+   if (option_trace >= 1)
+     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", s.size, t1);
+   t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
+   if (option_trace >= 1)
+     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", s.size, t2);
+   t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
+   if (option_trace >= 1)
+     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", s.size, t3);
+   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
+     {
+       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
+               s.size);
+       abort ();
+     }
+   if (t1 < t2 && t1 < t3)
+     method = 1;
+   else if (t2 < t3)
+     method = 2;
+   else
+     method = 3;
+   print_define ("JACOBI_BASE_METHOD", method);
+ }
+ void
+ tune_get_str (void)
+ {
+   /* Tune for decimal, it being most common.  Some rough testing suggests
+      other bases are different, but not by very much.  */
+   s.r = 10;
    {
      static struct param_t  param;
-     param.name[0] = "POWM_THRESHOLD";
+     get_str_precompute_threshold[0] = 0;
-     param.stop_since_change = 15;
+     param.name[0] = "GET_STR_DC_THRESHOLD";
-     one (speed_mpz_powm, powm_threshold, 1, &param);
+     param.function = speed_mpn_get_str;
+     param.min_size[0] = 2;
+     param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
+     one (get_str_basecase_threshold, 1, &param);
    }
-   printf("\n");
    {
      static struct param_t  param;
-     param.name[0] = "GCD_ACCEL_THRESHOLD";
+     param.name[0] = "GET_STR_PRECOMPUTE_THRESHOLD";
-     param.min_size = 1;
+     param.function = speed_mpn_get_str;
-     one (speed_mpn_gcd, gcd_accel_threshold, 1, &param);
+     param.min_size[0] = get_str_basecase_threshold[0];
+     param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
+     one (get_str_precompute_threshold, 1, &param);
    }
+ }
+ void
+ tune_set_str (void)
+ {
+   static struct param_t  param;
+   s.r = 10;  /* decimal */
+   param.step_factor = 0.04;
+   param.name[0] = "SET_STR_THRESHOLD";
+   param.function = speed_mpn_set_str_basecase;
+   param.function2 = speed_mpn_set_str_subquad;
+   param.min_size[0] = 100;
+   param.max_size[0] = 150000;
+   one (set_str_threshold, 1, &param);
+ }
+ void
+ tune_fft_mul (void)
+ {
+   static struct fft_param_t  param;
+   if (option_fft_max_size == 0)
+     return;
+   param.table_name          = "MUL_FFT_TABLE";
+   param.threshold_name      = "MUL_FFT_THRESHOLD";
+   param.p_threshold         = &MUL_FFT_THRESHOLD;
+   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
+   param.p_modf_threshold    = &MUL_FFT_MODF_THRESHOLD;
+   param.first_size          = MUL_TOOM3_THRESHOLD / 2;
+   param.max_size            = option_fft_max_size;
+   param.function            = speed_mpn_mul_fft;
+   param.mul_function        = speed_mpn_mul_n;
+   param.sqr = 0;
+   fft (&param);
+ }
+ void
+ tune_fft_sqr (void)
+ {
+   static struct fft_param_t  param;
+   if (option_fft_max_size == 0)
+     return;
+   param.table_name          = "SQR_FFT_TABLE";
+   param.threshold_name      = "SQR_FFT_THRESHOLD";
+   param.p_threshold         = &SQR_FFT_THRESHOLD;
+   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
+   param.p_modf_threshold    = &SQR_FFT_MODF_THRESHOLD;
+   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
+   param.max_size            = option_fft_max_size;
+   param.function            = speed_mpn_mul_fft_sqr;
+   param.mul_function        = speed_mpn_sqr_n;
+   param.sqr = 0;
+   fft (&param);
+ }
+ void
+ all (void)
+ {
+   time_t  start_time, end_time;
+   TMP_DECL (marker);
+   TMP_MARK (marker);
+   s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
+   s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
+   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
+   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
+   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
+   speed_time_init ();
+   fprintf (stderr, "Using: %s\n", speed_time_string);
+   fprintf (stderr, "speed_precision %d", speed_precision);
+   if (speed_unittime == 1.0)
+     fprintf (stderr, ", speed_unittime 1 cycle");
+   else
+     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
+   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
+     fprintf (stderr, ", CPU freq unknown\n");
+   else
+     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
+   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
+            DEFAULT_MAX_SIZE, option_fft_max_size);
+   fprintf (stderr, "\n");
+   time (&start_time);
    {
-     static struct param_t  param;
+     struct tm  *tp;
-     param.name[0] = "GCDEXT_THRESHOLD";
+     tp = localtime (&start_time);
-     param.min_size = 1;
+     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
-     param.max_size[0] = 200;
+             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
-     one (speed_mpn_gcdext, gcdext_threshold, 1, &param);
+ #ifdef __GNUC__
+     /* gcc sub-minor version doesn't seem to come through as a define */
+     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
+ #define PRINTED_COMPILER
+ #endif
+ #if defined (__SUNPRO_C)
+     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
+ #define PRINTED_COMPILER
+ #endif
+ #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
+     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
+     printf ("MIPSpro C %d.%d.%d */\n",
+             _COMPILER_VERSION / 100,
+             _COMPILER_VERSION / 10 % 10,
+             _COMPILER_VERSION % 10);
+ #define PRINTED_COMPILER
+ #endif
+ #if defined (__DECC) && defined (__DECC_VER)
+     printf ("DEC C %d */\n", __DECC_VER);
+ #define PRINTED_COMPILER
+ #endif
+ #if ! defined (PRINTED_COMPILER)
+     printf ("system compiler */\n");
+ #endif
    }
+   printf ("\n");
+   tune_mul ();
    printf("\n");
-   if (option_fft_max_size != 0)
+   tune_sqr ();
-     {
+   printf("\n");
-       {
-         static struct fft_param_t  param;
-         param.table_name          = "FFT_MUL_TABLE";
-         param.threshold_name      = "FFT_MUL_THRESHOLD";
-         param.p_threshold         = &FFT_MUL_THRESHOLD;
-         param.modf_threshold_name = "FFT_MODF_MUL_THRESHOLD";
-         param.p_modf_threshold    = &FFT_MODF_MUL_THRESHOLD;
-         param.first_size          = TOOM3_MUL_THRESHOLD / 2;
-         param.max_size            = option_fft_max_size;
-         param.function            = speed_mpn_mul_fft;
-         param.mul_function        = speed_mpn_mul_n;
-         param.sqr = 0;
-         fft (&param);
-       }
-       printf("\n");
-       {
-         static struct fft_param_t  param;
-         param.table_name          = "FFT_SQR_TABLE";
-         param.threshold_name      = "FFT_SQR_THRESHOLD";
-         param.p_threshold         = &FFT_SQR_THRESHOLD;
-         param.modf_threshold_name = "FFT_MODF_SQR_THRESHOLD";
-         param.p_modf_threshold    = &FFT_MODF_SQR_THRESHOLD;
-         param.first_size          = TOOM3_SQR_THRESHOLD / 2;
-         param.max_size            = option_fft_max_size;
-         param.function            = speed_mpn_mul_fft_sqr;
-         param.mul_function        = speed_mpn_sqr_n;
-         param.sqr = 0;
-         fft (&param);
-       }
-       printf ("\n");
-     }
+   tune_sb_preinv ();
+   tune_dc ();
+   tune_powm ();
+   printf("\n");
+   tune_gcd_accel ();
+   tune_gcdext ();
+   tune_jacobi_base ();
+   printf("\n");
+   tune_divrem_1 ();
+   tune_mod_1 ();
+   tune_preinv_divrem_1 ();
+   tune_preinv_mod_1 ();
+   tune_divrem_2 ();
+   tune_divexact_1 ();
+   tune_modexact_1_odd ();
+   printf("\n");
+   tune_get_str ();
+   tune_set_str ();
+   printf("\n");
+   tune_fft_mul ();
+   printf("\n");
+   tune_fft_sqr ();
+   printf ("\n");
+   time (&end_time);
+   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
+           end_time - start_time);
    TMP_FREE (marker);
  }
-Line 740  main (int argc, char *argv[])
+Line 1646  main (int argc, char *argv[])
 Line 740  main (int argc, char *argv[])
 Line 1646  main (int argc, char *argv[])
          exit(1);
        }
      }
    all ();
-   return 0;
+   exit (0);
  }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>