[BACK]Return to tuneup.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / tune

Diff for /OpenXM_contrib/gmp/tune/Attic/tuneup.c between version 1.1 and 1.1.1.2

version 1.1, 2000/09/09 14:13:19 version 1.1.1.2, 2003/08/25 16:06:38
Line 1 
Line 1 
 /* Create tuned thresholds for various algorithms. */  /* Create tuned thresholds for various algorithms.
   
 /*  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 Copyright (C) 1999, 2000 Free Software Foundation, Inc.  
   
 This file is part of the GNU MP Library.  This file is part of the GNU MP Library.
   
Line 18  License for more details.
Line 17  License for more details.
 You should have received a copy of the GNU Lesser General Public License  You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 MA 02111-1307, USA.  MA 02111-1307, USA. */
 */  
   
   
 /* Usage: tune [-t] [-t] [-p precision]  /* Usage: tune [-t] [-t] [-p precision]
   
    -t turns on some diagnostic traces, a second -t turns on more traces.     -t turns on some diagnostic traces, a second -t turns on more traces.
Line 52  MA 02111-1307, USA.
Line 51  MA 02111-1307, USA.
    to the final speed of the relevant routines, but nothing has been done to     to the final speed of the relevant routines, but nothing has been done to
    check that carefully.     check that carefully.
   
      Remarks:
   
      The code here isn't a vision of loveliness, mainly because it's subject
      to ongoing modifications according to new things wanting to be tuned and
      practical requirements of systems tested.
   
      The way parts of the library are recompiled to insinuate the tuning
      variables is a bit subtle, but unavoidable since of course the main
      library has fixed thresholds compiled-in but we want to vary them here.
      Most of the nonsense for this can be found in tune/Makefile.am and under
      TUNE_PROGRAM_BUILD in gmp-impl.h.
   
      The dirty hack which the "second_start_min" feature could perhaps be done
      more generally, so if say karatsuba is never better than toom3 then it
      can be detected and omitted.  Currently we're hoping very hard that this
      doesn't arise in practice, and if it does then it indicates something
      badly sub-optimal in the karatsuba implementation.
   
    Limitations:     Limitations:
   
    The FFTs aren't subject to the same badness rule as the other thresholds,     The FFTs aren't subject to the same badness rule as the other thresholds,
    so each k is probably being brought on a touch early.  This isn't likely     so each k is probably being brought on a touch early.  This isn't likely
    to make a difference, and the simpler probing means fewer tests.     to make a difference, and the simpler probing means fewer tests.
   
 */  */
   
 #define TUNE_PROGRAM_BUILD  1  #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
   
   #include "config.h"
   
 #include <math.h>  #include <math.h>
 #include <stdio.h>  #include <stdio.h>
 #include <stdlib.h>  #include <stdlib.h>
 #include <time.h>  #include <time.h>
   #if HAVE_UNISTD_H
 #include <unistd.h>  #include <unistd.h>
   #endif
   
 #include "gmp.h"  #include "gmp.h"
 #include "gmp-impl.h"  #include "gmp-impl.h"
   #include "longlong.h"
   
   #include "tests.h"
 #include "speed.h"  #include "speed.h"
 #include "sqr_basecase.h"  
   
 #if !HAVE_DECL_OPTARG  #if !HAVE_DECL_OPTARG
 extern char *optarg;  extern char *optarg;
Line 80  extern int optind, opterr;
Line 102  extern int optind, opterr;
 #endif  #endif
   
   
 #define MAX_SIZE        1000  /* limbs */  #define DEFAULT_MAX_SIZE   1000  /* limbs */
 #define STEP_FACTOR     0.01  /* how much to step sizes by (rounded down) */  #define MAX_TABLE             5  /* entries */
 #define MAX_TABLE       2     /* threshold entries */  
   
   
 #if WANT_FFT  #if WANT_FFT
 mp_size_t  option_fft_max_size = 50000;  /* limbs */  mp_size_t  option_fft_max_size = 50000;  /* limbs */
 #else  #else
Line 103  int  allocdat = 0;
Line 123  int  allocdat = 0;
   
   
 /* Each "_threshold" array must be 1 bigger than the number of thresholds  /* Each "_threshold" array must be 1 bigger than the number of thresholds
    being tuned in a set, because one() stores an value in the entry above     being tuned in a set, because one() stores a value in the entry above
    the one it's determining. */     the one it's determining. */
   
 mp_size_t  mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };  mp_size_t  mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
 mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;  
 mp_size_t  sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };  mp_size_t  sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
 mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;  mp_size_t  sb_preinv_threshold[2] = { MP_SIZE_T_MAX };
 mp_size_t  bz_threshold[2] = { MP_SIZE_T_MAX };  mp_size_t  dc_threshold[2] = { MP_SIZE_T_MAX };
 mp_size_t  fib_threshold[2] = { MP_SIZE_T_MAX };  
 mp_size_t  powm_threshold[2] = { MP_SIZE_T_MAX };  mp_size_t  powm_threshold[2] = { MP_SIZE_T_MAX };
 mp_size_t  gcd_accel_threshold[2] = { MP_SIZE_T_MAX };  mp_size_t  gcd_accel_threshold[2] = { MP_SIZE_T_MAX };
 mp_size_t  gcdext_threshold[2] = { MP_SIZE_T_MAX };  mp_size_t  gcdext_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  divexact_1_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  divrem_1_norm_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  divrem_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  divrem_2_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  mod_1_norm_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  mod_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  modexact_1_odd_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  get_str_basecase_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  get_str_precompute_threshold[2] = { MP_SIZE_T_MAX };
   mp_size_t  set_str_threshold[2] = { MP_SIZE_T_MAX };
   
   mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
   mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
   
 #ifndef KARATSUBA_SQR_MAX  #ifndef TUNE_SQR_KARATSUBA_MAX
 #define KARATSUBA_SQR_MAX  0 /* meaning no limit */  #define TUNE_SQR_KARATSUBA_MAX  0 /* meaning no limit */
 #endif  #endif
   
 struct param_t {  struct param_t {
   const char  *name[MAX_TABLE];    const char        *name[MAX_TABLE];
   int         stop_since_change;    speed_function_t  function;
   mp_size_t   min_size;    speed_function_t  function2;
   mp_size_t   max_size[MAX_TABLE];    double            step_factor;    /* how much to step sizes (rounded down) */
     double            function_fudge; /* multiplier for "function" speeds */
     int               stop_since_change;
     double            stop_factor;
     mp_size_t         min_size[MAX_TABLE];
     int               min_is_always;
     int               second_start_min;
     mp_size_t         max_size[MAX_TABLE];
     mp_size_t         check_size;
     mp_size_t         size_extra;
   
   #define DATA_HIGH_LT_R  1
   #define DATA_HIGH_GE_R  2
     int               data_high;
   
     int               noprint;
 };  };
   
   #ifndef UDIV_PREINV_ALWAYS
   #define UDIV_PREINV_ALWAYS 0
   #endif
   
   
   mp_limb_t
   randlimb_norm (void)
   {
     mp_limb_t  n;
     mpn_random (&n, 1);
     n |= GMP_LIMB_HIGHBIT;
     return n;
   }
   
   #define MP_LIMB_T_HALFMASK  ((CNST_LIMB(1) << (BITS_PER_MP_LIMB/2)) - 1)
   
   mp_limb_t
   randlimb_half (void)
   {
     mp_limb_t  n;
     mpn_random (&n, 1);
     n &= MP_LIMB_T_HALFMASK;
     n += (n==0);
     return n;
   }
   
   
 /* Add an entry to the end of the dat[] array, reallocing to make it bigger  /* Add an entry to the end of the dat[] array, reallocing to make it bigger
    if necessary.  */     if necessary.  */
 void  void
Line 140  add_dat (mp_size_t size, double d)
Line 211  add_dat (mp_size_t size, double d)
   
   if (ndat == allocdat)    if (ndat == allocdat)
     {      {
       dat = (struct dat_t *) _mp_allocate_or_reallocate        dat = (struct dat_t *) __gmp_allocate_or_reallocate
         (dat, allocdat * sizeof(dat[0]),          (dat, allocdat * sizeof(dat[0]),
          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));           (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
       allocdat += ALLOCDAT_STEP;        allocdat += ALLOCDAT_STEP;
Line 191  analyze_dat (int i, int final)
Line 262  analyze_dat (int i, int final)
           min_j = j;            min_j = j;
         }          }
     }      }
   
   return min_j;    return min_j;
 }  }
   
   
   /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
   
   mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
                                       mp_srcptr ap, mp_size_t size,
                                       mp_limb_t d));
   mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
   
 double  double
 tuneup_measure (speed_function_t fun, struct speed_params *s)  speed_mpn_mod_1_tune (struct speed_params *s)
 {  {
   static mp_ptr  xp, yp;    SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
   }
   double
   speed_mpn_divrem_1_tune (struct speed_params *s)
   {
     SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
   }
   
   
   double
   tuneup_measure (speed_function_t fun,
                   const struct param_t *param,
                   struct speed_params *s)
   {
     static struct param_t  dummy;
   double   t;    double   t;
   TMP_DECL (marker);    TMP_DECL (marker);
   
     if (! param)
       param = &dummy;
   
     s->size += param->size_extra;
   
   TMP_MARK (marker);    TMP_MARK (marker);
   s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);    s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
   s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);    s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
Line 210  tuneup_measure (speed_function_t fun, struct speed_par
Line 307  tuneup_measure (speed_function_t fun, struct speed_par
   mpn_random (s->xp, s->size);    mpn_random (s->xp, s->size);
   mpn_random (s->yp, s->size);    mpn_random (s->yp, s->size);
   
     switch (param->data_high) {
     case DATA_HIGH_LT_R:
       s->xp[s->size-1] %= s->r;
       s->yp[s->size-1] %= s->r;
       break;
     case DATA_HIGH_GE_R:
       s->xp[s->size-1] |= s->r;
       s->yp[s->size-1] |= s->r;
       break;
     }
   
   t = speed_measure (fun, s);    t = speed_measure (fun, s);
   
     s->size -= param->size_extra;
   
   TMP_FREE (marker);    TMP_FREE (marker);
   return t;    return t;
 }  }
   
   
   #define PRINT_WIDTH  28
   
 void  void
 print_define (const char *name, mp_size_t value)  print_define_start (const char *name)
 {  {
   printf ("#ifndef %s\n", name);    printf ("#define %-*s  ", PRINT_WIDTH, name);
   printf ("#define %-23s  ", name);    if (option_trace)
       printf ("...\n");
   }
   
   void
   print_define_end_remark (const char *name, mp_size_t value, const char *remark)
   {
     if (option_trace)
       printf ("#define %-*s  ", PRINT_WIDTH, name);
   
   if (value == MP_SIZE_T_MAX)    if (value == MP_SIZE_T_MAX)
     printf ("MP_SIZE_T_MAX\n");      printf ("MP_SIZE_T_MAX");
   else    else
     printf ("%5ld\n", value);      printf ("%5ld", value);
   printf ("#endif\n");  
     if (remark != NULL)
       printf ("  /* %s */", remark);
     printf ("\n");
 }  }
   
   void
   print_define_end (const char *name, mp_size_t value)
   {
     const char  *remark;
     if (value == MP_SIZE_T_MAX)
       remark = "never";
     else if (value == 0)
       remark = "always";
     else
       remark = NULL;
     print_define_end_remark (name, value, remark);
   }
   
   void
   print_define (const char *name, mp_size_t value)
   {
     print_define_start (name);
     print_define_end (name, value);
   }
   
   void
   print_define_remark (const char *name, mp_size_t value, const char *remark)
   {
     print_define_start (name);
     print_define_end_remark (name, value, remark);
   }
   
   
 /* table[i+1] needs to be set to a sensible value when testing method i+1  /* table[i+1] needs to be set to a sensible value when testing method i+1
    because mpn_mul_n uses TOOM3_MUL_THRESHOLD to size the temporary     because mpn_mul_n uses MUL_TOOM3_THRESHOLD to size the temporary
    workspace for mpn_kara_mul_n. */     workspace for mpn_kara_mul_n. */
   
 void  void
 one (speed_function_t function, mp_size_t table[], size_t max_table,  one (mp_size_t table[], size_t max_table, struct param_t *param)
      struct param_t *param)  
 {  {
   static struct param_t  dummy;    mp_size_t  table_save0 = 0;
     int  since_positive, since_thresh_change;
     int  thresh_idx, new_thresh_idx;
   int  i;    int  i;
   
   if (param == NULL)  param = &dummy;    ASSERT_ALWAYS (max_table <= MAX_TABLE);
   
 #define DEFAULT(x,n)  if (param->x == 0)  param->x = (n);  #define DEFAULT(x,n)  if (! (param->x))  param->x = (n);
   
     DEFAULT (function_fudge, 1.0);
     DEFAULT (function2, param->function);
     DEFAULT (step_factor, 0.01);  /* small steps by default */
   DEFAULT (stop_since_change, 80);    DEFAULT (stop_since_change, 80);
   DEFAULT (min_size, 10);    DEFAULT (stop_factor, 1.2);
   for (i = 0; i < numberof (param->max_size); i++)    for (i = 0; i < max_table; i++)
     DEFAULT (max_size[i], MAX_SIZE);      DEFAULT (min_size[i], 10);
     for (i = 0; i < max_table; i++)
       DEFAULT (max_size[i], DEFAULT_MAX_SIZE);
   
   s.size = param->min_size;    if (param->check_size != 0)
       {
         double   t1, t2;
         s.size = param->check_size;
   
   for (i = 0; i < max_table && s.size < MAX_SIZE; i++)        table[0] = s.size+1;
         table[1] = param->max_size[0];
         t1 = tuneup_measure (param->function, param, &s);
   
         table[0] = s.size;
         table[1] = s.size+1;
         t2 = tuneup_measure (param->function2, param, &s);
         if (t1 == -1.0 || t2 == -1.0)
           {
             printf ("Oops, can't run both functions at size %ld\n", s.size);
             abort ();
           }
         t1 *= param->function_fudge;
   
         /* ask that t2 is at least 4% below t1 */
         if (t1 < t2*1.04)
           {
             if (option_trace)
               printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
             table[0] = MP_SIZE_T_MAX;
             if (! param->noprint)
               print_define (param->name[0], table[0]);
             return;
           }
   
         if (option_trace >= 2)
           printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
                   s.size, t1, t2);
       }
   
     for (i = 0, s.size = 1; i < max_table && s.size < param->max_size[i]; i++)
     {      {
       int  since_positive, since_thresh_change;        if (i == 1 && param->second_start_min)
       int  thresh_idx, new_thresh_idx;          s.size = 1;
   
         if (s.size < param->min_size[i])
           s.size = param->min_size[i];
   
         if (! (param->noprint || (i == 1 && param->second_start_min)))
           print_define_start (param->name[i]);
   
       ndat = 0;        ndat = 0;
       since_positive = 0;        since_positive = 0;
       since_thresh_change = 0;        since_thresh_change = 0;
Line 268  one (speed_function_t function, mp_size_t table[], siz
Line 464  one (speed_function_t function, mp_size_t table[], siz
           printf ("              (seconds)    (seconds)    diff    thresh\n");            printf ("              (seconds)    (seconds)    diff    thresh\n");
         }          }
   
       for ( ; s.size < MAX_SIZE;        for (;
             s.size += MAX ((mp_size_t) floor (s.size * STEP_FACTOR), 1))             s.size < param->max_size[i];
              s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
         {          {
           double   ti, tiplus1, d;            double   ti, tiplus1, d;
   
Line 287  one (speed_function_t function, mp_size_t table[], siz
Line 484  one (speed_function_t function, mp_size_t table[], siz
           /*            /*
             FIXME: check minimum size requirements are met, possibly by just              FIXME: check minimum size requirements are met, possibly by just
             checking for the -1 returns from the speed functions.              checking for the -1 returns from the speed functions.
             if (s.size < MPN_TOOM_TABLE_TO_MINSIZE (i))  
             continue;  
           */            */
   
             /* under this hack, don't let method 0 get used at s.size */
             if (i == 1 && param->second_start_min)
               table[0] = MIN (s.size-1, table_save0);
   
           /* using method i at this size */            /* using method i at this size */
           table[i] = s.size+1;            table[i] = s.size+1;
           table[i+1] = MAX_SIZE;            table[i+1] = param->max_size[i];
           ti = tuneup_measure (function, &s);            ti = tuneup_measure (param->function, param, &s);
           if (ti == -1.0)            if (ti == -1.0)
             abort ();              abort ();
             ti *= param->function_fudge;
   
           /* using method i+1 at this size */            /* using method i+1 at this size */
           table[i] = s.size;            table[i] = s.size;
           table[i+1] = s.size+1;            table[i+1] = s.size+1;
           tiplus1 = tuneup_measure (function, &s);            tiplus1 = tuneup_measure (param->function2, param, &s);
           if (tiplus1 == -1.0)            if (tiplus1 == -1.0)
             abort ();              abort ();
   
Line 318  one (speed_function_t function, mp_size_t table[], siz
Line 518  one (speed_function_t function, mp_size_t table[], siz
   
   
           if (option_trace >= 2)            if (option_trace >= 2)
             printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %d\n",              printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
                     i, s.size, ti, tiplus1, d,                      i, s.size, ti, tiplus1, d,
                     ti > tiplus1 ? '#' : ' ',                      ti > tiplus1 ? '#' : ' ',
                     dat[new_thresh_idx].size);                      dat[new_thresh_idx].size);
Line 327  one (speed_function_t function, mp_size_t table[], siz
Line 527  one (speed_function_t function, mp_size_t table[], siz
              certain number of measurements ago.  */               certain number of measurements ago.  */
 #define STOP_SINCE_POSITIVE  200  #define STOP_SINCE_POSITIVE  200
           if (d >= 0)            if (d >= 0)
             since_positive = 0;              since_positive = 0;
           else            else
             if (++since_positive > STOP_SINCE_POSITIVE)              if (++since_positive > STOP_SINCE_POSITIVE)
               {                {
Line 338  one (speed_function_t function, mp_size_t table[], siz
Line 538  one (speed_function_t function, mp_size_t table[], siz
               }                }
   
           /* Stop if method i has become slower by a certain factor. */            /* Stop if method i has become slower by a certain factor. */
 #define STOP_FACTOR   1.2            if (ti >= tiplus1 * param->stop_factor)
           if (ti >= tiplus1 * STOP_FACTOR)  
             {              {
               if (option_trace >= 1)                if (option_trace >= 1)
                 printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n",                  printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n",
                         i, STOP_FACTOR);                          i, param->stop_factor);
               break;                break;
             }              }
   
Line 374  one (speed_function_t function, mp_size_t table[], siz
Line 573  one (speed_function_t function, mp_size_t table[], siz
         }          }
   
       /* Stop when the size limit is reached before the end of the        /* Stop when the size limit is reached before the end of the
          crossover, without a specified param->max_size[i]. */           crossover, but only show this as an error for >= the default max
       if (s.size >= MAX_SIZE)           size.  FIXME: Maybe should make it a param choice whether this is
            an error.  */
         if (s.size >= param->max_size[i]
             && param->max_size[i] >= DEFAULT_MAX_SIZE)
         {          {
           fprintf (stderr, "%s\n", param->name[i]);            fprintf (stderr, "%s\n", param->name[i]);
           fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n",            fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n",
Line 393  one (speed_function_t function, mp_size_t table[], siz
Line 595  one (speed_function_t function, mp_size_t table[], siz
   
       table[i] = dat[analyze_dat (i, 1)].size;        table[i] = dat[analyze_dat (i, 1)].size;
   
       print_define (param->name[i], table[i]);        /* fudge here, let min_is_always apply only to i==0, that's what the
            sqr_n thresholds want */
         if (i == 0 && param->min_is_always && table[i] == param->min_size[i])
           table[i] = 0;
   
       /* Look for the next threshold starting from the current one, but back        /* under the second_start_min fudge, if the second threshold turns out
          a bit. */           to be lower than the first, then the second method is unwanted, we
            should go straight from algorithm 1 to algorithm 3.  */
         if (param->second_start_min)
           {
             if (i == 0)
               {
                 table_save0 = table[0];
                 table[0] = 0;
               }
             else if (i == 1)
               {
                 table[0] = table_save0;
                 if (table[1] <= table[0])
                   {
                     table[0] = table[1];
                     table[1] = 0;
                   }
               }
             s.size = MAX (table[0], table[1]) + 1;
           }
   
         if (! (param->noprint || (i == 0 && param->second_start_min)))
           {
             if (i == 1 && param->second_start_min)
               {
                 print_define_end (param->name[0], table[0]);
                 print_define_start (param->name[1]);
               }
   
             print_define_end (param->name[i], table[i]);
           }
   
         /* Look for the next threshold starting from the current one. */
       s.size = table[i]+1;        s.size = table[i]+1;
     }  
         /* Take a MAX of all to allow for second_start_min producing a 0. */
         {
           int  j;
           for (j = 0; j < i; j++)
             s.size = MAX (s.size, table[j]+1);
         }
       }
 }  }
   
   
Line 426  struct fft_param_t {
Line 670  struct fft_param_t {
   mp_size_t         sqr;    mp_size_t         sqr;
 };  };
   
   
 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with  /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple     N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
    of 2^(k-1) bits. */     of 2^(k-1) bits. */
Line 433  struct fft_param_t {
Line 678  struct fft_param_t {
 mp_size_t  mp_size_t
 fft_step_size (int k)  fft_step_size (int k)
 {  {
   if (2*k-1 > BITS_PER_INT)    mp_size_t  step;
   
     step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB;
     step *= (mp_size_t) 1 << k;
   
     if (step <= 0)
     {      {
       printf ("Can't handle k=%d\n", k);        printf ("Can't handle k=%d\n", k);
       abort ();        abort ();
     }      }
   return (1<<k) * (MAX (1<<(k-1), BITS_PER_MP_LIMB)) / BITS_PER_MP_LIMB;  
     return step;
 }  }
   
 mp_size_t  mp_size_t
Line 469  fft (struct fft_param_t *p)
Line 720  fft (struct fft_param_t *p)
   
   option_trace = MAX (option_trace, option_fft_trace);    option_trace = MAX (option_trace, option_fft_trace);
   
   printf ("#ifndef %s\n", p->table_name);  
   printf ("#define %s  {", p->table_name);    printf ("#define %s  {", p->table_name);
   if (option_trace >= 2)    if (option_trace >= 2)
     printf ("\n");      printf ("\n");
Line 487  fft (struct fft_param_t *p)
Line 737  fft (struct fft_param_t *p)
       if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))        if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))
         break;          break;
   
       usleep(10000);  
   
       /* compare k to k+1 in the middle of the current k+1 step */        /* compare k to k+1 in the middle of the current k+1 step */
       s.size = size + fft_step_size (k+1) / 2;        s.size = size + fft_step_size (k+1) / 2;
       s.r = k;        s.r = k;
       tk = tuneup_measure (p->function, &s);        tk = tuneup_measure (p->function, NULL, &s);
       if (tk == -1.0)        if (tk == -1.0)
         abort ();          abort ();
   
       usleep(10000);  
   
       s.r = k+1;        s.r = k+1;
       tk1 = tuneup_measure (p->function, &s);        tk1 = tuneup_measure (p->function, NULL, &s);
       if (tk1 == -1.0)        if (tk1 == -1.0)
         abort ();          abort ();
   
       if (option_trace >= 2)        if (option_trace >= 2)
         printf ("at %ld   size=%ld  k=%d  %.9lf   k=%d %.9lf\n",          printf ("at %ld   size=%ld  k=%d  %.9f   k=%d %.9f\n",
                 size, s.size, k, tk, k+1, tk1);                  size, s.size, k, tk, k+1, tk1);
   
       /* declare the k+1 threshold as soon as it's faster at its midpoint */        /* declare the k+1 threshold as soon as it's faster at its midpoint */
Line 519  fft (struct fft_param_t *p)
Line 765  fft (struct fft_param_t *p)
   
   mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;    mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;
   printf (" 0 }\n");    printf (" 0 }\n");
   printf ("#endif\n");  
   
   
   size = p->first_size;    size = p->first_size;
   
   /* Declare an FFT faster than a plain toom3 etc multiplication found as    /* Declare an FFT faster than a plain toom3 etc multiplication found as
      soon as one faster measurement obtained.  A multiplication in the       soon as one faster measurement obtained.  A multiplication in the
      middle of the FFT step is tested.  */       middle of the FFT step is tested.  */
Line 548  fft (struct fft_param_t *p)
Line 793  fft (struct fft_param_t *p)
       if (size >= p->max_size)        if (size >= p->max_size)
         break;          break;
   
       usleep(10000);  
   
       s.size = size + fft_step_size (k) / 2;        s.size = size + fft_step_size (k) / 2;
       s.r = k;        s.r = k;
       tk = tuneup_measure (p->function, &s);        tk = tuneup_measure (p->function, NULL, &s);
       if (tk == -1.0)        if (tk == -1.0)
         abort ();          abort ();
   
       usleep(10000);  
   
       if (!modf)  s.size /= 2;        if (!modf)  s.size /= 2;
       tm = tuneup_measure (p->mul_function, &s);        tm = tuneup_measure (p->mul_function, NULL, &s);
       if (tm == -1.0)        if (tm == -1.0)
         abort ();          abort ();
   
       if (option_trace >= 2)        if (option_trace >= 2)
         printf ("at %ld   size=%ld   k=%d  %.9lf   size=%ld %s mul %.9lf\n",          printf ("at %ld   size=%ld   k=%d  %.9f   size=%ld %s mul %.9f\n",
                 size,                  size,
                 size + fft_step_size (k) / 2, k, tk,                  size + fft_step_size (k) / 2, k, tk,
                 s.size, modf ? "modf" : "full", tm);                  s.size, modf ? "modf" : "full", tm);
Line 588  fft (struct fft_param_t *p)
Line 829  fft (struct fft_param_t *p)
 }  }
   
   
   
   /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
      giving wrong results.  */
 void  void
 all (void)  tune_mul (void)
 {  {
   TMP_DECL (marker);    static struct param_t  param;
     param.name[0] = "MUL_KARATSUBA_THRESHOLD";
     param.name[1] = "MUL_TOOM3_THRESHOLD";
     param.function = speed_mpn_mul_n;
     param.min_size[0] = MAX (4, MPN_KARA_MUL_N_MINSIZE);
     param.max_size[0] = MUL_TOOM3_THRESHOLD_LIMIT-1;
     param.max_size[1] = MUL_TOOM3_THRESHOLD_LIMIT-1;
     one (mul_threshold, 2, &param);
   
   TMP_MARK (marker);    /* disabled until tuned */
   s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);    MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
   s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);  }
   
   speed_time_init ();  
   fprintf (stderr, "speed_precision %d, speed_unittime %.2e\n",  
            speed_precision, speed_unittime);  
   fprintf (stderr, "MAX_SIZE %ld, fft_max_size %ld, STEP_FACTOR %.3f\n",  
            MAX_SIZE, option_fft_max_size, STEP_FACTOR);  
   fprintf (stderr, "\n");  
   
   {  /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
     struct tm  *tp;     is faster only at size==2 then we don't want to bother with extra code
     time_t     t;     just for that.  Start karatsuba from 4 same as MUL above.  */
     time (&t);  void
     tp = localtime (&t);  tune_sqr (void)
     printf ("/* Generated by tuneup.c, %d-%02d-%02d. */\n\n",  {
             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);    static struct param_t  param;
     param.name[0] = "SQR_BASECASE_THRESHOLD";
     param.name[1] = "SQR_KARATSUBA_THRESHOLD";
     param.name[2] = "SQR_TOOM3_THRESHOLD";
     param.function = speed_mpn_sqr_n;
     param.min_is_always = 1;
     param.second_start_min = 1;
     param.min_size[0] = 3;
     param.min_size[1] = MAX (4, MPN_KARA_SQR_N_MINSIZE);
     param.min_size[2] = MPN_TOOM3_SQR_N_MINSIZE;
     param.max_size[0] = TUNE_SQR_KARATSUBA_MAX;
     param.max_size[1] = TUNE_SQR_KARATSUBA_MAX;
     one (sqr_threshold, 3, &param);
   
     /* disabled until tuned */
     SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
   }
   
   
   void
   tune_sb_preinv (void)
   {
     static struct param_t  param;
   
     if (UDIV_PREINV_ALWAYS)
       {
         print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always");
         return;
       }
   
     param.check_size = 256;
     param.min_size[0] = 3;
     param.min_is_always = 1;
     param.size_extra = 3;
     param.stop_factor = 2.0;
     param.name[0] = "DIV_SB_PREINV_THRESHOLD";
     param.function = speed_mpn_sb_divrem_m3;
     one (sb_preinv_threshold, 1, &param);
   }
   
   
   void
   tune_dc (void)
   {
     static struct param_t  param;
     param.name[0] = "DIV_DC_THRESHOLD";
     param.function = speed_mpn_dc_tdiv_qr;
     one (dc_threshold, 1, &param);
   }
   
   
   /* This is an indirect determination, based on a comparison between redc and
      mpz_mod.  A fudge factor of 1.04 is applied to redc, to represent
      additional overheads it gets in mpz_powm.
   
      stop_factor is 1.1 to hopefully help cray vector systems, where otherwise
      currently it hits the 1000 limb limit with only a factor of about 1.18
      (threshold should be around 650).  */
   
   void
   tune_powm (void)
   {
     static struct param_t  param;
     param.name[0] = "POWM_THRESHOLD";
     param.function = speed_redc;
     param.function2 = speed_mpz_mod;
     param.step_factor = 0.03;
     param.stop_factor = 1.1;
     param.function_fudge = 1.04;
     one (powm_threshold, 1, &param);
   }
   
   
   void
   tune_gcd_accel (void)
   {
     static struct param_t  param;
     param.name[0] = "GCD_ACCEL_THRESHOLD";
     param.function = speed_mpn_gcd;
     param.min_size[0] = 1;
     one (gcd_accel_threshold, 1, &param);
   }
   
   
   
   /* A comparison between the speed of a single limb step and a double limb
      step is made.  On a 32-bit limb the ratio is about 2.2 single steps to
      equal a double step, or on a 64-bit limb about 2.09.  (These were found
      from counting the steps on a 10000 limb gcdext.  */
   void
   tune_gcdext (void)
   {
     static struct param_t  param;
     param.name[0] = "GCDEXT_THRESHOLD";
     param.function = speed_mpn_gcdext_one_single;
     param.function2 = speed_mpn_gcdext_one_double;
     switch (BITS_PER_MP_LIMB) {
     case 32: param.function_fudge = 2.2; break;
     case 64: param.function_fudge = 2.09; break;
     default:
       printf ("Don't know GCDEXT_THERSHOLD factor for BITS_PER_MP_LIMB == %d\n",
               BITS_PER_MP_LIMB);
       abort ();
   }    }
     param.min_size[0] = 5;
     param.min_is_always = 1;
     param.max_size[0] = 300;
     param.check_size = 300;
     one (gcdext_threshold, 1, &param);
   }
   
   
   /* size_extra==1 reflects the fact that with high<divisor one division is
      always skipped.  Forcing high<divisor while testing ensures consistency
      while stepping through sizes, ie. that size-1 divides will be done each
      time.
   
      min_size==2 and min_is_always are used so that if plain division is only
      better at size==1 then don't bother including that code just for that
      case, instead go with preinv always and get a size saving.  */
   
   #define DIV_1_PARAMS                    \
     param.check_size = 256;               \
     param.min_size[0] = 2;                   \
     param.min_is_always = 1;              \
     param.data_high = DATA_HIGH_LT_R;     \
     param.size_extra = 1;                 \
     param.stop_factor = 2.0;
   
   
   double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
   
   void
   tune_divrem_1 (void)
   {
     /* plain version by default */
     tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
   
   #ifndef HAVE_NATIVE_mpn_divrem_1
   #define HAVE_NATIVE_mpn_divrem_1 0
   #endif
   
     /* No support for tuning native assembler code, do that by hand and put
        the results in the .asm file, there's no need for such thresholds to
        appear in gmp-mparam.h.  */
     if (HAVE_NATIVE_mpn_divrem_1)
       return;
   
     if (UDIV_PREINV_ALWAYS)
       {
         print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
         print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
         return;
       }
   
     tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
   
     /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
        a bit out for the fractional part, but that's too bad, the integer part
        is more important. */
   {    {
     static struct param_t  param;      static struct param_t  param;
     param.name[0] = "KARATSUBA_MUL_THRESHOLD";      param.name[0] = "DIVREM_1_NORM_THRESHOLD";
     param.name[1] = "TOOM3_MUL_THRESHOLD";      DIV_1_PARAMS;
     param.max_size[1] = TOOM3_MUL_THRESHOLD_LIMIT;      s.r = randlimb_norm ();
     one (speed_mpn_mul_n, mul_threshold, numberof(mul_threshold)-1, &param);      param.function = speed_mpn_divrem_1_tune;
       one (divrem_1_norm_threshold, 1, &param);
   }    }
   printf("\n");  
   
   {    {
     static struct param_t  param;      static struct param_t  param;
     param.name[0] = "KARATSUBA_SQR_THRESHOLD";      param.name[0] = "DIVREM_1_UNNORM_THRESHOLD";
     param.name[1] = "TOOM3_SQR_THRESHOLD";      DIV_1_PARAMS;
     param.max_size[0] = KARATSUBA_SQR_MAX;      s.r = randlimb_half ();
     one (speed_mpn_sqr_n, sqr_threshold, numberof(sqr_threshold)-1, &param);      param.function = speed_mpn_divrem_1_tune;
       one (divrem_1_unnorm_threshold, 1, &param);
   }    }
   printf("\n");  }
   
   
   double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
   
   void
   tune_mod_1 (void)
   {
     /* plain version by default */
     tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
   
   #ifndef HAVE_NATIVE_mpn_mod_1
   #define HAVE_NATIVE_mpn_mod_1 0
   #endif
   
     /* No support for tuning native assembler code, do that by hand and put
        the results in the .asm file, there's no need for such thresholds to
        appear in gmp-mparam.h.  */
     if (HAVE_NATIVE_mpn_mod_1)
       return;
   
     if (UDIV_PREINV_ALWAYS)
       {
         print_define ("MOD_1_NORM_THRESHOLD", 0L);
         print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
         return;
       }
   
     tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
   
   {    {
     static struct param_t  param;      static struct param_t  param;
     param.name[0] = "BZ_THRESHOLD";      param.name[0] = "MOD_1_NORM_THRESHOLD";
     one (speed_mpn_bz_tdiv_qr, bz_threshold, 1, &param);      DIV_1_PARAMS;
       s.r = randlimb_norm ();
       param.function = speed_mpn_mod_1_tune;
       one (mod_1_norm_threshold, 1, &param);
   }    }
   printf("\n");  
   
   {    {
     static struct param_t  param;      static struct param_t  param;
     param.name[0] = "FIB_THRESHOLD";      param.name[0] = "MOD_1_UNNORM_THRESHOLD";
     one (speed_mpz_fib_ui, fib_threshold, 1, &param);      DIV_1_PARAMS;
       s.r = randlimb_half ();
       param.function = speed_mpn_mod_1_tune;
       one (mod_1_unnorm_threshold, 1, &param);
   }    }
   printf("\n");  }
   
   /* mpz_powm becomes slow before long, so stop soon after the determined  
      threshold stops changing. */  /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
      imply that udiv_qrnnd_preinv is worth using, but it seems most
      straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
      directly.  */
   
   void
   tune_preinv_divrem_1 (void)
   {
     static struct param_t  param;
     speed_function_t  divrem_1;
     const char        *divrem_1_name;
     double            t1, t2;
   
   #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
   #define HAVE_NATIVE_mpn_preinv_divrem_1 0
   #endif
   
     /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
        it's faster than mpn_divrem_1.  */
     if (HAVE_NATIVE_mpn_preinv_divrem_1)
       {
         print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
         return;
       }
   
     /* If udiv_qrnnd_preinv is the only division method then of course
        mpn_preinv_divrem_1 should be used.  */
     if (UDIV_PREINV_ALWAYS)
       {
         print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
         return;
       }
   
     /* If we've got an assembler version of mpn_divrem_1, then compare against
        that, not the mpn_divrem_1_div generic C.  */
     if (HAVE_NATIVE_mpn_divrem_1)
       {
         divrem_1 = speed_mpn_divrem_1;
         divrem_1_name = "mpn_divrem_1";
       }
     else
       {
         divrem_1 = speed_mpn_divrem_1_div;
         divrem_1_name = "mpn_divrem_1_div";
       }
   
     param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
     s.size = 200;                     /* generous but not too big */
     /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
        since in general that's probably most common, though in fact for a
        64-bit limb mp_bases[10].big_base is normalized.  */
     s.r = urandom() & (MP_LIMB_T_MAX >> 4);
     if (s.r == 0) s.r = 123;
   
     t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
     t2 = tuneup_measure (divrem_1, &param, &s);
     if (t1 == -1.0 || t2 == -1.0)
       {
         printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
                 divrem_1_name, s.size);
         abort ();
       }
     if (option_trace >= 1)
       printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
               s.size, t1, divrem_1_name, t2);
   
     print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
   }
   
   
   /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
      that udiv_qrnnd_preinv is worth using, but it seems most straightforward
      to compare mpn_preinv_mod_1 and mpn_mod_1_div directly.  */
   
   void
   tune_preinv_mod_1 (void)
   {
     static struct param_t  param;
     speed_function_t  mod_1;
     const char        *mod_1_name;
     double            t1, t2;
   
   #ifndef HAVE_NATIVE_mpn_preinv_mod_1
   #define HAVE_NATIVE_mpn_preinv_mod_1 0
   #endif
   
     /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
        faster than mpn_mod_1.  */
     if (HAVE_NATIVE_mpn_preinv_mod_1)
       {
         print_define_remark ("USE_PREINV_MOD_1", 1, "native");
         return;
       }
   
     /* If udiv_qrnnd_preinv is the only division method then of course
        mpn_preinv_mod_1 should be used.  */
     if (UDIV_PREINV_ALWAYS)
       {
         print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
         return;
       }
   
     /* If we've got an assembler version of mpn_mod_1, then compare against
        that, not the mpn_mod_1_div generic C.  */
     if (HAVE_NATIVE_mpn_mod_1)
       {
         mod_1 = speed_mpn_mod_1;
         mod_1_name = "mpn_mod_1";
       }
     else
       {
         mod_1 = speed_mpn_mod_1_div;
         mod_1_name = "mpn_mod_1_div";
       }
   
     param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
     s.size = 200;                     /* generous but not too big */
     s.r = randlimb_norm();            /* divisor */
   
     t1 = tuneup_measure (speed_mpn_preinv_mod_1, &param, &s);
     t2 = tuneup_measure (mod_1, &param, &s);
     if (t1 == -1.0 || t2 == -1.0)
       {
         printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
                 mod_1_name, s.size);
         abort ();
       }
     if (option_trace >= 1)
       printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
               s.size, t1, mod_1_name, t2);
   
     print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
   }
   
   
   void
   tune_divrem_2 (void)
   {
     static struct param_t  param;
   
   #ifndef HAVE_NATIVE_mpn_divrem_2
   #define HAVE_NATIVE_mpn_divrem_2 0
   #endif
   
     /* No support for tuning native assembler code, do that by hand and put
        the results in the .asm file, and there's no need for such thresholds
        to appear in gmp-mparam.h.  */
     if (HAVE_NATIVE_mpn_divrem_2)
       return;
   
     if (UDIV_PREINV_ALWAYS)
       {
         print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
         return;
       }
   
     /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
        a bit out for the fractional part, but that's too bad, the integer part
        is more important.
   
        min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
        code space if plain division is better only at size==2 or size==3. */
     param.name[0] = "DIVREM_2_THRESHOLD";
     param.check_size = 256;
     param.min_size[0] = 4;
     param.min_is_always = 1;
     param.size_extra = 2;      /* does qsize==nsize-2 divisions */
     param.stop_factor = 2.0;
   
     s.r = randlimb_norm ();
     param.function = speed_mpn_divrem_2;
     one (divrem_2_threshold, 1, &param);
   }
   
   
   /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
      tune for that.  Its speed can differ on odd or even divisor, so take an
      average threshold for the two.
   
      mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
      might not vary that way, but don't test this since high<divisor isn't
      expected to occur often with small divisors.  */
   
   void
   tune_divexact_1 (void)
   {
     static struct param_t  param;
     mp_size_t  thresh[2], average;
     int        low, i;
   
     ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
   
     param.name[0] = "DIVEXACT_1_THRESHOLD";
     param.data_high = DATA_HIGH_GE_R;
     param.check_size = 256;
     param.min_size[0] = 2;
     param.stop_factor = 1.5;
     param.function  = tuned_speed_mpn_divrem_1;
     param.function2 = speed_mpn_divexact_1;
     param.noprint = 1;
   
     print_define_start (param.name[0]);
   
     for (low = 0; low <= 1; low++)
       {
         s.r = randlimb_half();
         if (low == 0)
           s.r |= 1;
         else
           s.r &= ~CNST_LIMB(7);
   
         one (divexact_1_threshold, 1, &param);
         if (option_trace)
           printf ("low=%d thresh %ld\n", low, divexact_1_threshold[0]);
   
         if (divexact_1_threshold[0] == MP_SIZE_T_MAX)
           {
             average = MP_SIZE_T_MAX;
             goto divexact_1_done;
           }
   
         thresh[low] = divexact_1_threshold[0];
       }
   
     if (option_trace)
       {
         printf ("average of:");
         for (i = 0; i < numberof(thresh); i++)
           printf (" %ld", thresh[i]);
         printf ("\n");
       }
   
     average = 0;
     for (i = 0; i < numberof(thresh); i++)
       average += thresh[i];
     average /= numberof(thresh);
   
     /* If divexact turns out to be better as early as 3 limbs, then use it
        always, so as to reduce code size and conditional jumps.  */
     if (average <= 3)
       average = 0;
   
    divexact_1_done:
     print_define_end (param.name[0], average);
   }
   
   
   /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
      same as mpn_mod_1, but this might not be true of an assembler
      implementation.  The threshold used is an average based on data where a
      divide can be skipped and where it can't.
   
      If modexact turns out to be better as early as 3 limbs, then use it
      always, so as to reduce code size and conditional jumps.  */
   
   void
   tune_modexact_1_odd (void)
   {
     static struct param_t  param;
     mp_size_t  thresh_lt;
   
     ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
   
     param.name[0] = "MODEXACT_1_ODD_THRESHOLD";
     param.check_size = 256;
     param.min_size[0] = 2;
     param.stop_factor = 1.5;
     param.function  = tuned_speed_mpn_mod_1;
     param.function2 = speed_mpn_modexact_1c_odd;
     param.noprint = 1;
     s.r = randlimb_half () | 1;
   
     print_define_start (param.name[0]);
   
     param.data_high = DATA_HIGH_LT_R;
     one (modexact_1_odd_threshold, 1, &param);
     if (option_trace)
       printf ("lt thresh %ld\n", modexact_1_odd_threshold[0]);
   
     thresh_lt = modexact_1_odd_threshold[0];
     if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
       {
         param.data_high = DATA_HIGH_GE_R;
         one (modexact_1_odd_threshold, 1, &param);
         if (option_trace)
           printf ("ge thresh %ld\n", modexact_1_odd_threshold[0]);
   
         if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
           {
             modexact_1_odd_threshold[0]
               = (modexact_1_odd_threshold[0] + thresh_lt) / 2;
             if (modexact_1_odd_threshold[0] <= 3)
               modexact_1_odd_threshold[0] = 0;
           }
       }
   
     print_define_end (param.name[0], modexact_1_odd_threshold[0]);
   }
   
   
   void
   tune_jacobi_base (void)
   {
     static struct param_t  param;
     double   t1, t2, t3;
     int      method;
   
     s.size = BITS_PER_MP_LIMB * 3 / 4;
   
     t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
     if (option_trace >= 1)
       printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", s.size, t1);
   
     t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
     if (option_trace >= 1)
       printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", s.size, t2);
   
     t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
     if (option_trace >= 1)
       printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", s.size, t3);
   
     if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
       {
         printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
                 s.size);
         abort ();
       }
   
     if (t1 < t2 && t1 < t3)
       method = 1;
     else if (t2 < t3)
       method = 2;
     else
       method = 3;
   
     print_define ("JACOBI_BASE_METHOD", method);
   }
   
   
   void
   tune_get_str (void)
   {
     /* Tune for decimal, it being most common.  Some rough testing suggests
        other bases are different, but not by very much.  */
     s.r = 10;
   {    {
     static struct param_t  param;      static struct param_t  param;
     param.name[0] = "POWM_THRESHOLD";      get_str_precompute_threshold[0] = 0;
     param.stop_since_change = 15;      param.name[0] = "GET_STR_DC_THRESHOLD";
     one (speed_mpz_powm, powm_threshold, 1, &param);      param.function = speed_mpn_get_str;
       param.min_size[0] = 2;
       param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
       one (get_str_basecase_threshold, 1, &param);
   }    }
   printf("\n");  
   
   {    {
     static struct param_t  param;      static struct param_t  param;
     param.name[0] = "GCD_ACCEL_THRESHOLD";      param.name[0] = "GET_STR_PRECOMPUTE_THRESHOLD";
     param.min_size = 1;      param.function = speed_mpn_get_str;
     one (speed_mpn_gcd, gcd_accel_threshold, 1, &param);      param.min_size[0] = get_str_basecase_threshold[0];
       param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
       one (get_str_precompute_threshold, 1, &param);
   }    }
   }
   
   
   void
   tune_set_str (void)
   {
     static struct param_t  param;
   
     s.r = 10;  /* decimal */
     param.step_factor = 0.04;
     param.name[0] = "SET_STR_THRESHOLD";
     param.function = speed_mpn_set_str_basecase;
     param.function2 = speed_mpn_set_str_subquad;
     param.min_size[0] = 100;
     param.max_size[0] = 150000;
     one (set_str_threshold, 1, &param);
   }
   
   
   void
   tune_fft_mul (void)
   {
     static struct fft_param_t  param;
   
     if (option_fft_max_size == 0)
       return;
   
     param.table_name          = "MUL_FFT_TABLE";
     param.threshold_name      = "MUL_FFT_THRESHOLD";
     param.p_threshold         = &MUL_FFT_THRESHOLD;
     param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
     param.p_modf_threshold    = &MUL_FFT_MODF_THRESHOLD;
     param.first_size          = MUL_TOOM3_THRESHOLD / 2;
     param.max_size            = option_fft_max_size;
     param.function            = speed_mpn_mul_fft;
     param.mul_function        = speed_mpn_mul_n;
     param.sqr = 0;
     fft (&param);
   }
   
   
   void
   tune_fft_sqr (void)
   {
     static struct fft_param_t  param;
   
     if (option_fft_max_size == 0)
       return;
   
     param.table_name          = "SQR_FFT_TABLE";
     param.threshold_name      = "SQR_FFT_THRESHOLD";
     param.p_threshold         = &SQR_FFT_THRESHOLD;
     param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
     param.p_modf_threshold    = &SQR_FFT_MODF_THRESHOLD;
     param.first_size          = SQR_TOOM3_THRESHOLD / 2;
     param.max_size            = option_fft_max_size;
     param.function            = speed_mpn_mul_fft_sqr;
     param.mul_function        = speed_mpn_sqr_n;
     param.sqr = 0;
     fft (&param);
   }
   
   
   void
   all (void)
   {
     time_t  start_time, end_time;
     TMP_DECL (marker);
   
     TMP_MARK (marker);
     s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
     s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
   
     mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
     mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
   
     fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
   
     speed_time_init ();
     fprintf (stderr, "Using: %s\n", speed_time_string);
   
     fprintf (stderr, "speed_precision %d", speed_precision);
     if (speed_unittime == 1.0)
       fprintf (stderr, ", speed_unittime 1 cycle");
     else
       fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
     if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
       fprintf (stderr, ", CPU freq unknown\n");
     else
       fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
   
     fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
              DEFAULT_MAX_SIZE, option_fft_max_size);
     fprintf (stderr, "\n");
   
     time (&start_time);
   {    {
     static struct param_t  param;      struct tm  *tp;
     param.name[0] = "GCDEXT_THRESHOLD";      tp = localtime (&start_time);
     param.min_size = 1;      printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
     param.max_size[0] = 200;              tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
     one (speed_mpn_gcdext, gcdext_threshold, 1, &param);  
   #ifdef __GNUC__
       /* gcc sub-minor version doesn't seem to come through as a define */
       printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
   #define PRINTED_COMPILER
   #endif
   #if defined (__SUNPRO_C)
       printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
   #define PRINTED_COMPILER
   #endif
   #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
       /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
       printf ("MIPSpro C %d.%d.%d */\n",
               _COMPILER_VERSION / 100,
               _COMPILER_VERSION / 10 % 10,
               _COMPILER_VERSION % 10);
   #define PRINTED_COMPILER
   #endif
   #if defined (__DECC) && defined (__DECC_VER)
       printf ("DEC C %d */\n", __DECC_VER);
   #define PRINTED_COMPILER
   #endif
   #if ! defined (PRINTED_COMPILER)
       printf ("system compiler */\n");
   #endif
   }    }
     printf ("\n");
   
     tune_mul ();
   printf("\n");    printf("\n");
   
   if (option_fft_max_size != 0)    tune_sqr ();
     {    printf("\n");
       {  
         static struct fft_param_t  param;  
         param.table_name          = "FFT_MUL_TABLE";  
         param.threshold_name      = "FFT_MUL_THRESHOLD";  
         param.p_threshold         = &FFT_MUL_THRESHOLD;  
         param.modf_threshold_name = "FFT_MODF_MUL_THRESHOLD";  
         param.p_modf_threshold    = &FFT_MODF_MUL_THRESHOLD;  
         param.first_size          = TOOM3_MUL_THRESHOLD / 2;  
         param.max_size            = option_fft_max_size;  
         param.function            = speed_mpn_mul_fft;  
         param.mul_function        = speed_mpn_mul_n;  
         param.sqr = 0;  
         fft (&param);  
       }  
       printf("\n");  
       {  
         static struct fft_param_t  param;  
         param.table_name          = "FFT_SQR_TABLE";  
         param.threshold_name      = "FFT_SQR_THRESHOLD";  
         param.p_threshold         = &FFT_SQR_THRESHOLD;  
         param.modf_threshold_name = "FFT_MODF_SQR_THRESHOLD";  
         param.p_modf_threshold    = &FFT_MODF_SQR_THRESHOLD;  
         param.first_size          = TOOM3_SQR_THRESHOLD / 2;  
         param.max_size            = option_fft_max_size;  
         param.function            = speed_mpn_mul_fft_sqr;  
         param.mul_function        = speed_mpn_sqr_n;  
         param.sqr = 0;  
         fft (&param);  
       }  
       printf ("\n");  
     }  
   
     tune_sb_preinv ();
     tune_dc ();
     tune_powm ();
     printf("\n");
   
     tune_gcd_accel ();
     tune_gcdext ();
     tune_jacobi_base ();
     printf("\n");
   
     tune_divrem_1 ();
     tune_mod_1 ();
     tune_preinv_divrem_1 ();
     tune_preinv_mod_1 ();
     tune_divrem_2 ();
     tune_divexact_1 ();
     tune_modexact_1_odd ();
     printf("\n");
   
     tune_get_str ();
     tune_set_str ();
     printf("\n");
   
     tune_fft_mul ();
     printf("\n");
   
     tune_fft_sqr ();
     printf ("\n");
   
     time (&end_time);
     printf ("/* Tuneup completed successfully, took %ld seconds */\n",
             end_time - start_time);
   
   TMP_FREE (marker);    TMP_FREE (marker);
 }  }
   
Line 740  main (int argc, char *argv[])
Line 1646  main (int argc, char *argv[])
         exit(1);          exit(1);
       }        }
     }      }
   
   all ();    all ();
   return 0;    exit (0);
 }  }

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.1.1.2

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>