=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/x86/k7/mmx/Attic/divrem_1.asm,v retrieving revision 1.1.1.1 retrieving revision 1.1.1.2 diff -u -p -r1.1.1.1 -r1.1.1.2 --- OpenXM_contrib/gmp/mpn/x86/k7/mmx/Attic/divrem_1.asm 2000/09/09 14:12:42 1.1.1.1 +++ OpenXM_contrib/gmp/mpn/x86/k7/mmx/Attic/divrem_1.asm 2003/08/25 16:06:29 1.1.1.2 @@ -1,37 +1,41 @@ -dnl AMD K7 mpn_divrem_1 -- mpn by limb division. -dnl -dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. +dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb +dnl division. - -dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. -dnl +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. +dnl dnl This file is part of the GNU MP Library. -dnl +dnl dnl The GNU MP Library is free software; you can redistribute it and/or dnl modify it under the terms of the GNU Lesser General Public License as dnl published by the Free Software Foundation; either version 2.1 of the dnl License, or (at your option) any later version. -dnl +dnl dnl The GNU MP Library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. -dnl +dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with the GNU MP Library; see the file COPYING.LIB. If dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - dnl Suite 330, Boston, MA 02111-1307, USA. - include(`../config.m4') +C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. + + C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor); C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); C C The method and nomenclature follow part 8 of "Division by Invariant C Integers using Multiplication" by Granlund and Montgomery, reference in @@ -42,12 +46,25 @@ C for m', and "d" for d_norm, which won't cause any co C only the normalized divisor that's of any use in the code. "b" is written C for 2^N, the size of a limb, N being 32 here. C -C mpn_divrem_1 avoids one division if the src high limb is less than the -C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal -C circumstances that will be a very rare event. +C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high +C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero +C carry, since in normal circumstances that will be a very rare event. C +C The test for skipping a division is branch free (once size>=1 is tested). +C The store to the destination high limb is 0 when a divide is skipped, or +C if it's not skipped then a copy of the src high limb is used. The latter +C is in case src==dst. +C C There's a small bias towards expecting xsize==0, by having code for C xsize==0 in a straight line and xsize!=0 under forward jumps. +C +C Alternatives: +C +C If the divisor is normalized (high bit set) then a division step can +C always be skipped, since the high destination limb is always 0 or 1 in +C that case. It doesn't seem worth checking for this though, since it +C probably occurs infrequently, in particular note that big_base for a +C decimal mpn_get_str is not normalized in a 32-bit limb. dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by @@ -62,7 +79,9 @@ dnl even more so on the fractional part. deflit(MUL_THRESHOLD, 3) -defframe(PARAM_CARRY, 24) +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c defframe(PARAM_DIVISOR,20) defframe(PARAM_SIZE, 16) defframe(PARAM_SRC, 12) @@ -82,9 +101,64 @@ defframe(VAR_DST_STOP,-36) deflit(STACK_SPACE, 36) - .text + TEXT ALIGN(32) +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + movl PARAM_XSIZE, %ecx + movl PARAM_DST, %edx + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edx, VAR_DST_STOP C &dst[xsize+2] + movl %edi, SAVE_EDI + xorl %edi, %edi C carry + + movl -4(%esi,%ebx,4), %eax C src high limb + xor %ecx, %ecx + + C + + C + + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edi) C high is carry if high=MUL_THRESHOLD, so with size==0 then - C must have xsize!=0 + je L(zero_done) jmp L(fraction_some) @@ -383,7 +474,7 @@ L(integer_top): C addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag - leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + leal 1(%edi), %ebx C n2+1 movl %ebp, %eax C d C @@ -438,7 +529,6 @@ L(integer_two_left): C edi n2 C ebp divisor C - C mm0 src limb, shifted C mm7 rshift cmpl $0x80000000, %esi C n1 as 0=c, 1=nc @@ -458,7 +548,7 @@ L(integer_two_left): C addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag - leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + leal 1(%edi), %ebx C n2+1 movl %ebp, %eax C d adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 @@ -499,7 +589,6 @@ L(integer_one_left): C edi n2 C ebp divisor C - C mm0 src limb, shifted C mm7 rshift movl VAR_DST_STOP, %ecx @@ -519,7 +608,7 @@ L(integer_one_left): C addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag - leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + leal 1(%edi), %ebx C n2+1 movl %ebp, %eax C d C @@ -559,6 +648,7 @@ L(integer_none): movl %edi, %eax L(fraction_done): movl VAR_NORM, %ecx +L(zero_done): movl SAVE_EBP, %ebp movl SAVE_EDI, %edi @@ -652,11 +742,10 @@ L(fraction_some): C ebp divisor movl PARAM_DST, %esi - movl VAR_DST_STOP, %ecx + movl VAR_DST_STOP, %ecx C &dst[xsize+2] movl %edi, %eax - subl $8, %ecx - + subl $8, %ecx C &dst[xsize] jmp L(fraction_entry)