=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/x86/p6/mmx/Attic/divrem_1.asm,v retrieving revision 1.1.1.1 retrieving revision 1.1.1.2 diff -u -p -r1.1.1.1 -r1.1.1.2 --- OpenXM_contrib/gmp/mpn/x86/p6/mmx/Attic/divrem_1.asm 2000/09/09 14:12:44 1.1.1.1 +++ OpenXM_contrib/gmp/mpn/x86/p6/mmx/Attic/divrem_1.asm 2003/08/25 16:06:29 1.1.1.2 @@ -1,9 +1,6 @@ dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. -dnl -dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. - -dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl @@ -22,19 +19,25 @@ dnl License along with the GNU MP Library; see the fi dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - dnl Suite 330, Boston, MA 02111-1307, USA. - include(`../config.m4') +C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. + + C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor); C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); C C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, -C see that file for some comments. It's likely what's here can be improved. +C see that file for some comments. It's possible what's here can be improved. dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by @@ -52,7 +55,9 @@ dnl mul is much faster. deflit(MUL_THRESHOLD, 4) -defframe(PARAM_CARRY, 24) +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c defframe(PARAM_DIVISOR,20) defframe(PARAM_SIZE, 16) defframe(PARAM_SRC, 12) @@ -72,9 +77,66 @@ defframe(VAR_DST_STOP,-36) deflit(STACK_SPACE, 36) - .text + TEXT ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + movl PARAM_XSIZE, %ecx + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edx + + movl -4(%esi,%ebx,4), %eax C src high limb + xorl %edi, %edi C initial carry (if can't skip a div) + + C + + leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] + xor %ecx, %ecx + + movl %edx, VAR_DST_STOP C &dst[xsize+2] + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edi) C high is carry if high=MUL_THRESHOLD, so with size==0 then - C must have xsize!=0 + je L(zero_done) jmp L(fraction_some) @@ -353,7 +435,7 @@ L(integer_top): addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag movl %ebp, %eax C d - leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + leal 1(%edi), %ebx C n2+1 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 jz L(q1_ff) @@ -407,7 +489,6 @@ L(integer_two_left): C edi n2 C ebp divisor C - C mm0 src limb, shifted C mm7 rshift @@ -434,7 +515,7 @@ L(integer_two_left): C addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag - leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + leal 1(%edi), %ebx C n2+1 movl %ebp, %eax C d adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 @@ -475,7 +556,6 @@ L(integer_one_left): C edi n2 C ebp divisor C - C mm0 src limb, shifted C mm7 rshift @@ -500,7 +580,7 @@ L(integer_one_left): C addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag - leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + leal 1(%edi), %ebx C n2+1 movl %ebp, %eax C d C @@ -541,6 +621,7 @@ L(integer_one_left): movl %edi, %eax L(fraction_done): movl VAR_NORM, %ecx +L(zero_done): movl SAVE_EBP, %ebp movl SAVE_EDI, %edi @@ -617,10 +698,10 @@ L(fraction_some): C ebp divisor movl PARAM_DST, %esi - movl VAR_DST_STOP, %ecx + movl VAR_DST_STOP, %ecx C &dst[xsize+2] movl %edi, %eax - subl $8, %ecx + subl $8, %ecx C &dst[xsize] ALIGN(16)