Multiplying big numbers using Karatsuba's method
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
11
down vote
favorite
The Karatsuba algorithm, first published in 1962, aims to speed up the multiplication of big numbers by reducing the number of 'single-digit-multiplications' involved.
Because of its complexity (overhead) this method is not particularly fast on the 'smaller' numbers. There's a threshold to be observed. Computing MAX_UINT2, I found this threshold to be at some 5000 decimal digits (in the product) before Karatsuba's becomes faster than my version of the classical long multiplication.
The multi-precision mpMul procedure that I present below requires 5 parameters passed on the stack.
- The 1st param specifies the precision (length in bits) of both the inputs which must be a power of 2, so one of $lbrace 2^5, 2^6, ldots, 2^30, 2^31rbrace$.
- The 2nd param points to the 1st input aka multiplicand.
- The 3rd param points to the 2nd input aka multiplier.
- The 4th param points to where the double length result needs to go.
- The 5th param points to a scratch buffer for storing intermediate results.
The address of the double length product is returned in the EAX
register and the carry flag will be set if the result exceeds the precision.
In an effort to make this procedure efficient I applied the following:
Considering
EAX
,EBX
,ECX
, andEDX
volatile registers reduced the recursion overhead a lot. This was especially important on the lowest level (the one with themul
instruction) that after all represents two thirds of all thecall
's that are made.I refrained from using
EBP
as a stackframe pointer, instead I usedESP
relative addressing. I think the extra byte for the 'sib-encoding' was well spent since an additional register to play with makes programming a bit easier.Because the recursive subroutine does not remove its parameters when it returns, I could simply re-use these parameters several times (with minimal changes).
Instead of liberally assigning local variables and probably obtaining code that is nicer to look at or at least easier to follow, I squeezed as much as I could in the scratch buffer, re-assigning fields whenever possible.
; ------------------------------------
; Multiplying using Karatsuba's method
; ------------------------------------
; IN (stack) OUT (eax,CF) MOD (stack)
mpMul: pushad
mov esi, [esp+32+4] ;Precision 32, 64, 128, ...
shr esi, 5 ;Bits -> dwords
mov eax, [esp+32+4+4] ;Multiplicand
mov ebx, [esp+32+4+8] ;Multiplier
mov edx, [esp+32+4+16] ;Scratch buffer
; - - - - - - - - - - - - - - - - - -
push ebx eax edx esi
call .MULT ; -> (EAX EBX ECX EDX)
pop ecx esi
add esp, 8
; - - - - - - - - - - - - - - - - - -
imul edx, ecx, 4 ;Distance between halves of result
mov edi, [esp+32+4+12] ;Double length result
mov [esp+28], edi ;pushad.EAX
xor ebp, ebp
@@: mov eax, [esi+edx]
mov [edi+edx], eax
or ebp, eax ;Gathers every bit from upper half
movsd
dec ecx
jnz @b
cmp ecx, ebp ;Sets CF if upper half is non-zero
popad
ret 20
; - - - - - - - - - - - - - - - - - -
; Recursive subroutine
; IN (stack) OUT (stack) MOD (eax,ebx,ecx,edx)
.MULT: mov ecx, [esp+4] ;1st incoming arg is IntSize in dwords
shr ecx, 1
jnz @f
mov ecx, [esp+4+4] ;2nd incoming arg is Scratch
mov eax, [esp+4+8] ;3rd incoming arg is N1
mov ebx, [esp+4+12] ;4th incoming arg is N2
mov eax, [eax]
mul dword [ebx]
mov [ecx], eax
mov [ecx+4], edx
ret
@@: push esi edi ebp ;12 bytes => [esp+12]
mov esi, ecx
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
push edi ;4th outgoing param
mov edx, [esp+4+12+4+12] ;4th incoming arg is N2
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;3rd outgoing param
mov edx, [esp+8+12+4+8] ;3rd incoming arg is N1
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;2nd outgoing param
push esi ;1st outgoing param
call .MULT ;H=Trunc(a+b)*Trunc(c+d) -> (EAX..EDX)
xor eax, eax
and ebp, 11b ;Test both overflow conditions
jnp @f
setnz al ;Only set if both Trunc's lost 1 bit
@@: mov [edi+esi*8], eax ;Setup dword after DoubleLengthProduct
shr ebp, 1 ;If Trunc(a+b) did loose 1 bit
jnc @f ; then adding Trunc(c+d) is needed.
mov ebx, [esp+12] ;Trunc(c+d)
call .CURE ; -> (EAX..EDX)
@@: shr ebp, 1 ;If Trunc(c+d) did loose 1 bit
jnc @f ; then adding Trunc(a+b) is needed.
mov ebx, [esp+8] ;Trunc(a+b)
call .CURE ; -> (EAX..EDX)
@@: mov eax, [esp+16+12+4+12];4th incoming arg is N2.d
mov [esp+12], eax
mov eax, [esp+16+12+4+8] ;3rd incoming arg is N1.b
mov [esp+8], eax
lea eax, [esi*8+4]
add [esp+4], eax ;Memory above H
call .MULT ;G=b*d -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
mov ebx, [esp+4] ;G
call .DIF ;H-=G R=G -> (EAX ECX EBP)
imul eax, esi, 4
add [esp+12], eax ;N2.c
add [esp+8], eax ;N1.a
call .MULT ;F=a*c -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
lea edx, [edx+esi*8] ;Half-way Scratch.DoubleLengthResult
mov ebx, [esp+4] ;F
call .DIF ;H-=F R+=F*m^2 -> (EAX ECX EBP)
add esp, 16 ;Finally discarding params
imul ecx, esi, 2
mov esi, edi ;H
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
lea edi, [edi+ecx*2] ;Quarter-way Scratch.DoubleLengthResult
inc ecx
@@: lodsd ;R+=H*m
adc eax, [edi]
stosd
dec ecx
jnz @b
@@: mov eax, [edi]
adc eax, ecx ;ECX=0
stosd
jc @b
pop ebp edi esi
ret
; - - - - - - - - - - - - - - - - - -
; Adds the coefficients of the number at EDX and stores the result in the
; double length buffer at EDI.
; IN (edx,esi,edi) OUT (edi,ebp) MOD (eax,ebx,edx)
.SUM: mov ebx, esi
clc
@@: mov eax, [edx+esi*4] ;First (c+d), later (a+b)
adc eax, [edx]
stosd
lea edx, [edx+4]
dec ebx
jnz @b
rcl ebp, 1 ;Preserve overflow condition
lea edi, [edi+esi*4] ;Skip high half for now
ret
; - - - - - - - - - - - - - - - - - -
; Cures the product in H if the sum of any 2 coefficients had an overflow.
; IN (ebx,esi,edi) OUT () MOD (eax,ebx,ecx,edx)
.CURE: mov ecx, esi
mov edx, edi
lea edi, [edi+esi*4]
xchg esi, ebx
clc
@@: lodsd
adc eax, [edi]
stosd
dec ebx
jnz @b
adc [edi], ebx ;EBX=0
mov edi, edx
mov esi, ecx
ret
; - - - - - - - - - - - - - - - - - -
; Moves the double length number at EBX to EDX
; at the same time subtracting it from the number at EDI.
; IN (ebx,edx,esi,edi) OUT () MOD (eax,ecx,ebp)
.DIF: imul ebp, esi, 2
xor ecx, ecx ;CF=0
@@: mov eax, [ebx+ecx*4] ;First H-G, later H-F
mov [edx+ecx*4], eax
sbb [edi+ecx*4], eax
inc ecx
dec ebp
jnz @b
sbb [edi+ecx*4], ebp ;EBP=0
ret
; ------------------------------------
Some interesting numbers to investigate, considering $n = log_2textPrecision$
- Number of recursive
call
s
$$sum_i=5^n 3^i-5$$
- Number of 'single-digit-multiplications'
$$3^n-5$$
- Additional stack space needed once in mpMul
$$52+(n-5)*32$$
- Size of the required scratch buffer
$$textPrecision*frac34+(n-9)*4$$
I want to push this algorithm to its limits.
Did I miss some opportunity to do so?
performance algorithm recursion assembly x86
add a comment |Â
up vote
11
down vote
favorite
The Karatsuba algorithm, first published in 1962, aims to speed up the multiplication of big numbers by reducing the number of 'single-digit-multiplications' involved.
Because of its complexity (overhead) this method is not particularly fast on the 'smaller' numbers. There's a threshold to be observed. Computing MAX_UINT2, I found this threshold to be at some 5000 decimal digits (in the product) before Karatsuba's becomes faster than my version of the classical long multiplication.
The multi-precision mpMul procedure that I present below requires 5 parameters passed on the stack.
- The 1st param specifies the precision (length in bits) of both the inputs which must be a power of 2, so one of $lbrace 2^5, 2^6, ldots, 2^30, 2^31rbrace$.
- The 2nd param points to the 1st input aka multiplicand.
- The 3rd param points to the 2nd input aka multiplier.
- The 4th param points to where the double length result needs to go.
- The 5th param points to a scratch buffer for storing intermediate results.
The address of the double length product is returned in the EAX
register and the carry flag will be set if the result exceeds the precision.
In an effort to make this procedure efficient I applied the following:
Considering
EAX
,EBX
,ECX
, andEDX
volatile registers reduced the recursion overhead a lot. This was especially important on the lowest level (the one with themul
instruction) that after all represents two thirds of all thecall
's that are made.I refrained from using
EBP
as a stackframe pointer, instead I usedESP
relative addressing. I think the extra byte for the 'sib-encoding' was well spent since an additional register to play with makes programming a bit easier.Because the recursive subroutine does not remove its parameters when it returns, I could simply re-use these parameters several times (with minimal changes).
Instead of liberally assigning local variables and probably obtaining code that is nicer to look at or at least easier to follow, I squeezed as much as I could in the scratch buffer, re-assigning fields whenever possible.
; ------------------------------------
; Multiplying using Karatsuba's method
; ------------------------------------
; IN (stack) OUT (eax,CF) MOD (stack)
mpMul: pushad
mov esi, [esp+32+4] ;Precision 32, 64, 128, ...
shr esi, 5 ;Bits -> dwords
mov eax, [esp+32+4+4] ;Multiplicand
mov ebx, [esp+32+4+8] ;Multiplier
mov edx, [esp+32+4+16] ;Scratch buffer
; - - - - - - - - - - - - - - - - - -
push ebx eax edx esi
call .MULT ; -> (EAX EBX ECX EDX)
pop ecx esi
add esp, 8
; - - - - - - - - - - - - - - - - - -
imul edx, ecx, 4 ;Distance between halves of result
mov edi, [esp+32+4+12] ;Double length result
mov [esp+28], edi ;pushad.EAX
xor ebp, ebp
@@: mov eax, [esi+edx]
mov [edi+edx], eax
or ebp, eax ;Gathers every bit from upper half
movsd
dec ecx
jnz @b
cmp ecx, ebp ;Sets CF if upper half is non-zero
popad
ret 20
; - - - - - - - - - - - - - - - - - -
; Recursive subroutine
; IN (stack) OUT (stack) MOD (eax,ebx,ecx,edx)
.MULT: mov ecx, [esp+4] ;1st incoming arg is IntSize in dwords
shr ecx, 1
jnz @f
mov ecx, [esp+4+4] ;2nd incoming arg is Scratch
mov eax, [esp+4+8] ;3rd incoming arg is N1
mov ebx, [esp+4+12] ;4th incoming arg is N2
mov eax, [eax]
mul dword [ebx]
mov [ecx], eax
mov [ecx+4], edx
ret
@@: push esi edi ebp ;12 bytes => [esp+12]
mov esi, ecx
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
push edi ;4th outgoing param
mov edx, [esp+4+12+4+12] ;4th incoming arg is N2
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;3rd outgoing param
mov edx, [esp+8+12+4+8] ;3rd incoming arg is N1
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;2nd outgoing param
push esi ;1st outgoing param
call .MULT ;H=Trunc(a+b)*Trunc(c+d) -> (EAX..EDX)
xor eax, eax
and ebp, 11b ;Test both overflow conditions
jnp @f
setnz al ;Only set if both Trunc's lost 1 bit
@@: mov [edi+esi*8], eax ;Setup dword after DoubleLengthProduct
shr ebp, 1 ;If Trunc(a+b) did loose 1 bit
jnc @f ; then adding Trunc(c+d) is needed.
mov ebx, [esp+12] ;Trunc(c+d)
call .CURE ; -> (EAX..EDX)
@@: shr ebp, 1 ;If Trunc(c+d) did loose 1 bit
jnc @f ; then adding Trunc(a+b) is needed.
mov ebx, [esp+8] ;Trunc(a+b)
call .CURE ; -> (EAX..EDX)
@@: mov eax, [esp+16+12+4+12];4th incoming arg is N2.d
mov [esp+12], eax
mov eax, [esp+16+12+4+8] ;3rd incoming arg is N1.b
mov [esp+8], eax
lea eax, [esi*8+4]
add [esp+4], eax ;Memory above H
call .MULT ;G=b*d -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
mov ebx, [esp+4] ;G
call .DIF ;H-=G R=G -> (EAX ECX EBP)
imul eax, esi, 4
add [esp+12], eax ;N2.c
add [esp+8], eax ;N1.a
call .MULT ;F=a*c -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
lea edx, [edx+esi*8] ;Half-way Scratch.DoubleLengthResult
mov ebx, [esp+4] ;F
call .DIF ;H-=F R+=F*m^2 -> (EAX ECX EBP)
add esp, 16 ;Finally discarding params
imul ecx, esi, 2
mov esi, edi ;H
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
lea edi, [edi+ecx*2] ;Quarter-way Scratch.DoubleLengthResult
inc ecx
@@: lodsd ;R+=H*m
adc eax, [edi]
stosd
dec ecx
jnz @b
@@: mov eax, [edi]
adc eax, ecx ;ECX=0
stosd
jc @b
pop ebp edi esi
ret
; - - - - - - - - - - - - - - - - - -
; Adds the coefficients of the number at EDX and stores the result in the
; double length buffer at EDI.
; IN (edx,esi,edi) OUT (edi,ebp) MOD (eax,ebx,edx)
.SUM: mov ebx, esi
clc
@@: mov eax, [edx+esi*4] ;First (c+d), later (a+b)
adc eax, [edx]
stosd
lea edx, [edx+4]
dec ebx
jnz @b
rcl ebp, 1 ;Preserve overflow condition
lea edi, [edi+esi*4] ;Skip high half for now
ret
; - - - - - - - - - - - - - - - - - -
; Cures the product in H if the sum of any 2 coefficients had an overflow.
; IN (ebx,esi,edi) OUT () MOD (eax,ebx,ecx,edx)
.CURE: mov ecx, esi
mov edx, edi
lea edi, [edi+esi*4]
xchg esi, ebx
clc
@@: lodsd
adc eax, [edi]
stosd
dec ebx
jnz @b
adc [edi], ebx ;EBX=0
mov edi, edx
mov esi, ecx
ret
; - - - - - - - - - - - - - - - - - -
; Moves the double length number at EBX to EDX
; at the same time subtracting it from the number at EDI.
; IN (ebx,edx,esi,edi) OUT () MOD (eax,ecx,ebp)
.DIF: imul ebp, esi, 2
xor ecx, ecx ;CF=0
@@: mov eax, [ebx+ecx*4] ;First H-G, later H-F
mov [edx+ecx*4], eax
sbb [edi+ecx*4], eax
inc ecx
dec ebp
jnz @b
sbb [edi+ecx*4], ebp ;EBP=0
ret
; ------------------------------------
Some interesting numbers to investigate, considering $n = log_2textPrecision$
- Number of recursive
call
s
$$sum_i=5^n 3^i-5$$
- Number of 'single-digit-multiplications'
$$3^n-5$$
- Additional stack space needed once in mpMul
$$52+(n-5)*32$$
- Size of the required scratch buffer
$$textPrecision*frac34+(n-9)*4$$
I want to push this algorithm to its limits.
Did I miss some opportunity to do so?
performance algorithm recursion assembly x86
What microarchitecture are you optimizing for?
â FUZxxl
May 16 at 14:43
@FUZxxl Sorry for the long delay but last weekend was a Belgian holiday. To answer your question: I wrote this code on an Intel® Pentium® dual-core processor T2080 and that would be Core⢠microarchitecture. The instruction set goes up to SSE3 (not including SSSE3).
â Sep Roland
May 27 at 13:01
add a comment |Â
up vote
11
down vote
favorite
up vote
11
down vote
favorite
The Karatsuba algorithm, first published in 1962, aims to speed up the multiplication of big numbers by reducing the number of 'single-digit-multiplications' involved.
Because of its complexity (overhead) this method is not particularly fast on the 'smaller' numbers. There's a threshold to be observed. Computing MAX_UINT2, I found this threshold to be at some 5000 decimal digits (in the product) before Karatsuba's becomes faster than my version of the classical long multiplication.
The multi-precision mpMul procedure that I present below requires 5 parameters passed on the stack.
- The 1st param specifies the precision (length in bits) of both the inputs which must be a power of 2, so one of $lbrace 2^5, 2^6, ldots, 2^30, 2^31rbrace$.
- The 2nd param points to the 1st input aka multiplicand.
- The 3rd param points to the 2nd input aka multiplier.
- The 4th param points to where the double length result needs to go.
- The 5th param points to a scratch buffer for storing intermediate results.
The address of the double length product is returned in the EAX
register and the carry flag will be set if the result exceeds the precision.
In an effort to make this procedure efficient I applied the following:
Considering
EAX
,EBX
,ECX
, andEDX
volatile registers reduced the recursion overhead a lot. This was especially important on the lowest level (the one with themul
instruction) that after all represents two thirds of all thecall
's that are made.I refrained from using
EBP
as a stackframe pointer, instead I usedESP
relative addressing. I think the extra byte for the 'sib-encoding' was well spent since an additional register to play with makes programming a bit easier.Because the recursive subroutine does not remove its parameters when it returns, I could simply re-use these parameters several times (with minimal changes).
Instead of liberally assigning local variables and probably obtaining code that is nicer to look at or at least easier to follow, I squeezed as much as I could in the scratch buffer, re-assigning fields whenever possible.
; ------------------------------------
; Multiplying using Karatsuba's method
; ------------------------------------
; IN (stack) OUT (eax,CF) MOD (stack)
mpMul: pushad
mov esi, [esp+32+4] ;Precision 32, 64, 128, ...
shr esi, 5 ;Bits -> dwords
mov eax, [esp+32+4+4] ;Multiplicand
mov ebx, [esp+32+4+8] ;Multiplier
mov edx, [esp+32+4+16] ;Scratch buffer
; - - - - - - - - - - - - - - - - - -
push ebx eax edx esi
call .MULT ; -> (EAX EBX ECX EDX)
pop ecx esi
add esp, 8
; - - - - - - - - - - - - - - - - - -
imul edx, ecx, 4 ;Distance between halves of result
mov edi, [esp+32+4+12] ;Double length result
mov [esp+28], edi ;pushad.EAX
xor ebp, ebp
@@: mov eax, [esi+edx]
mov [edi+edx], eax
or ebp, eax ;Gathers every bit from upper half
movsd
dec ecx
jnz @b
cmp ecx, ebp ;Sets CF if upper half is non-zero
popad
ret 20
; - - - - - - - - - - - - - - - - - -
; Recursive subroutine
; IN (stack) OUT (stack) MOD (eax,ebx,ecx,edx)
.MULT: mov ecx, [esp+4] ;1st incoming arg is IntSize in dwords
shr ecx, 1
jnz @f
mov ecx, [esp+4+4] ;2nd incoming arg is Scratch
mov eax, [esp+4+8] ;3rd incoming arg is N1
mov ebx, [esp+4+12] ;4th incoming arg is N2
mov eax, [eax]
mul dword [ebx]
mov [ecx], eax
mov [ecx+4], edx
ret
@@: push esi edi ebp ;12 bytes => [esp+12]
mov esi, ecx
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
push edi ;4th outgoing param
mov edx, [esp+4+12+4+12] ;4th incoming arg is N2
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;3rd outgoing param
mov edx, [esp+8+12+4+8] ;3rd incoming arg is N1
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;2nd outgoing param
push esi ;1st outgoing param
call .MULT ;H=Trunc(a+b)*Trunc(c+d) -> (EAX..EDX)
xor eax, eax
and ebp, 11b ;Test both overflow conditions
jnp @f
setnz al ;Only set if both Trunc's lost 1 bit
@@: mov [edi+esi*8], eax ;Setup dword after DoubleLengthProduct
shr ebp, 1 ;If Trunc(a+b) did loose 1 bit
jnc @f ; then adding Trunc(c+d) is needed.
mov ebx, [esp+12] ;Trunc(c+d)
call .CURE ; -> (EAX..EDX)
@@: shr ebp, 1 ;If Trunc(c+d) did loose 1 bit
jnc @f ; then adding Trunc(a+b) is needed.
mov ebx, [esp+8] ;Trunc(a+b)
call .CURE ; -> (EAX..EDX)
@@: mov eax, [esp+16+12+4+12];4th incoming arg is N2.d
mov [esp+12], eax
mov eax, [esp+16+12+4+8] ;3rd incoming arg is N1.b
mov [esp+8], eax
lea eax, [esi*8+4]
add [esp+4], eax ;Memory above H
call .MULT ;G=b*d -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
mov ebx, [esp+4] ;G
call .DIF ;H-=G R=G -> (EAX ECX EBP)
imul eax, esi, 4
add [esp+12], eax ;N2.c
add [esp+8], eax ;N1.a
call .MULT ;F=a*c -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
lea edx, [edx+esi*8] ;Half-way Scratch.DoubleLengthResult
mov ebx, [esp+4] ;F
call .DIF ;H-=F R+=F*m^2 -> (EAX ECX EBP)
add esp, 16 ;Finally discarding params
imul ecx, esi, 2
mov esi, edi ;H
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
lea edi, [edi+ecx*2] ;Quarter-way Scratch.DoubleLengthResult
inc ecx
@@: lodsd ;R+=H*m
adc eax, [edi]
stosd
dec ecx
jnz @b
@@: mov eax, [edi]
adc eax, ecx ;ECX=0
stosd
jc @b
pop ebp edi esi
ret
; - - - - - - - - - - - - - - - - - -
; Adds the coefficients of the number at EDX and stores the result in the
; double length buffer at EDI.
; IN (edx,esi,edi) OUT (edi,ebp) MOD (eax,ebx,edx)
.SUM: mov ebx, esi
clc
@@: mov eax, [edx+esi*4] ;First (c+d), later (a+b)
adc eax, [edx]
stosd
lea edx, [edx+4]
dec ebx
jnz @b
rcl ebp, 1 ;Preserve overflow condition
lea edi, [edi+esi*4] ;Skip high half for now
ret
; - - - - - - - - - - - - - - - - - -
; Cures the product in H if the sum of any 2 coefficients had an overflow.
; IN (ebx,esi,edi) OUT () MOD (eax,ebx,ecx,edx)
.CURE: mov ecx, esi
mov edx, edi
lea edi, [edi+esi*4]
xchg esi, ebx
clc
@@: lodsd
adc eax, [edi]
stosd
dec ebx
jnz @b
adc [edi], ebx ;EBX=0
mov edi, edx
mov esi, ecx
ret
; - - - - - - - - - - - - - - - - - -
; Moves the double length number at EBX to EDX
; at the same time subtracting it from the number at EDI.
; IN (ebx,edx,esi,edi) OUT () MOD (eax,ecx,ebp)
.DIF: imul ebp, esi, 2
xor ecx, ecx ;CF=0
@@: mov eax, [ebx+ecx*4] ;First H-G, later H-F
mov [edx+ecx*4], eax
sbb [edi+ecx*4], eax
inc ecx
dec ebp
jnz @b
sbb [edi+ecx*4], ebp ;EBP=0
ret
; ------------------------------------
Some interesting numbers to investigate, considering $n = log_2textPrecision$
- Number of recursive
call
s
$$sum_i=5^n 3^i-5$$
- Number of 'single-digit-multiplications'
$$3^n-5$$
- Additional stack space needed once in mpMul
$$52+(n-5)*32$$
- Size of the required scratch buffer
$$textPrecision*frac34+(n-9)*4$$
I want to push this algorithm to its limits.
Did I miss some opportunity to do so?
performance algorithm recursion assembly x86
The Karatsuba algorithm, first published in 1962, aims to speed up the multiplication of big numbers by reducing the number of 'single-digit-multiplications' involved.
Because of its complexity (overhead) this method is not particularly fast on the 'smaller' numbers. There's a threshold to be observed. Computing MAX_UINT2, I found this threshold to be at some 5000 decimal digits (in the product) before Karatsuba's becomes faster than my version of the classical long multiplication.
The multi-precision mpMul procedure that I present below requires 5 parameters passed on the stack.
- The 1st param specifies the precision (length in bits) of both the inputs which must be a power of 2, so one of $lbrace 2^5, 2^6, ldots, 2^30, 2^31rbrace$.
- The 2nd param points to the 1st input aka multiplicand.
- The 3rd param points to the 2nd input aka multiplier.
- The 4th param points to where the double length result needs to go.
- The 5th param points to a scratch buffer for storing intermediate results.
The address of the double length product is returned in the EAX
register and the carry flag will be set if the result exceeds the precision.
In an effort to make this procedure efficient I applied the following:
Considering
EAX
,EBX
,ECX
, andEDX
volatile registers reduced the recursion overhead a lot. This was especially important on the lowest level (the one with themul
instruction) that after all represents two thirds of all thecall
's that are made.I refrained from using
EBP
as a stackframe pointer, instead I usedESP
relative addressing. I think the extra byte for the 'sib-encoding' was well spent since an additional register to play with makes programming a bit easier.Because the recursive subroutine does not remove its parameters when it returns, I could simply re-use these parameters several times (with minimal changes).
Instead of liberally assigning local variables and probably obtaining code that is nicer to look at or at least easier to follow, I squeezed as much as I could in the scratch buffer, re-assigning fields whenever possible.
; ------------------------------------
; Multiplying using Karatsuba's method
; ------------------------------------
; IN (stack) OUT (eax,CF) MOD (stack)
mpMul: pushad
mov esi, [esp+32+4] ;Precision 32, 64, 128, ...
shr esi, 5 ;Bits -> dwords
mov eax, [esp+32+4+4] ;Multiplicand
mov ebx, [esp+32+4+8] ;Multiplier
mov edx, [esp+32+4+16] ;Scratch buffer
; - - - - - - - - - - - - - - - - - -
push ebx eax edx esi
call .MULT ; -> (EAX EBX ECX EDX)
pop ecx esi
add esp, 8
; - - - - - - - - - - - - - - - - - -
imul edx, ecx, 4 ;Distance between halves of result
mov edi, [esp+32+4+12] ;Double length result
mov [esp+28], edi ;pushad.EAX
xor ebp, ebp
@@: mov eax, [esi+edx]
mov [edi+edx], eax
or ebp, eax ;Gathers every bit from upper half
movsd
dec ecx
jnz @b
cmp ecx, ebp ;Sets CF if upper half is non-zero
popad
ret 20
; - - - - - - - - - - - - - - - - - -
; Recursive subroutine
; IN (stack) OUT (stack) MOD (eax,ebx,ecx,edx)
.MULT: mov ecx, [esp+4] ;1st incoming arg is IntSize in dwords
shr ecx, 1
jnz @f
mov ecx, [esp+4+4] ;2nd incoming arg is Scratch
mov eax, [esp+4+8] ;3rd incoming arg is N1
mov ebx, [esp+4+12] ;4th incoming arg is N2
mov eax, [eax]
mul dword [ebx]
mov [ecx], eax
mov [ecx+4], edx
ret
@@: push esi edi ebp ;12 bytes => [esp+12]
mov esi, ecx
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
push edi ;4th outgoing param
mov edx, [esp+4+12+4+12] ;4th incoming arg is N2
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;3rd outgoing param
mov edx, [esp+8+12+4+8] ;3rd incoming arg is N1
call .SUM ; -> EDI EBP (EAX EBX EDX)
push edi ;2nd outgoing param
push esi ;1st outgoing param
call .MULT ;H=Trunc(a+b)*Trunc(c+d) -> (EAX..EDX)
xor eax, eax
and ebp, 11b ;Test both overflow conditions
jnp @f
setnz al ;Only set if both Trunc's lost 1 bit
@@: mov [edi+esi*8], eax ;Setup dword after DoubleLengthProduct
shr ebp, 1 ;If Trunc(a+b) did loose 1 bit
jnc @f ; then adding Trunc(c+d) is needed.
mov ebx, [esp+12] ;Trunc(c+d)
call .CURE ; -> (EAX..EDX)
@@: shr ebp, 1 ;If Trunc(c+d) did loose 1 bit
jnc @f ; then adding Trunc(a+b) is needed.
mov ebx, [esp+8] ;Trunc(a+b)
call .CURE ; -> (EAX..EDX)
@@: mov eax, [esp+16+12+4+12];4th incoming arg is N2.d
mov [esp+12], eax
mov eax, [esp+16+12+4+8] ;3rd incoming arg is N1.b
mov [esp+8], eax
lea eax, [esi*8+4]
add [esp+4], eax ;Memory above H
call .MULT ;G=b*d -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
mov ebx, [esp+4] ;G
call .DIF ;H-=G R=G -> (EAX ECX EBP)
imul eax, esi, 4
add [esp+12], eax ;N2.c
add [esp+8], eax ;N1.a
call .MULT ;F=a*c -> (EAX..EDX)
mov edx, [esp+16+12+4+4] ;2nd incoming arg is Scratch
lea edx, [edx+esi*8] ;Half-way Scratch.DoubleLengthResult
mov ebx, [esp+4] ;F
call .DIF ;H-=F R+=F*m^2 -> (EAX ECX EBP)
add esp, 16 ;Finally discarding params
imul ecx, esi, 2
mov esi, edi ;H
mov edi, [esp+12+4+4] ;2nd incoming arg is Scratch
lea edi, [edi+ecx*2] ;Quarter-way Scratch.DoubleLengthResult
inc ecx
@@: lodsd ;R+=H*m
adc eax, [edi]
stosd
dec ecx
jnz @b
@@: mov eax, [edi]
adc eax, ecx ;ECX=0
stosd
jc @b
pop ebp edi esi
ret
; - - - - - - - - - - - - - - - - - -
; Adds the coefficients of the number at EDX and stores the result in the
; double length buffer at EDI.
; IN (edx,esi,edi) OUT (edi,ebp) MOD (eax,ebx,edx)
.SUM: mov ebx, esi
clc
@@: mov eax, [edx+esi*4] ;First (c+d), later (a+b)
adc eax, [edx]
stosd
lea edx, [edx+4]
dec ebx
jnz @b
rcl ebp, 1 ;Preserve overflow condition
lea edi, [edi+esi*4] ;Skip high half for now
ret
; - - - - - - - - - - - - - - - - - -
; Cures the product in H if the sum of any 2 coefficients had an overflow.
; IN (ebx,esi,edi) OUT () MOD (eax,ebx,ecx,edx)
.CURE: mov ecx, esi
mov edx, edi
lea edi, [edi+esi*4]
xchg esi, ebx
clc
@@: lodsd
adc eax, [edi]
stosd
dec ebx
jnz @b
adc [edi], ebx ;EBX=0
mov edi, edx
mov esi, ecx
ret
; - - - - - - - - - - - - - - - - - -
; Moves the double length number at EBX to EDX
; at the same time subtracting it from the number at EDI.
; IN (ebx,edx,esi,edi) OUT () MOD (eax,ecx,ebp)
.DIF: imul ebp, esi, 2
xor ecx, ecx ;CF=0
@@: mov eax, [ebx+ecx*4] ;First H-G, later H-F
mov [edx+ecx*4], eax
sbb [edi+ecx*4], eax
inc ecx
dec ebp
jnz @b
sbb [edi+ecx*4], ebp ;EBP=0
ret
; ------------------------------------
Some interesting numbers to investigate, considering $n = log_2textPrecision$
- Number of recursive
call
s
$$sum_i=5^n 3^i-5$$
- Number of 'single-digit-multiplications'
$$3^n-5$$
- Additional stack space needed once in mpMul
$$52+(n-5)*32$$
- Size of the required scratch buffer
$$textPrecision*frac34+(n-9)*4$$
I want to push this algorithm to its limits.
Did I miss some opportunity to do so?
performance algorithm recursion assembly x86
edited May 13 at 14:17
Vogel612â¦
20.9k345124
20.9k345124
asked May 13 at 13:30
Sep Roland
1,771617
1,771617
What microarchitecture are you optimizing for?
â FUZxxl
May 16 at 14:43
@FUZxxl Sorry for the long delay but last weekend was a Belgian holiday. To answer your question: I wrote this code on an Intel® Pentium® dual-core processor T2080 and that would be Core⢠microarchitecture. The instruction set goes up to SSE3 (not including SSSE3).
â Sep Roland
May 27 at 13:01
add a comment |Â
What microarchitecture are you optimizing for?
â FUZxxl
May 16 at 14:43
@FUZxxl Sorry for the long delay but last weekend was a Belgian holiday. To answer your question: I wrote this code on an Intel® Pentium® dual-core processor T2080 and that would be Core⢠microarchitecture. The instruction set goes up to SSE3 (not including SSSE3).
â Sep Roland
May 27 at 13:01
What microarchitecture are you optimizing for?
â FUZxxl
May 16 at 14:43
What microarchitecture are you optimizing for?
â FUZxxl
May 16 at 14:43
@FUZxxl Sorry for the long delay but last weekend was a Belgian holiday. To answer your question: I wrote this code on an Intel® Pentium® dual-core processor T2080 and that would be Core⢠microarchitecture. The instruction set goes up to SSE3 (not including SSSE3).
â Sep Roland
May 27 at 13:01
@FUZxxl Sorry for the long delay but last weekend was a Belgian holiday. To answer your question: I wrote this code on an Intel® Pentium® dual-core processor T2080 and that would be Core⢠microarchitecture. The instruction set goes up to SSE3 (not including SSSE3).
â Sep Roland
May 27 at 13:01
add a comment |Â
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f194309%2fmultiplying-big-numbers-using-karatsubas-method%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
What microarchitecture are you optimizing for?
â FUZxxl
May 16 at 14:43
@FUZxxl Sorry for the long delay but last weekend was a Belgian holiday. To answer your question: I wrote this code on an Intel® Pentium® dual-core processor T2080 and that would be Core⢠microarchitecture. The instruction set goes up to SSE3 (not including SSSE3).
â Sep Roland
May 27 at 13:01