星期六, 四月 15, 2006

汇编语言的sha1算法实现

手动做了一些优化,比linux上那个sha1sum快一些,但是比rfc上c的代码还是慢,很受打击

用cygwin下的gcc可以编译成dll文件,命令
gcc -shared -mno-cygwin -osha1.dll sha1.s

在cygwin下连同下面的测试程序编译成windows可执行文件用这个命令
gcc -mno-cygwin -osha1.exe main.c sha1.s

导出三个函数给c用

/*sha1.h*/
/*重新开始新的计算*/
int init();
/*输入新数据到缓冲区*/
int input(void* data, int length);
/*取得计算结果*/
int final(void* result);

#下面是汇编的代码,比c还慢的那个
#sha1.s
.equ K0TO19, 0x5A827999
.equ K20TO39, 0x6ED9EBA1
.equ K40TO59, 0x8F1BBCDC
.equ K60TO79, 0xCA62C1D6

.section .bss
.lcomm buffer, 64 #已经输入数据缓冲区
.lcomm length, 4 #已经输入数据长度
.lcomm totalenlow, 4 #数据总长度
.lcomm totalenhi, 4 #数据长度,高32bit
.lcomm stat, 4 #-1=error, 0=ok
.lcomm H0, 4
.lcomm H1, 4
.lcomm H2, 4
.lcomm H3, 4
.lcomm H4, 4
.lcomm W, 320

.section .text
.globl _init, _input, _final
# 填充数据缓冲至512bit
# 函数使用edi作缓冲区地址,ecx为块中已有数据长度,edx为数据总长度(填充末尾需要)
# %eax, %ebx用作临时变量
# 最后64位用来记录数据长度
_pad:
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
pushl %edi
movl $buffer, %edi
movl length, %ecx #肯定小于64
movb $0x80, %al #第一个填充字节1000 0000B
addl %ecx, %edi #寻址填充起点
cld #增长方向
stosb
movl $64, %eax #块大小,总数
subl %ecx, %eax #应填充数
subl $1, %eax #一个已经填充
cmp $8, %eax #查看预留空间是否够用
jl padbufferfull
jmp padbufferok
padbufferfull:
movl %eax, %ecx
xorl %eax, %eax
rep stosb
call _compute
movl $buffer, %edi
movl $64, %eax
padbufferok:
subl $8, %eax #留出数据长度存放空间
movl %eax, %ecx #准备填充
xorl %eax, %eax #eax清零
rep stosb
#由于计算机内存储方式与rfc中相反,故应作相应
#存放长度
movl totalenhi, %edx
movl %edx, %eax
shr $24, %eax
stosb
movl %edx, %eax
shr $16, %eax
stosb
movl %edx, %eax
shr $8, %eax
stosb
movb %dl, (%edi)
incl %edi
movl totalenlow, %edx
movl %edx, %eax
shr $24, %eax
stosb
movl %edx, %eax
shr $16, %eax
stosb
movl %edx, %eax
shr $8, %eax
stosb
movb %dl, (%edi)
incl %edi
call _compute
popl %edi
popl %edx
popl %ecx
popl %ebx
popl %eax
ret

# 重置各种参数,准备新的计算
# H0 = 67452301
# H1 = EFCDAB89
# H2 = 98BADCFE
# H3 = 10325476
# H4 = C3D2E1F0
# 已经输入数据长度清零
# int init()
_init:
movl $0x67452301, H0
movl $0xefcdab89, H1
movl $0x98badcfe, H2
movl $0x10325476, H3
movl $0xc3d2e1f0, H4
movl $0, length
movl $0, totalenlow
movl $0, totalenhi
movl $0, stat
xorl %eax, %eax
ret

# 散列计算函数
# rfc3174中method1
_compute:
pushl %esi
pushl %edi
pushl %ecx
pushl %eax
pushl %ebx
pushl %edx
pushl %ebp
# a. Divide M(i) into 16 words W(0), W(1), ... , W(15), where W(0)
# is the left-most word.
stepa:
movl $buffer, %esi
movl $W, %edi
movl $16, %ecx
movoneword:
lodsl #load a word to eax
#高位放低位
movl %eax, %ebx
shr $24, %eax
stosb
mov %ebx, %eax
shr $16, %eax
stosb
movl %ebx, %eax
shr $8, %eax
stosb
movb %bl, (%edi)
incl %edi
loop movoneword
# b. For t = 16 to 79 let
# W(t) = S^1(W(t-3) XOR W(t-8) XOR W(t-14) XOR W(t-16)).
# 使用ecx作为计数器
# edx作为索引
movl $16, %ecx
stepb:
movl %ecx, %edx
subl $3, %edx
movl W(, %edx, 4), %eax
movl %ecx, %edx
subl $8, %edx
xorl W(, %edx, 4), %eax
movl %ecx, %edx
subl $14, %edx
xorl W(, %edx, 4), %eax
movl %ecx, %edx
subl $16, %edx
xorl W(, %edx, 4), %eax
rol $1, %eax
movl %eax, W(, %ecx, 4)
inc %ecx
cmp $79, %ecx
jle stepb
# c. Let A = H0, B = H1, C = H2, D = H3, E = H4.
# use registers for a, b, c, d, e
# eax, ebx, ecx, edx, ebp for faster speed
stepc:
movl H0, %eax
movl H1, %ebx
movl H2, %ecx
movl H3, %edx
movl H4, %ebp

# d. For t = 0 to 79 do
# TEMP = S^5(A) + f(t;B,C,D) + E + W(t) + K(t);
# E = D; D = C; C = S^30(B); B = A; A = TEMP;
# then esi and edi are used for counter
# f(t;B,C,D) = (B AND C) OR ((NOT B) AND D) ( 0 <= t <= 19)
# f(t;B,C,D) = B XOR C XOR D (20 <= t <= 39)
# f(t;B,C,D) = (B AND C) OR (B AND D) OR (C AND D) (40 <= t <= 59)
# f(t;B,C,D) = B XOR C XOR D (60 <= t <= 79)
# K(t) = 5A827999 ( 0 <= t <= 19)
# K(t) = 6ED9EBA1 (20 <= t <= 39)
# K(t) = 8F1BBCDC (40 <= t <= 59)
# K(t) = CA62C1D6 (60 <= t <= 79)
xorl %esi, %esi
stepd:
stepd1:
movl %eax, %edi
rol $5, %edi #S^5(A)
addl %ebp, %edi #edi=S^5(A)+E
addl W(,%esi,4), %edi #+W(t)
addl $K0TO19, %edi #edi=S^5(A)+E+W(t)+K(t)
#E=ebp可用
#f(f;B,C,D)
#(B AND C) OR ((NOT B) AND D)
movl %ebx, %ebp
andl %ecx, %ebp
pushl %ebx
notl %ebx
andl %edx, %ebx
orl %ebx, %ebp
popl %ebx
addl %ebp, %edi #edi=TEMP, +f(t;B,C,D)
#E = D; D = C; C = S^30(B); B = A; A = TEMP
movl %edx, %ebp
movl %ecx, %edx
rol $30, %ebx
movl %ebx, %ecx
movl %eax, %ebx
movl %edi, %eax
incl %esi
cmpl $19, %esi
jle stepd1
stepd2:
movl %eax, %edi
rol $5, %edi
addl %ebp, %edi
addl W(,%esi,4), %edi
addl $K20TO39, %edi
movl %ebx, %ebp
xorl %ecx, %ebp
xorl %edx, %ebp
addl %ebp, %edi
movl %edx, %ebp
movl %ecx, %edx
rol $30, %ebx
movl %ebx, %ecx
movl %eax, %ebx
movl %edi, %eax
incl %esi
cmpl $39, %esi
jle stepd2
stepd3:
movl %eax, %edi
rol $5, %edi
addl %ebp, %edi
addl W(,%esi,4), %edi
addl $K40TO59, %edi
#f(t;B,C,D) = (B AND C) OR (B AND D) OR (C AND D)
movl %ebx, %ebp
andl %ecx, %ebp
pushl %ebx
andl %edx, %ebx
orl %ebx, %ebp
popl %ebx
pushl %ecx
andl %edx, %ecx
orl %ecx, %ebp
popl %ecx
addl %ebp, %edi
movl %edx, %ebp
movl %ecx, %edx
rol $30, %ebx
movl %ebx, %ecx
movl %eax, %ebx
movl %edi, %eax
incl %esi
cmpl $59, %esi
jle stepd3
stepd4:
movl %eax, %edi
rol $5, %edi
addl %ebp, %edi
addl W(,%esi,4), %edi
addl $K60TO79, %edi
movl %ebx, %ebp
xorl %ecx, %ebp
xorl %edx, %ebp
addl %ebp, %edi
movl %edx, %ebp
movl %ecx, %edx
rol $30, %ebx
movl %ebx, %ecx
movl %eax, %ebx
movl %edi, %eax
incl %esi
cmpl $79, %esi
jle stepd4

# e. Let H0 = H0 + A, H1 = H1 + B, H2 = H2 + C, H3 = H3 + D, H4 = H4
# + E.
stepe:
addl %eax, H0
addl %ebx, H1
addl %ecx, H2
addl %edx, H3
addl %ebp, H4
popl %ebp
popl %edx
popl %ebx
popl %eax
popl %ecx
popl %edi
popl %esi
ret

# 添加数据,数据到512bit时进行一次计算,并重新开始填充
# int input(void* data, int len);
_input:
movl stat, %eax
cmp $-1, %eax
je inputstaterr
jmp inputstatok
inputstaterr:
ret
inputstatok:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ecx
pushl %edx
pushl %eax
pushl %ebx
movl length, %edx #缓冲区已有数据长度(字节)
movl totalenlow, %eax #总长
movl 12(%ebp), %ebx #新送进来数据长度
movl 8(%ebp), %esi #用户缓冲区地址
movl totalenhi, %ebp
movl $buffer, %edi
addl %edx, %edi #定位填充起点
#%esi为源起点,%edi目的
movl $64, %ecx
subl %edx, %ecx #缓冲区剩余空间
#计算过程中ecx用作缓冲区剩余空间计数
#ebx未复制数据计数
#eax为总长度计数
inputnextbyte:
movsb
#inc %eax #总长度加一
addl $8, %eax
jo inctotalenhi
jmp countbuf
inctotalenhi:
inc %ebp
jo toolong
jmp countbuf
toolong:
movl $-1, %eax
movl %eax, stat
jmp finish
countbuf:
dec %ecx
jz bufferfull
jmp bufferok
bufferfull:
movl $64, %ecx #缓冲区空
movl $buffer, %edi
call _compute
bufferok:
dec %ebx
jz finish
jmp inputnextbyte
finish:
movl %eax, totalenlow
movl %ebp, totalenhi
movl $64, %edx
subl %ecx, %edx
movl %edx, length
popl %ebx
popl %eax
popl %edx
popl %ecx
popl %edi
popl %esi
pop %ebp
movl stat, %eax
ret

# 完成计算,取得最后散列结果
# int final(void* result);
_final:
movl stat, %eax
cmp $-1, %eax
je finalstaterr
jmp finalstatok
finalstaterr:
ret
finalstatok:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ecx
pushl %eax
pushl %ebx
call _pad
movl 8(%ebp), %edi
movl $H0, %esi
movl $5, %ecx
movoners:
pushl %ecx
lodsl
movl %eax, %ebx
shr $24, %eax
stosb
movl %ebx, %eax
shr $16, %eax
stosb
movl %ebx, %eax
shr $8, %eax
stosb
movb %bl, (%edi)
incl %edi
popl %ecx
loop movoners
popl %ebx
popl %eax
popl %ecx
popl %edi
popl %esi
popl %ebp
xorl %eax, %eax
ret

/*再给个测试使用的例子*/
/*main.c*/
#include
#include
#include "sha1.h"
int main()
{
char *buffer = "this is a test";
int len = 0;
char result[20];
int i;
init();
len = strlen(buffer);
input(buffer, len);
final(result);
for(i=0; i<20; i++)
{
printf("%02X ", 0x0ff & result[i]);
}
printf("\n");
return 0;
}