mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-10-31 21:28:11 +09:00 
			
		
		
		
	
		
			
				
	
	
		
			214 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			214 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| //+build !noasm
 | |
| //+build !appengine
 | |
| 
 | |
| // Copyright 2015, Klaus Post, see LICENSE for details.
 | |
| 
 | |
// func crc32sse(a []byte) uint32
//
// Feeds the first four bytes of a through the SSE4.2 CRC32 instruction with
// a start value of 0 and returns the raw accumulator (no final inversion).
//
// The length of a is never read here: the code unconditionally loads 4 bytes
// from a's base pointer, so the caller must pass at least 4 bytes.
// NOTE(review): presumably the Go caller also verifies SSE4.2 support before
// calling - confirm against the call site.
//
// Arguments (amd64, stack based): a_base+0, a_len+8, a_cap+16, ret+24,
// i.e. 28 bytes, declared explicitly in the frame spec below.
// Flag 4 is NOSPLIT; the function is a leaf with no local frame.
TEXT ·crc32sse(SB), 4, $0-28
	MOVQ a_base+0(FP), R10 // R10 = &a[0]
	XORQ BX, BX            // BX = 0: CRC start value

	// CRC32   dword (R10), EBX
	// Hand-encoded as F2 REX.B 0F 38 F1 /r; ModRM 0x1a selects EBX as the
	// accumulator and [R10] as the 32-bit memory source.
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP)
	RET
 | |
| 
 | |
// func crc32sseAll(a []byte, dst []uint32)
//
// For every i in [0, len(a)-4] stores in dst[i] the result of the SSE4.2
// CRC32 instruction (start value 0, no final inversion) applied to the four
// bytes a[i:i+4]. That is len(a)-3 results; nothing is written when
// len(a) < 4.
//
// dst's length is never read: the caller must supply at least len(a)-3
// elements.
//
// Arguments: a_base+0, a_len+8, a_cap+16, dst_base+24, dst_len+32,
// dst_cap+40, i.e. 48 bytes, declared explicitly in the frame spec.
// Flag 4 is NOSPLIT; the function is a leaf with no local frame.
//
// Register roles:
//   R8  - read cursor into a
//   R9  - write cursor into dst
//   R10 - iterations left in crc_loop (4 results each)
//   R13 - iterations left in rem_loop (1 result each)
//   BX  - CRC accumulator, zero on entry to both loops
TEXT ·crc32sseAll(SB), 4, $0-48
	MOVQ  a_base+0(FP), R8    // R8: src
	MOVQ  a_len+8(FP), R10    // input length
	MOVQ  dst_base+24(FP), R9 // R9: dst
	SUBQ  $4, R10             // R10 = len-4, index of the last 4-byte window
	JS    end                 // len < 4: no complete window
	JZ    one_crc             // len == 4: exactly one window
	MOVQ  R10, R13
	SHRQ  $2, R10             // (len-4)/4 passes of the 4-at-a-time loop
	ANDQ  $3, R13             // (len-4)&3
	XORQ  BX, BX
	ADDQ  $1, R13             // (len-4)&3 + 1 windows (1..4) left for rem_loop
	TESTQ R10, R10
	JZ    rem_loop

crc_loop:
	// Invariant: at least 5 windows remain here (4 for this pass plus at
	// least 1 for rem_loop), hence at least 8 readable bytes at (R8), so the
	// 8-byte load below stays inside a.
	MOVQ (R8), R11
	XORQ BX, BX
	XORQ DX, DX
	XORQ DI, DI
	MOVQ R11, R12
	SHRQ $8, R11
	MOVQ R12, AX  // low 32 bits: bytes 0..3
	MOVQ R11, CX  // low 32 bits: bytes 1..4
	SHRQ $16, R12
	SHRQ $16, R11 // R11 now shifted by 24 in total: bytes 3..6
	MOVQ R12, SI  // low 32 bits: bytes 2..5

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32   ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32   ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe
	MOVL BX, (R9)
	MOVL DX, 4(R9)
	MOVL DI, 8(R9)

	// Fourth window of this pass (bytes 3..6) reuses AX/BX.
	XORQ BX, BX
	MOVL R11, AX

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
	MOVL BX, 12(R9)

	ADDQ $16, R9 // four uint32 results written
	ADDQ $4, R8  // four windows consumed
	XORQ BX, BX  // leave the accumulator clear for the next loop entered
	SUBQ $1, R10
	JNZ  crc_loop

rem_loop:
	// One window per pass. At least one window remains, so the 4-byte load
	// is inside a.
	MOVL (R8), AX

	// CRC32   EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	ADDQ $4, R9
	ADDQ $1, R8
	XORQ BX, BX
	SUBQ $1, R13
	JNZ  rem_loop

end:
	RET

one_crc:
	// len(a) == 4: a single pass through rem_loop.
	MOVQ $1, R13
	XORQ BX, BX
	JMP  rem_loop
 | |
| 
 | |
// func matchLenSSE4(a, b []byte, max int) int
//
// Returns how many leading bytes of a and b are equal, examining at most max
// bytes. A max of zero or less yields 0.
//
// Neither slice length is read: the caller must guarantee that both a and b
// hold at least max bytes. Only general-purpose instructions (MOV/CMP/XOR/
// BSF) are used in this routine.
//
// Arguments: a_base+0, a_len+8, a_cap+16, b_base+24, b_len+32, b_cap+40,
// max+48, ret+56, i.e. 64 bytes, declared explicitly in the frame spec.
// Flag 4 is NOSPLIT; the function is a leaf with no local frame.
//
// Register roles:
//   SI - cursor into a
//   DI - cursor into b
//   DX - &b[0], kept to turn the final cursor back into an index
//   CX - bytes still allowed to be compared
TEXT ·matchLenSSE4(SB), 4, $0-64
	MOVQ a_base+0(FP), SI
	MOVQ b_base+24(FP), DI
	MOVQ DI, DX
	MOVQ max+48(FP), CX

cmp8:
	// As long as we are 8 or more bytes before the end of max, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ CX, $8
	JLT  cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $8, CX
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

cmp1:
	// In the slices' tail, compare 1 byte at a time.
	// Signed <= rather than == so that a negative max ends the scan at once
	// instead of counting down forever and reading past the slices.
	CMPQ CX, $0
	JLE  matchLenEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE  matchLenEnd
	ADDQ $1, SI
	ADDQ $1, DI
	SUBQ $1, CX
	JMP  cmp1

matchLenEnd:
	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET
 | |
| 
 | |
// func histogram(b []byte, h []int32)
//
// Adds one to h[v] for every byte value v in b. Existing counts in h are
// incremented, not reset.
//
// h's length is never read: since the index is a byte value, the caller must
// supply at least 256 elements.
//
// Arguments: b_base+0, b_len+8, b_cap+16, h_base+24, h_len+32, h_cap+40,
// i.e. 48 bytes, declared explicitly in the frame spec.
// Flag 4 is NOSPLIT; the function is a leaf with no local frame.
//
// Register roles:
//   SI  - read cursor into b
//   DI  - &h[0]
//   R8  - 8-byte groups left (len(b)/8)
//   R9  - len(b), later len(b)&7 for the byte-wise tail
//   R10 - current 8-byte group / current tail byte
//   R11 - byte index; only its low byte is ever written, so the XORQ before
//         the loop keeps the upper bits zero for the scaled addressing
TEXT ·histogram(SB), 4, $0-48
	MOVQ b_base+0(FP), SI  // SI: &b
	MOVQ b_len+8(FP), R9   // R9: len(b)
	MOVQ h_base+24(FP), DI // DI: Histogram
	MOVQ R9, R8
	SHRQ $3, R8            // R8 = len(b)/8; ZF set when fewer than 8 bytes
	JZ   hist1
	XORQ R11, R11

loop_hist8:
	// Load 8 input bytes at once, then peel them off from the low end.
	MOVQ (SI), R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	// After seven shifts only the eighth byte is left in R10, so it can be
	// used as the index directly.
	INCL (DI)(R10*4)

	ADDQ $8, SI
	DECQ R8
	JNZ  loop_hist8

hist1:
	ANDQ $7, R9   // bytes left over after the 8-byte groups
	JZ   end_hist
	XORQ R10, R10 // clear upper bits; the loop only writes the low byte

loop_hist1:
	MOVB (SI), R10
	INCL (DI)(R10*4)
	INCQ SI
	DECQ R9
	JNZ  loop_hist1

end_hist:
	RET
 |