/* assembly to compute poly1305 */

#include "crypto_asm_hidden.h"
// linker define poly1305_maa64_g1
// linker use mask2
// linker use mask2c
// linker use p0
// linker use p1
// linker use p2

/*
 * Short names for the shared constants referenced by the reduction and
 * freeze code below.  Each name expands through CRYPTO_SHARED_NAMESPACE, so
 * the instructions address the namespaced symbol, not a bare global name.
 * NOTE(review): the constants are defined outside this file.  From the way
 * they are used here, mask2 / mask2c look like a low-bits mask and its
 * complement, and p0..p2 like the three 64-bit limbs of the modulus -
 * confirm at the definition.
 */
#define mask2  CRYPTO_SHARED_NAMESPACE(mask2)
#define mask2c CRYPTO_SHARED_NAMESPACE(mask2c)
#define p0     CRYPTO_SHARED_NAMESPACE(p0)
#define p1     CRYPTO_SHARED_NAMESPACE(p1)
#define p2     CRYPTO_SHARED_NAMESPACE(p2)

/* align what follows to a 2^5 = 32-byte boundary */
	.p2align 5

/*
 * poly1305_maa64_g1 - compute a Poly1305 tag with 64-bit mul/add/adc.
 *
 * Register arguments, as they are used by the code below:
 *   rdi  pointer to the 16-byte output
 *   rsi  pointer to the message
 *   rdx  pointer to 32 key bytes: bytes 0..15 are the multiplier r,
 *        bytes 16..31 are added to the result at the very end
 *   rcx  number of 16-byte blocks, counting a trailing partial block
 *   r8   bit length of the last block, 0 when the last block is full
 *   r9   only compared with 8 on the single-block path
 *        NOTE(review): presumably the byte length of the last block
 *        (r8 / 8) - confirm against the caller
 *
 * NOTE(review): rcx = 0 is not handled specially and r is used exactly as
 * loaded (no clamping here) - presumably the caller guarantees rcx >= 1 and
 * passes a clamped r; confirm.
 *
 * Stack frame (rsp is aligned down to 32 bytes, then 128 bytes reserved):
 *    0(%rsp)  caller's rsp
 *    8..48    saved r12, r13, r14, r15, rbx, rbp
 *   56(%rsp)  output pointer (rdi)
 *   64(%rsp)  r8
 *   72(%rsp)  r9
 *   80..88    key bytes 16..31
 *   96(%rsp)  zeroed quad-word, scratch for a partial block
 *
 * Modified and not restored: rax, rcx, rdx, rsi, rdi, r8-r11, flags.
 * rbx, rbp, r12-r15 and rsp are restored before returning.
 */
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
.global _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
.global CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
_CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1):
CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1):

	/* build the frame; r11 carries the caller's rsp into slot 0 */
	movq 	%rsp,%r11
	andq    $-32,%rsp
	subq 	$128,%rsp

	/* save callee-saved registers and the arguments needed later */
	movq 	%r11,0(%rsp)
	movq 	%r12,8(%rsp)
	movq 	%r13,16(%rsp)
	movq 	%r14,24(%rsp)
	movq 	%r15,32(%rsp)
	movq 	%rbx,40(%rsp)
	movq 	%rbp,48(%rsp)
	movq 	%rdi,56(%rsp)
	movq 	%r8,64(%rsp)
	movq 	%r9,72(%rsp)	
	
	/* store high 16 bytes of key; they are added to the tag at the end */
	movq    16(%rdx),%r14
	movq    24(%rdx),%r15		
	movq    %r14,80(%rsp)
	movq    %r15,88(%rsp)	

	/* key = (r15 : r14); rdx is free from here on and is used by mulq */
	movq    0(%rdx),%r14
	movq    8(%rdx),%r15
	
	/* initialize a quad-word on the stack with 0;
	 * partial blocks are copied into it so the unread bytes are zero
	 */	
	movq	$0,96(%rsp)	

	/* if the message has a single block */
	cmpq    $1,%rcx
	je      .L9
	
	/* message block = (rbp : rbx) */
	movq    0(%rsi),%rbx
	movq    8(%rsi),%rbp	

	/* else loop around and multiply the 129-bit (3-limb) 
	 * accumulator (rdi : rbp : rbx) with the 128-bit (2-limb) key;
	 * the first block is full here, so its 129th bit in %rdi is 1
	 */
	 
	movq    $1,%rdi

.L1:
	/* integer multiplication
	 *
	 * Partial products are gathered in three columns, each a 64-bit limb
	 * plus a carry word:
	 *   (r9  : r8 )  column at 2^0
	 *   (r11 : r10)  column at 2^64
	 *   (r13 : r12)  column at 2^128
	 * Words of weight 2^192 are folded down on the fly: with 2^130 = 5
	 * modulo 2^130 - 5, x * 2^192 = x * 2^62 + x * 2^64.  The x * 2^64
	 * part is added to r10 directly; the x * 2^62 part is summed unshifted
	 * in (r9 : r8) and shifted left by 62 once, before the weight-2^0
	 * product is added.
	 */	

	/* rdi * r15, weight 2^192: low word folded as described above;
	 * the high word has weight 2^256 = 2^126 + 2^128 under the same rule,
	 * so it goes to r12 and, shifted left by 62, to (r11 : r10)
	 */
	movq    %rdi,%rax
	mulq	%r15
	movq    %rax,%r8
	xorq    %r9,%r9
	movq    %rax,%r10
	xorq    %r11,%r11
	movq    %rdx,%r12
	xorq    %r13,%r13
	xorq    %rax,%rax
	shld    $62,%rdx,%rax
	shlq    $62,%rdx
	addq    %rdx,%r10
	adcq    %rax,%r11

	/* rbp * r15, weight 2^128: low word to r12, high word (2^192) folded */
	movq    %rbp,%rax
	mulq	%r15
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	/* rdi * r14, weight 2^128: same placement as the product above */
	movq    %rdi,%rax
	mulq	%r14
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	/* apply the pending factor 2^62 to the folded words in (r9 : r8) */
	shld    $62,%r8,%r9
	shlq    $62,%r8

	/* rbx * r14, weight 2^0 */
	movq    %rbx,%rax
	mulq	%r14
	addq    %rax,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	/* rbx * r15, weight 2^64 */
	movq    %rbx,%rax
	mulq	%r15
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	/* rbp * r14, weight 2^64 */
	movq    %rbp,%rax
	mulq	%r14
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8);
	 * each carry word is added into the next column up
	 */
	addq    %r9,%r10
	adcq    $0,%r11

	addq    %r11,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12 : r10 : r8)
	 *
	 * rdi keeps the bits of r12 selected by mask2; the rest of
	 * (r13 : r12) is folded into (r10 : r8) in two additions: once as
	 * (r13 : r12 & mask2c) and once more after shifting that right by 2.
	 * NOTE(review): this is the fold 5*X = 4*X + X for X = bits 130 and
	 * up, assuming mask2 = 3 and mask2c = ~3 - confirm at the definition.
	 */
	movq    %r12,%rdi

	andq    mask2(%rip),%rdi
	andq    mask2c(%rip),%r12

	addq    %r12,%r8
	adcq    %r13,%r10
	adcq    $0,%rdi

	shrd    $2,%r13,%r12
	shrq    $2,%r13 

	addq    %r12,%r8
	adcq    %r13,%r10
	adcq    $0,%rdi

	/* accumulator = (rdi : rbp : rbx) again */
	movq    %r8,%rbx
	movq    %r10,%rbp

	/* advance to the next block; rcx = blocks not yet absorbed */
	addq    $16,%rsi	 
	subq    $1,%rcx
	
	/* more than 2 left: keep looping; exactly 2: .L3; 1: .L4 */
	cmpq    $2,%rcx
	jg      .L2
	je	.L3
	jl	.L4
	
.L2:
	/* add the next message block; adcq $1 adds the carry plus the
	 * 129th bit of a full block
	 */
	addq    0(%rsi),%rbx
	adcq    8(%rsi),%rbp
	adcq    $1,%rdi
	jmp	.L1	
	
.L3:	
	/* add the second last block and proceed;
	 * the multiply and reduce below repeat .L1 instruction for
	 * instruction and then fall through to .L4
	 */
	addq    0(%rsi),%rbx
	adcq    8(%rsi),%rbp
	adcq    $1,%rdi

	/* integer multiplication (column layout as described at .L1) */	
	movq    %rdi,%rax
	mulq	%r15
	movq    %rax,%r8
	xorq    %r9,%r9
	movq    %rax,%r10
	xorq    %r11,%r11
	movq    %rdx,%r12
	xorq    %r13,%r13
	xorq    %rax,%rax
	shld    $62,%rdx,%rax
	shlq    $62,%rdx
	addq    %rdx,%r10
	adcq    %rax,%r11

	movq    %rbp,%rax
	mulq	%r15
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	movq    %rdi,%rax
	mulq	%r14
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	shld    $62,%r8,%r9
	shlq    $62,%r8

	movq    %rbx,%rax
	mulq	%r14
	addq    %rax,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	movq    %rbx,%rax
	mulq	%r15
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	movq    %rbp,%rax
	mulq	%r14
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
	addq    %r9,%r10
	adcq    $0,%r11

	addq    %r11,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12 : r10 : r8) */
	movq    %r12,%rdi

	andq    mask2(%rip),%rdi
	andq    mask2c(%rip),%r12

	addq    %r12,%r8
	adcq    %r13,%r10
	adcq    $0,%rdi

	shrd    $2,%r13,%r12
	shrq    $2,%r13 

	addq    %r12,%r8
	adcq    %r13,%r10
	adcq    $0,%rdi

	movq    %r8,%rbx
	movq    %r10,%rbp

	addq    $16,%rsi	
	subq    $1,%rcx
	
.L4:	
	/* process the last block;
	 * the top limb is parked in r12 because rdi is needed as the
	 * destination of rep movsb below (restored at .L8)
	 */
	movq	%rdi,%r12

	/* if the last block is full */
	cmpq    $0,64(%rsp)
	je      .L7
	
	/* if the last block has 8 bytes (64 bits) */	
	cmpq    $64,64(%rsp)	
	je	.L6	

	/* if the last block has 1 to 7 bytes */	
	jl	.L5
	
	/* else if the last block has 9 to 15 bytes */
	
	/* first chunk of message block = (r8) */	
	movq    0(%rsi),%r8
	addq	$8,%rsi	
	
	/* rax = 128 - bits; rcx = (bits - 64) / 8 = bytes beyond the first
	 * eight, copied into the zeroed quad-word at 96(%rsp)
	 */
	movq	$128,%rax
	subq	64(%rsp),%rax
	movq	$64,%rcx
	subq	%rax,%rcx	
	shrq	$3,%rcx	
	leaq	96(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* second chunk of message block = (r9) */
	movq    96(%rsp),%r9
	
	/* r11 = mask of the (bits - 64) valid low bits; r11 + 1 is the
	 * single bit just above them, which is the padding bit to set
	 */
	movq	$-1,%r11
	movq	%rax,%rcx	
	shrq	%cl,%r11
	
	andq	%r11,%r9
	addq	$1,%r11	
	orq	%r11,%r9
	movq	$0,%r10
	
	jmp	.L8

.L5:
	/* 1 to 7 bytes: copy bits / 8 bytes into the zeroed quad-word */
	movq	64(%rsp),%rcx
	shrq	$3,%rcx	
	leaq	96(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* first chunk of message block = (r8) */
	movq    96(%rsp),%r8
	
	/* r11 = mask of the valid low bits (shift count 64 - bits, taken
	 * from the low byte of the stored bit length); r11 + 1 = padding bit
	 */
	movq	$-1,%r11	
	movb	$64,%cl
	
	subb	64(%rsp),%cl
	shrq	%cl,%r11
	
	andq	%r11,%r8
	addq	$1,%r11	
	orq	%r11,%r8

	/* second chunk of message block = (r9) */
	movq	$0,%r9	
	
	movq	$0,%r10

	jmp	.L8
	
.L6:
	/* exactly 8 bytes: the padding bit is bit 0 of the second limb */
	movq	0(%rsi),%r8
	movq	$1,%r9
	movq	$0,%r10
	jmp	.L8

.L7:
	/* full block: the padding bit is the 129th bit, carried in r10 */
	movq	0(%rsi),%r8
	movq	8(%rsi),%r9
	movq	$1,%r10
.L8:
	/* restore the top limb and add the padded block (r10 : r9 : r8) */
	movq	%r12,%rdi
	addq    %r8,%rbx
	adcq    %r9,%rbp
	adcq    %r10,%rdi

	/* integer multiplication (column layout as described at .L1) */	
	movq    %rdi,%rax
	mulq	%r15
	movq    %rax,%r8
	xorq    %r9,%r9
	movq    %rax,%r10
	xorq    %r11,%r11
	movq    %rdx,%r12
	xorq    %r13,%r13
	xorq    %rax,%rax
	shld    $62,%rdx,%rax
	shlq    $62,%rdx
	addq    %rdx,%r10
	adcq    %rax,%r11

	movq    %rbp,%rax
	mulq	%r15
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	movq    %rdi,%rax
	mulq	%r14
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	shld    $62,%r8,%r9
	shlq    $62,%r8

	movq    %rbx,%rax
	mulq	%r14
	addq    %rax,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	movq    %rbx,%rax
	mulq	%r15
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	movq    %rbp,%rax
	mulq	%r14
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8);
	 * unlike .L1 the 2^64 limb is collected in r9, so that the result
	 * ends up in (r10 : r9 : r8) as .L13 expects
	 */
	addq    %r10,%r9
	adcq    $0,%r11

	addq    %r11,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12 : r9 : r8) */
	movq    %r12,%r10

	andq    mask2(%rip),%r10
	andq    mask2c(%rip),%r12

	addq    %r12,%r8
	adcq    %r13,%r9
	adcq    $0,%r10

	shrd    $2,%r13,%r12
	shrq    $2,%r13 

	addq    %r12,%r8
	adcq    %r13,%r9
	adcq    $0,%r10
	
	jmp     .L13
	
.L9:   
	/* single-block message: every path below leaves the product
	 * block * key in (r10 : r9 : r8) and joins at .L13
	 */

	/* if the single message block is full */
	cmpq    $0,64(%rsp)
	je      .L12

	/* if the single message block has 1 to 7 bytes;
	 * NOTE(review): this test reads the r9 slot and compares with 8,
	 * while the other length tests read the r8 slot (a bit count) -
	 * equivalent only if the caller passes r9 = r8 / 8; confirm
	 */
	cmpq    $8,72(%rsp)
	jl      .L10
	
	/* if the single message block has 8 bytes */
	je     .L11	
	
	/* else if the single message block has 9 to 15 bytes */

	/* first chunk of message block = (rbx) */	
	movq    0(%rsi),%rbx
	addq	$8,%rsi	
	
	/* rax = 128 - bits; rcx = bytes beyond the first eight */
	movq	$128,%rax
	subq	64(%rsp),%rax
	movq	$64,%rcx
	subq	%rax,%rcx	
	shrq	$3,%rcx	
	leaq	96(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* second chunk of message block = (rbp) */
	movq    96(%rsp),%rbp
	
	/* mask off anything above the valid bits and set the padding bit */
	movq	$-1,%r11
	movq	%rax,%rcx	
	shrq	%cl,%r11
	
	andq	%r11,%rbp
	addq	$1,%r11	
	orq	%r11,%rbp
	
	/* integer multiplication;
	 * the block has only two limbs here, so the products with a third
	 * limb that .L1 computes are absent
	 */
	xorq	%r8,%r8
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r12,%r12
	xorq	%r13,%r13	

	movq    %rbp,%rax
	mulq	%r15
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	shld    $62,%r8,%r9
	shlq    $62,%r8

	movq    %rbx,%rax
	mulq	%r14
	addq    %rax,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	movq    %rbx,%rax
	mulq	%r15
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	movq    %rbp,%rax
	mulq	%r14
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
	addq    %r10,%r9
	adcq    $0,%r11

	addq    %r11,%r12
	adcq    $0,%r13
	
	/* reduction on the integer product (r13 : r12 : r9 : r8) */
	movq    %r12,%r10

	andq    mask2(%rip),%r10
	andq    mask2c(%rip),%r12

	addq    %r12,%r8
	adcq    %r13,%r9
	adcq    $0,%r10

	shrd    $2,%r13,%r12
	shrq    $2,%r13 

	addq    %r12,%r8
	adcq    %r13,%r9
	adcq    $0,%r10

	jmp     .L13

.L10:   
	/* read the remainder bytes onto stack (bits / 8 of them) */
	movq	64(%rsp),%rcx
	shrq	$3,%rcx	
	leaq	96(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* message block = (rbx) */
	movq    96(%rsp),%rbx
	
	/* mask of the valid low bits, then set the padding bit above them */
	movq	$-1,%r11
	movb	$64,%cl

	subb	64(%rsp),%cl
	shrq	%cl,%r11

	andq	%r11,%rbx
	addq	$1,%r11
	orq	%r11,%rbx

	/* integer multiplication: one limb times two limbs gives the
	 * three-limb product (r10 : r9 : r8); the part above the low bits
	 * of r10 is folded at .L13
	 */
	movq    %r14,%rax
	mulq    %rbx
	movq	%rax,%r8
	movq	%rdx,%r9
	
	xorq	%r10,%r10
	movq    %r15,%rax			
	mulq    %rbx
	addq	%rax,%r9
	adcq	%rdx,%r10
	
	jmp     .L13
	
.L11:   
	/* message block = (rbx); the padding bit is 2^64 */
	movq    0(%rsi),%rbx

	/* integer multiplication: rbx * key first */
	movq    %r14,%rax
	mulq    %rbx
	movq	%rax,%r8
	movq	%rdx,%r9
	
	xorq	%r10,%r10
	movq    %r15,%rax			
	mulq    %rbx
	addq	%rax,%r9
	adcq	%rdx,%r10
	
	/* then the padding bit's share, key * 2^64, added one limb up;
	 * r11 receives the carry out of r10
	 */
	xorq	%r11,%r11
	addq	%r14,%r9
	adcq	%r15,%r10
	adcq	%r11,%r11
	
	/* reduction on the integer product (r11 : r10 : r9 : r8) */
	movq    %r10,%r13

	andq    mask2(%rip),%r10
	andq    mask2c(%rip),%r13

	addq    %r13,%r8
	adcq    %r11,%r9
	adcq    $0,%r10

	shrd    $2,%r11,%r13
	shrq    $2,%r11

	addq    %r13,%r8
	adcq    %r11,%r9
	adcq    $0,%r10		
	
	jmp     .L13		

.L12:
	/* message block = (rbp : rbx), with an implied 129th bit of 1 */
	movq    0(%rsi),%rbx
	movq    8(%rsi),%rbp

	/* integer multiplication;
	 * the third limb is the constant 1, so its two products are the
	 * key limbs themselves: r15 is preloaded where .L1 places the low
	 * word of rdi * r15, and r14 is added to r12 further down
	 */	
	movq    %r15,%r8
	xorq    %r9,%r9
	movq    %r15,%r10
	xorq    %r11,%r11
	xorq    %r12,%r12
	xorq    %r13,%r13

	movq    %rbp,%rax
	mulq	%r15
	addq    %rax,%r12
	adcq    $0,%r13
	addq    %rdx,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	/* 1 * r14 at weight 2^128 */
	addq    %r14,%r12
	adcq    $0,%r13

	shld    $62,%r8,%r9
	shlq    $62,%r8

	movq    %rbx,%rax
	mulq	%r14
	addq    %rax,%r8
	adcq    $0,%r9
	addq    %rdx,%r10
	adcq    $0,%r11

	movq    %rbx,%rax
	mulq	%r15
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	movq    %rbp,%rax
	mulq	%r14
	addq    %rax,%r10
	adcq    $0,%r11
	addq    %rdx,%r12
	adcq    $0,%r13

	/* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
	addq    %r10,%r9
	adcq    $0,%r11

	addq    %r11,%r12
	adcq    $0,%r13	

	/* reduction on the integer product (r13 : r12 : r9 : r8) */
	movq    %r12,%r10

	andq    mask2(%rip),%r10
	andq    mask2c(%rip),%r12

	addq    %r12,%r8
	adcq    %r13,%r9
	adcq    $0,%r10

	shrd    $2,%r13,%r12
	shrq    $2,%r13 

	addq    %r12,%r8
	adcq    %r13,%r9
	adcq    $0,%r10

.L13:	
	/* final reduction on (r10 : r9 : r8):
	 * whatever sits above the low two bits of r10 is multiplied by 5
	 * and added back at the bottom
	 */
	movq    %r10,%r11
	shrq    $2,%r11
	andq	mask2(%rip),%r10

	imul    $5,%r11,%r11
	addq    %r11,%r8
	adcq    $0,%r9
	adcq    $0,%r10
	
	/* freeze the reduced field element (r10 : r9 : r8):
	 * keep a copy, subtract (p2 : p1 : p0), and take the copy back if
	 * the subtraction went negative
	 */
	movq    %r8,%r11
	movq    %r9,%r12
	movq    %r10,%r13

	subq    p0(%rip),%r8
	sbbq    p1(%rip),%r9
	sbbq    p2(%rip),%r10

	/* shlq $62 replaces the borrow flag: CF becomes bit 2 of the
	 * difference's top limb, and that is what cmovc tests below.
	 * NOTE(review): presumably bit 2 is set exactly when the subtraction
	 * borrowed (top limb wrapped to all ones) - confirm the value range
	 */
	movq    %r10,%rcx
	shlq    $62,%rcx

	cmovc   %r11,%r8
	cmovc   %r12,%r9
	cmovc   %r13,%r10
	
	/* add last 16 bytes of the key; the carry into r10 is not stored */
	addq	80(%rsp),%r8
	adcq	88(%rsp),%r9
	adcq	$0,%r10	

	/* store the low 128 bits (16 bytes) of the result */
	movq 	56(%rsp),%rdi
	movq    %r8,0(%rdi)
	movq    %r9,8(%rdi)

	/* restore callee-saved registers, then the caller's rsp via r11 */
	movq 	0(%rsp),%r11
	movq 	8(%rsp),%r12
	movq 	16(%rsp),%r13
	movq 	24(%rsp),%r14
	movq 	32(%rsp),%r15
	movq 	40(%rsp),%rbx
	movq 	48(%rsp),%rbp

	movq 	%r11,%rsp

	ret
