#include <machine/asm.h>

.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,@function
.align	32
ossl_rsaz_avx512ifma_eligible:
	movl	OPENSSL_ia32cap_P+8(%rip),%ecx
	xorl	%eax,%eax
	andl	$2149777408,%ecx
	cmpl	$2149777408,%ecx
	cmovel	%ecx,%eax
	.byte	0xf3,0xc3
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
.text	

.globl	ossl_rsaz_amm52x20_x1_256
.type	ossl_rsaz_amm52x20_x1_256,@function
.align	32
ossl_rsaz_amm52x20_x1_256:
.cfi_startproc	
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lrsaz_amm52x20_x1_256_body:


	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

	xorl	%r9d,%r9d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax


	movl	$5,%ebx

.align	32
.Lloop5:
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19


	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19


	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19


	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19


	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	leaq	32(%r11),%r11
	decl	%ebx
	jne	.Lloop5

	vmovdqa64	.Lmask52x4(%rip),%ymm4

	vpbroadcastq	%r9,%ymm3
	vpblendd	$3,%ymm3,%ymm1,%ymm1



	vpsrlq	$52,%ymm1,%ymm24
	vpsrlq	$52,%ymm16,%ymm25
	vpsrlq	$52,%ymm17,%ymm26
	vpsrlq	$52,%ymm18,%ymm27
	vpsrlq	$52,%ymm19,%ymm28


	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24


	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19


	vpaddq	%ymm24,%ymm1,%ymm1
	vpaddq	%ymm25,%ymm16,%ymm16
	vpaddq	%ymm26,%ymm17,%ymm17
	vpaddq	%ymm27,%ymm18,%ymm18
	vpaddq	%ymm28,%ymm19,%ymm19



	vpcmpuq	$1,%ymm1,%ymm4,%k1
	vpcmpuq	$1,%ymm16,%ymm4,%k2
	vpcmpuq	$1,%ymm17,%ymm4,%k3
	vpcmpuq	$1,%ymm18,%ymm4,%k4
	vpcmpuq	$1,%ymm19,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d


	vpcmpuq	$0,%ymm1,%ymm4,%k1
	vpcmpuq	$0,%ymm16,%ymm4,%k2
	vpcmpuq	$0,%ymm17,%ymm4,%k3
	vpcmpuq	$0,%ymm18,%ymm4,%k4
	vpcmpuq	$0,%ymm19,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx



	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5


	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
	vpsubq	%ymm4,%ymm19,%ymm19{%k5}

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	vmovdqu64	%ymm1,(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lrsaz_amm52x20_x1_256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc	
.size	ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
.data	
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text	

.globl	ossl_rsaz_amm52x20_x2_256
.type	ossl_rsaz_amm52x20_x2_256,@function
.align	32
ossl_rsaz_amm52x20_x2_256:
.cfi_startproc	
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lrsaz_amm52x20_x2_256_body:


	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22
	vmovdqa64	%ymm0,%ymm23

	xorl	%r9d,%r9d
	xorl	%r15d,%r15d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	movl	$20,%ebx

.align	32
.Lloop20:
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm3,%ymm1
	vpmadd52luq	32(%rsi),%ymm3,%ymm16
	vpmadd52luq	64(%rsi),%ymm3,%ymm17
	vpmadd52luq	96(%rsi),%ymm3,%ymm18
	vpmadd52luq	128(%rsi),%ymm3,%ymm19

	vpmadd52luq	0(%rcx),%ymm4,%ymm1
	vpmadd52luq	32(%rcx),%ymm4,%ymm16
	vpmadd52luq	64(%rcx),%ymm4,%ymm17
	vpmadd52luq	96(%rcx),%ymm4,%ymm18
	vpmadd52luq	128(%rcx),%ymm4,%ymm19


	valignq	$1,%ymm1,%ymm16,%ymm1
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm1,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm3,%ymm1
	vpmadd52huq	32(%rsi),%ymm3,%ymm16
	vpmadd52huq	64(%rsi),%ymm3,%ymm17
	vpmadd52huq	96(%rsi),%ymm3,%ymm18
	vpmadd52huq	128(%rsi),%ymm3,%ymm19

	vpmadd52huq	0(%rcx),%ymm4,%ymm1
	vpmadd52huq	32(%rcx),%ymm4,%ymm16
	vpmadd52huq	64(%rcx),%ymm4,%ymm17
	vpmadd52huq	96(%rcx),%ymm4,%ymm18
	vpmadd52huq	128(%rcx),%ymm4,%ymm19
	movq	160(%r11),%r13

	vpbroadcastq	%r13,%ymm3
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm4
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	160(%rsi),%ymm3,%ymm2
	vpmadd52luq	192(%rsi),%ymm3,%ymm20
	vpmadd52luq	224(%rsi),%ymm3,%ymm21
	vpmadd52luq	256(%rsi),%ymm3,%ymm22
	vpmadd52luq	288(%rsi),%ymm3,%ymm23

	vpmadd52luq	160(%rcx),%ymm4,%ymm2
	vpmadd52luq	192(%rcx),%ymm4,%ymm20
	vpmadd52luq	224(%rcx),%ymm4,%ymm21
	vpmadd52luq	256(%rcx),%ymm4,%ymm22
	vpmadd52luq	288(%rcx),%ymm4,%ymm23


	valignq	$1,%ymm2,%ymm20,%ymm2
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm23,%ymm22
	valignq	$1,%ymm23,%ymm0,%ymm23

	vmovq	%xmm2,%r13
	addq	%r13,%r15

	vpmadd52huq	160(%rsi),%ymm3,%ymm2
	vpmadd52huq	192(%rsi),%ymm3,%ymm20
	vpmadd52huq	224(%rsi),%ymm3,%ymm21
	vpmadd52huq	256(%rsi),%ymm3,%ymm22
	vpmadd52huq	288(%rsi),%ymm3,%ymm23

	vpmadd52huq	160(%rcx),%ymm4,%ymm2
	vpmadd52huq	192(%rcx),%ymm4,%ymm20
	vpmadd52huq	224(%rcx),%ymm4,%ymm21
	vpmadd52huq	256(%rcx),%ymm4,%ymm22
	vpmadd52huq	288(%rcx),%ymm4,%ymm23
	leaq	8(%r11),%r11
	decl	%ebx
	jne	.Lloop20

	vmovdqa64	.Lmask52x4(%rip),%ymm4

	vpbroadcastq	%r9,%ymm3
	vpblendd	$3,%ymm3,%ymm1,%ymm1



	vpsrlq	$52,%ymm1,%ymm24
	vpsrlq	$52,%ymm16,%ymm25
	vpsrlq	$52,%ymm17,%ymm26
	vpsrlq	$52,%ymm18,%ymm27
	vpsrlq	$52,%ymm19,%ymm28


	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24


	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19


	vpaddq	%ymm24,%ymm1,%ymm1
	vpaddq	%ymm25,%ymm16,%ymm16
	vpaddq	%ymm26,%ymm17,%ymm17
	vpaddq	%ymm27,%ymm18,%ymm18
	vpaddq	%ymm28,%ymm19,%ymm19



	vpcmpuq	$1,%ymm1,%ymm4,%k1
	vpcmpuq	$1,%ymm16,%ymm4,%k2
	vpcmpuq	$1,%ymm17,%ymm4,%k3
	vpcmpuq	$1,%ymm18,%ymm4,%k4
	vpcmpuq	$1,%ymm19,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d


	vpcmpuq	$0,%ymm1,%ymm4,%k1
	vpcmpuq	$0,%ymm16,%ymm4,%k2
	vpcmpuq	$0,%ymm17,%ymm4,%k3
	vpcmpuq	$0,%ymm18,%ymm4,%k4
	vpcmpuq	$0,%ymm19,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx



	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5


	vpsubq	%ymm4,%ymm1,%ymm1{%k1}
	vpsubq	%ymm4,%ymm16,%ymm16{%k2}
	vpsubq	%ymm4,%ymm17,%ymm17{%k3}
	vpsubq	%ymm4,%ymm18,%ymm18{%k4}
	vpsubq	%ymm4,%ymm19,%ymm19{%k5}

	vpandq	%ymm4,%ymm1,%ymm1
	vpandq	%ymm4,%ymm16,%ymm16
	vpandq	%ymm4,%ymm17,%ymm17
	vpandq	%ymm4,%ymm18,%ymm18
	vpandq	%ymm4,%ymm19,%ymm19

	vpbroadcastq	%r15,%ymm3
	vpblendd	$3,%ymm3,%ymm2,%ymm2



	vpsrlq	$52,%ymm2,%ymm24
	vpsrlq	$52,%ymm20,%ymm25
	vpsrlq	$52,%ymm21,%ymm26
	vpsrlq	$52,%ymm22,%ymm27
	vpsrlq	$52,%ymm23,%ymm28


	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm0,%ymm24,%ymm24


	vpandq	%ymm4,%ymm2,%ymm2
	vpandq	%ymm4,%ymm20,%ymm20
	vpandq	%ymm4,%ymm21,%ymm21
	vpandq	%ymm4,%ymm22,%ymm22
	vpandq	%ymm4,%ymm23,%ymm23


	vpaddq	%ymm24,%ymm2,%ymm2
	vpaddq	%ymm25,%ymm20,%ymm20
	vpaddq	%ymm26,%ymm21,%ymm21
	vpaddq	%ymm27,%ymm22,%ymm22
	vpaddq	%ymm28,%ymm23,%ymm23



	vpcmpuq	$1,%ymm2,%ymm4,%k1
	vpcmpuq	$1,%ymm20,%ymm4,%k2
	vpcmpuq	$1,%ymm21,%ymm4,%k3
	vpcmpuq	$1,%ymm22,%ymm4,%k4
	vpcmpuq	$1,%ymm23,%ymm4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d


	vpcmpuq	$0,%ymm2,%ymm4,%k1
	vpcmpuq	$0,%ymm20,%ymm4,%k2
	vpcmpuq	$0,%ymm21,%ymm4,%k3
	vpcmpuq	$0,%ymm22,%ymm4,%k4
	vpcmpuq	$0,%ymm23,%ymm4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx



	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5


	vpsubq	%ymm4,%ymm2,%ymm2{%k1}
	vpsubq	%ymm4,%ymm20,%ymm20{%k2}
	vpsubq	%ymm4,%ymm21,%ymm21{%k3}
	vpsubq	%ymm4,%ymm22,%ymm22{%k4}
	vpsubq	%ymm4,%ymm23,%ymm23{%k5}

	vpandq	%ymm4,%ymm2,%ymm2
	vpandq	%ymm4,%ymm20,%ymm20
	vpandq	%ymm4,%ymm21,%ymm21
	vpandq	%ymm4,%ymm22,%ymm22
	vpandq	%ymm4,%ymm23,%ymm23

	vmovdqu64	%ymm1,(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vmovdqu64	%ymm2,160(%rdi)
	vmovdqu64	%ymm20,192(%rdi)
	vmovdqu64	%ymm21,224(%rdi)
	vmovdqu64	%ymm22,256(%rdi)
	vmovdqu64	%ymm23,288(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lrsaz_amm52x20_x2_256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc	
.size	ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
.text	

.align	32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc	
.byte	243,15,30,250
	leaq	(%rcx,%rcx,4),%rax
	salq	$5,%rax
	addq	%rax,%rsi

	vmovdqa64	.Lones(%rip),%ymm23
	vpbroadcastq	%rdx,%ymm22
	leaq	10240(%rsi),%rax

	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqa64	%ymm4,%ymm3
	vmovdqa64	%ymm4,%ymm2
	vmovdqa64	%ymm4,%ymm1
	vmovdqa64	%ymm4,%ymm0
	vmovdqa64	%ymm4,%ymm21

.align	32
.Lloop:
	vpcmpq	$0,%ymm21,%ymm22,%k1
	addq	$320,%rsi
	vpaddq	%ymm23,%ymm21,%ymm21
	vmovdqu64	-320(%rsi),%ymm16
	vmovdqu64	-288(%rsi),%ymm17
	vmovdqu64	-256(%rsi),%ymm18
	vmovdqu64	-224(%rsi),%ymm19
	vmovdqu64	-192(%rsi),%ymm20
	vpblendmq	%ymm16,%ymm0,%ymm0{%k1}
	vpblendmq	%ymm17,%ymm1,%ymm1{%k1}
	vpblendmq	%ymm18,%ymm2,%ymm2{%k1}
	vpblendmq	%ymm19,%ymm3,%ymm3{%k1}
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	cmpq	%rsi,%rax
	jne	.Lloop

	vmovdqu64	%ymm0,(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc	
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
.data	
.align	32
.Lones:
.quad	1,1,1,1
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4:
