diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0ada434
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+libs
+obj
+local.properties
diff --git a/Android.mk b/Android.mk
index 741123b..9f5834e 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,9 +1,13 @@
LOCAL_PATH := $(call my-dir)
-subdirs := $(addprefix $(LOCAL_PATH)/,$(addsuffix /Android.mk, \
- crypto \
- ssl \
- apps \
- ))
+# Uncomment the following to be able to use ALOG* with #include "cutils/log.h"
+#log_c_includes += system/core/include
+#log_shared_libraries := liblog
-include $(subdirs)
+# These makefiles are here instead of being Android.mk files in the
+# respective crypto, ssl, and apps directories so
+# that import_openssl.sh import won't remove them.
+include $(LOCAL_PATH)/build-config.mk
+include $(LOCAL_PATH)/Crypto.mk
+include $(LOCAL_PATH)/Ssl.mk
+include $(LOCAL_PATH)/Apps.mk
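As context for the commented-out logging hooks above: enabling them is just a matter of uncommenting the two assignments so Crypto.mk and Ssl.mk pick them up through $(log_c_includes) and $(log_shared_libraries). A minimal sketch, assuming a checkout where system/core/include and liblog are available:

```
# Sketch: top-level Android.mk with the optional logging hooks enabled
LOCAL_PATH := $(call my-dir)

# Make ALOG* usable via #include "cutils/log.h" in the sub-makefiles
log_c_includes += system/core/include
log_shared_libraries := liblog

include $(LOCAL_PATH)/build-config.mk
include $(LOCAL_PATH)/Crypto.mk
include $(LOCAL_PATH)/Ssl.mk
include $(LOCAL_PATH)/Apps.mk
```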
diff --git a/AndroidManifest.xml b/AndroidManifest.xml
new file mode 100644
index 0000000..7b13d57
--- /dev/null
+++ b/AndroidManifest.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
diff --git a/Apps-config.mk b/Apps-config.mk
new file mode 100644
index 0000000..2ee8026
--- /dev/null
+++ b/Apps-config.mk
@@ -0,0 +1,133 @@
+# Auto-generated - DO NOT EDIT!
+# To regenerate, edit openssl.config, then run:
+# ./import_openssl.sh import /path/to/openssl-1.0.1e.tar.gz
+#
+# Before including this file, the local Android.mk must define the following
+# variables:
+#
+# local_c_flags
+# local_c_includes
+# local_additional_dependencies
+#
+# This script will define the following variables:
+#
+# target_c_flags
+# target_c_includes
+# target_src_files
+#
+# host_c_flags
+# host_c_includes
+# host_src_files
+#
+
+# Ensure these are empty.
+unknown_arch_c_flags :=
+unknown_arch_src_files :=
+unknown_arch_exclude_files :=
+
+
+common_c_flags := \
+ -DMONOLITH \
+
+common_src_files := \
+ apps/app_rand.c \
+ apps/apps.c \
+ apps/asn1pars.c \
+ apps/ca.c \
+ apps/ciphers.c \
+ apps/crl.c \
+ apps/crl2p7.c \
+ apps/dgst.c \
+ apps/dh.c \
+ apps/dhparam.c \
+ apps/dsa.c \
+ apps/dsaparam.c \
+ apps/ec.c \
+ apps/ecparam.c \
+ apps/enc.c \
+ apps/engine.c \
+ apps/errstr.c \
+ apps/gendh.c \
+ apps/gendsa.c \
+ apps/genpkey.c \
+ apps/genrsa.c \
+ apps/nseq.c \
+ apps/ocsp.c \
+ apps/openssl.c \
+ apps/passwd.c \
+ apps/pkcs12.c \
+ apps/pkcs7.c \
+ apps/pkcs8.c \
+ apps/pkey.c \
+ apps/pkeyparam.c \
+ apps/pkeyutl.c \
+ apps/prime.c \
+ apps/rand.c \
+ apps/req.c \
+ apps/rsa.c \
+ apps/rsautl.c \
+ apps/s_cb.c \
+ apps/s_client.c \
+ apps/s_server.c \
+ apps/s_socket.c \
+ apps/s_time.c \
+ apps/sess_id.c \
+ apps/smime.c \
+ apps/speed.c \
+ apps/spkac.c \
+ apps/srp.c \
+ apps/verify.c \
+ apps/version.c \
+ apps/x509.c \
+
+common_c_includes := \
+ . \
+ include \
+
+arm_c_flags :=
+
+arm_src_files :=
+
+arm_exclude_files :=
+
+x86_c_flags :=
+
+x86_src_files :=
+
+x86_exclude_files :=
+
+x86_64_c_flags :=
+
+x86_64_src_files :=
+
+x86_64_exclude_files :=
+
+mips_c_flags :=
+
+mips_src_files :=
+
+mips_exclude_files :=
+
+target_arch := $(TARGET_ARCH)
+ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
+target_arch := unknown_arch
+endif
+
+target_c_flags := $(common_c_flags) $($(target_arch)_c_flags) $(local_c_flags)
+target_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+target_src_files := $(common_src_files) $($(target_arch)_src_files)
+target_src_files := $(filter-out $($(target_arch)_exclude_files), $(target_src_files))
+
+ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
+host_arch := x86
+else
+host_arch := unknown_arch
+endif
+
+host_c_flags := $(common_c_flags) $($(host_arch)_c_flags) $(local_c_flags)
+host_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+host_src_files := $(common_src_files) $($(host_arch)_src_files)
+host_src_files := $(filter-out $($(host_arch)_exclude_files), $(host_src_files))
+
+local_additional_dependencies += $(LOCAL_PATH)/Apps-config.mk
+
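To make the in/out contract described in the header comment concrete, here is a hedged sketch of a consumer makefile; the module name and extra flags are illustrative, not part of the project:

```
# Sketch: consuming Apps-config.mk (illustrative values)
LOCAL_PATH := $(call my-dir)

# Inputs the config expects before inclusion
local_c_flags := -DEXAMPLE_FLAG              # hypothetical
local_c_includes := $(LOCAL_PATH)/include    # hypothetical
local_additional_dependencies :=

include $(LOCAL_PATH)/Apps-config.mk

# Outputs: target_*/host_* with arch sources merged and excludes filtered out
include $(CLEAR_VARS)
LOCAL_MODULE := openssl-example              # hypothetical
LOCAL_SRC_FILES := $(target_src_files)
LOCAL_CFLAGS := $(target_c_flags)
LOCAL_C_INCLUDES := $(target_c_includes)
include $(BUILD_EXECUTABLE)
```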
diff --git a/Apps.mk b/Apps.mk
new file mode 100644
index 0000000..3ba6fe3
--- /dev/null
+++ b/Apps.mk
@@ -0,0 +1,34 @@
+# Copyright 2006 The Android Open Source Project
+
+LOCAL_PATH:= $(call my-dir)
+
+local_c_includes := $(NDK_PROJECT_PATH) \
+ $(NDK_PROJECT_PATH)/include
+local_c_flags :=
+
+local_additional_dependencies := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Apps.mk
+
+include $(LOCAL_PATH)/Apps-config.mk
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= openssl
+LOCAL_CLANG := true
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := $(target_src_files)
+LOCAL_SHARED_LIBRARIES := libssl libcrypto
+LOCAL_C_INCLUDES := $(target_c_includes)
+LOCAL_CFLAGS := $(target_c_flags)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(LOCAL_PATH)/android-config.mk
+include $(BUILD_EXECUTABLE)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= openssl
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := $(host_src_files)
+LOCAL_SHARED_LIBRARIES := libssl-host libcrypto-host
+LOCAL_C_INCLUDES := $(host_c_includes)
+LOCAL_CFLAGS := $(host_c_flags)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(LOCAL_PATH)/android-config.mk
+#include $(BUILD_HOST_EXECUTABLE)
diff --git a/CleanSpec.mk b/CleanSpec.mk
index 7549ef9..3c1f850 100644
--- a/CleanSpec.mk
+++ b/CleanSpec.mk
@@ -54,6 +54,7 @@ $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libssl_interme
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libcrypto_intermediates)
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libssl_static_intermediates)
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libcrypto_static_intermediates)
+$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/EXECUTABLES/*ssl*_intermediates $(PRODUCT_OUT)/obj/*/libssl_*intermediates $(PRODUCT_OUT)/obj/*/libcrypto_*intermediates)
# ************************************************
# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
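For reference, add-clean-step registers a one-time cleanup command that the platform build executes when it first encounters the new step; a hedged sketch of adding a further step (the path is illustrative):

```
# Sketch: hypothetical additional clean step
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/EXECUTABLES/ssltest_intermediates)
```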
diff --git a/Crypto-config.mk b/Crypto-config.mk
new file mode 100644
index 0000000..d74a791
--- /dev/null
+++ b/Crypto-config.mk
@@ -0,0 +1,686 @@
+# Auto-generated - DO NOT EDIT!
+# To regenerate, edit openssl.config, then run:
+# ./import_openssl.sh import /path/to/openssl-1.0.1e.tar.gz
+#
+# Before including this file, the local Android.mk must define the following
+# variables:
+#
+# local_c_flags
+# local_c_includes
+# local_additional_dependencies
+#
+# This script will define the following variables:
+#
+# target_c_flags
+# target_c_includes
+# target_src_files
+#
+# host_c_flags
+# host_c_includes
+# host_src_files
+#
+
+# Ensure these are empty.
+unknown_arch_c_flags :=
+unknown_arch_src_files :=
+unknown_arch_exclude_files :=
+
+
+common_c_flags := \
+ -DNO_WINDOWS_BRAINDEATH \
+
+common_src_files := \
+ crypto/aes/aes_cbc.c \
+ crypto/aes/aes_cfb.c \
+ crypto/aes/aes_core.c \
+ crypto/aes/aes_ctr.c \
+ crypto/aes/aes_ecb.c \
+ crypto/aes/aes_misc.c \
+ crypto/aes/aes_ofb.c \
+ crypto/aes/aes_wrap.c \
+ crypto/asn1/a_bitstr.c \
+ crypto/asn1/a_bool.c \
+ crypto/asn1/a_bytes.c \
+ crypto/asn1/a_d2i_fp.c \
+ crypto/asn1/a_digest.c \
+ crypto/asn1/a_dup.c \
+ crypto/asn1/a_enum.c \
+ crypto/asn1/a_gentm.c \
+ crypto/asn1/a_i2d_fp.c \
+ crypto/asn1/a_int.c \
+ crypto/asn1/a_mbstr.c \
+ crypto/asn1/a_object.c \
+ crypto/asn1/a_octet.c \
+ crypto/asn1/a_print.c \
+ crypto/asn1/a_set.c \
+ crypto/asn1/a_sign.c \
+ crypto/asn1/a_strex.c \
+ crypto/asn1/a_strnid.c \
+ crypto/asn1/a_time.c \
+ crypto/asn1/a_type.c \
+ crypto/asn1/a_utctm.c \
+ crypto/asn1/a_utf8.c \
+ crypto/asn1/a_verify.c \
+ crypto/asn1/ameth_lib.c \
+ crypto/asn1/asn1_err.c \
+ crypto/asn1/asn1_gen.c \
+ crypto/asn1/asn1_lib.c \
+ crypto/asn1/asn1_par.c \
+ crypto/asn1/asn_mime.c \
+ crypto/asn1/asn_moid.c \
+ crypto/asn1/asn_pack.c \
+ crypto/asn1/bio_asn1.c \
+ crypto/asn1/bio_ndef.c \
+ crypto/asn1/d2i_pr.c \
+ crypto/asn1/d2i_pu.c \
+ crypto/asn1/evp_asn1.c \
+ crypto/asn1/f_enum.c \
+ crypto/asn1/f_int.c \
+ crypto/asn1/f_string.c \
+ crypto/asn1/i2d_pr.c \
+ crypto/asn1/i2d_pu.c \
+ crypto/asn1/n_pkey.c \
+ crypto/asn1/nsseq.c \
+ crypto/asn1/p5_pbe.c \
+ crypto/asn1/p5_pbev2.c \
+ crypto/asn1/p8_pkey.c \
+ crypto/asn1/t_bitst.c \
+ crypto/asn1/t_crl.c \
+ crypto/asn1/t_pkey.c \
+ crypto/asn1/t_req.c \
+ crypto/asn1/t_spki.c \
+ crypto/asn1/t_x509.c \
+ crypto/asn1/t_x509a.c \
+ crypto/asn1/tasn_dec.c \
+ crypto/asn1/tasn_enc.c \
+ crypto/asn1/tasn_fre.c \
+ crypto/asn1/tasn_new.c \
+ crypto/asn1/tasn_prn.c \
+ crypto/asn1/tasn_typ.c \
+ crypto/asn1/tasn_utl.c \
+ crypto/asn1/x_algor.c \
+ crypto/asn1/x_attrib.c \
+ crypto/asn1/x_bignum.c \
+ crypto/asn1/x_crl.c \
+ crypto/asn1/x_exten.c \
+ crypto/asn1/x_info.c \
+ crypto/asn1/x_long.c \
+ crypto/asn1/x_name.c \
+ crypto/asn1/x_nx509.c \
+ crypto/asn1/x_pkey.c \
+ crypto/asn1/x_pubkey.c \
+ crypto/asn1/x_req.c \
+ crypto/asn1/x_sig.c \
+ crypto/asn1/x_spki.c \
+ crypto/asn1/x_val.c \
+ crypto/asn1/x_x509.c \
+ crypto/asn1/x_x509a.c \
+ crypto/bf/bf_cfb64.c \
+ crypto/bf/bf_ecb.c \
+ crypto/bf/bf_enc.c \
+ crypto/bf/bf_ofb64.c \
+ crypto/bf/bf_skey.c \
+ crypto/bio/b_dump.c \
+ crypto/bio/b_print.c \
+ crypto/bio/b_sock.c \
+ crypto/bio/bf_buff.c \
+ crypto/bio/bf_nbio.c \
+ crypto/bio/bf_null.c \
+ crypto/bio/bio_cb.c \
+ crypto/bio/bio_err.c \
+ crypto/bio/bio_lib.c \
+ crypto/bio/bss_acpt.c \
+ crypto/bio/bss_bio.c \
+ crypto/bio/bss_conn.c \
+ crypto/bio/bss_dgram.c \
+ crypto/bio/bss_fd.c \
+ crypto/bio/bss_file.c \
+ crypto/bio/bss_log.c \
+ crypto/bio/bss_mem.c \
+ crypto/bio/bss_null.c \
+ crypto/bio/bss_sock.c \
+ crypto/bn/bn_add.c \
+ crypto/bn/bn_asm.c \
+ crypto/bn/bn_blind.c \
+ crypto/bn/bn_const.c \
+ crypto/bn/bn_ctx.c \
+ crypto/bn/bn_div.c \
+ crypto/bn/bn_err.c \
+ crypto/bn/bn_exp.c \
+ crypto/bn/bn_exp2.c \
+ crypto/bn/bn_gcd.c \
+ crypto/bn/bn_gf2m.c \
+ crypto/bn/bn_kron.c \
+ crypto/bn/bn_lib.c \
+ crypto/bn/bn_mod.c \
+ crypto/bn/bn_mont.c \
+ crypto/bn/bn_mpi.c \
+ crypto/bn/bn_mul.c \
+ crypto/bn/bn_nist.c \
+ crypto/bn/bn_prime.c \
+ crypto/bn/bn_print.c \
+ crypto/bn/bn_rand.c \
+ crypto/bn/bn_recp.c \
+ crypto/bn/bn_shift.c \
+ crypto/bn/bn_sqr.c \
+ crypto/bn/bn_sqrt.c \
+ crypto/bn/bn_word.c \
+ crypto/buffer/buf_err.c \
+ crypto/buffer/buf_str.c \
+ crypto/buffer/buffer.c \
+ crypto/cmac/cm_ameth.c \
+ crypto/cmac/cm_pmeth.c \
+ crypto/cmac/cmac.c \
+ crypto/comp/c_rle.c \
+ crypto/comp/c_zlib.c \
+ crypto/comp/comp_err.c \
+ crypto/comp/comp_lib.c \
+ crypto/conf/conf_api.c \
+ crypto/conf/conf_def.c \
+ crypto/conf/conf_err.c \
+ crypto/conf/conf_lib.c \
+ crypto/conf/conf_mall.c \
+ crypto/conf/conf_mod.c \
+ crypto/conf/conf_sap.c \
+ crypto/cpt_err.c \
+ crypto/cryptlib.c \
+ crypto/cversion.c \
+ crypto/des/cbc_cksm.c \
+ crypto/des/cbc_enc.c \
+ crypto/des/cfb64ede.c \
+ crypto/des/cfb64enc.c \
+ crypto/des/cfb_enc.c \
+ crypto/des/des_enc.c \
+ crypto/des/des_old.c \
+ crypto/des/des_old2.c \
+ crypto/des/ecb3_enc.c \
+ crypto/des/ecb_enc.c \
+ crypto/des/ede_cbcm_enc.c \
+ crypto/des/enc_read.c \
+ crypto/des/enc_writ.c \
+ crypto/des/fcrypt.c \
+ crypto/des/fcrypt_b.c \
+ crypto/des/ofb64ede.c \
+ crypto/des/ofb64enc.c \
+ crypto/des/ofb_enc.c \
+ crypto/des/pcbc_enc.c \
+ crypto/des/qud_cksm.c \
+ crypto/des/rand_key.c \
+ crypto/des/read2pwd.c \
+ crypto/des/rpc_enc.c \
+ crypto/des/set_key.c \
+ crypto/des/str2key.c \
+ crypto/des/xcbc_enc.c \
+ crypto/dh/dh_ameth.c \
+ crypto/dh/dh_asn1.c \
+ crypto/dh/dh_check.c \
+ crypto/dh/dh_depr.c \
+ crypto/dh/dh_err.c \
+ crypto/dh/dh_gen.c \
+ crypto/dh/dh_key.c \
+ crypto/dh/dh_lib.c \
+ crypto/dh/dh_pmeth.c \
+ crypto/dsa/dsa_ameth.c \
+ crypto/dsa/dsa_asn1.c \
+ crypto/dsa/dsa_depr.c \
+ crypto/dsa/dsa_err.c \
+ crypto/dsa/dsa_gen.c \
+ crypto/dsa/dsa_key.c \
+ crypto/dsa/dsa_lib.c \
+ crypto/dsa/dsa_ossl.c \
+ crypto/dsa/dsa_pmeth.c \
+ crypto/dsa/dsa_prn.c \
+ crypto/dsa/dsa_sign.c \
+ crypto/dsa/dsa_vrf.c \
+ crypto/dso/dso_dl.c \
+ crypto/dso/dso_dlfcn.c \
+ crypto/dso/dso_err.c \
+ crypto/dso/dso_lib.c \
+ crypto/dso/dso_null.c \
+ crypto/dso/dso_openssl.c \
+ crypto/ebcdic.c \
+ crypto/ec/ec2_mult.c \
+ crypto/ec/ec2_oct.c \
+ crypto/ec/ec2_smpl.c \
+ crypto/ec/ec_ameth.c \
+ crypto/ec/ec_asn1.c \
+ crypto/ec/ec_check.c \
+ crypto/ec/ec_curve.c \
+ crypto/ec/ec_cvt.c \
+ crypto/ec/ec_err.c \
+ crypto/ec/ec_key.c \
+ crypto/ec/ec_lib.c \
+ crypto/ec/ec_mult.c \
+ crypto/ec/ec_oct.c \
+ crypto/ec/ec_pmeth.c \
+ crypto/ec/ec_print.c \
+ crypto/ec/eck_prn.c \
+ crypto/ec/ecp_mont.c \
+ crypto/ec/ecp_nist.c \
+ crypto/ec/ecp_oct.c \
+ crypto/ec/ecp_smpl.c \
+ crypto/ecdh/ech_err.c \
+ crypto/ecdh/ech_key.c \
+ crypto/ecdh/ech_lib.c \
+ crypto/ecdh/ech_ossl.c \
+ crypto/ecdsa/ecs_asn1.c \
+ crypto/ecdsa/ecs_err.c \
+ crypto/ecdsa/ecs_lib.c \
+ crypto/ecdsa/ecs_ossl.c \
+ crypto/ecdsa/ecs_sign.c \
+ crypto/ecdsa/ecs_vrf.c \
+ crypto/engine/eng_all.c \
+ crypto/engine/eng_cnf.c \
+ crypto/engine/eng_ctrl.c \
+ crypto/engine/eng_dyn.c \
+ crypto/engine/eng_err.c \
+ crypto/engine/eng_fat.c \
+ crypto/engine/eng_init.c \
+ crypto/engine/eng_lib.c \
+ crypto/engine/eng_list.c \
+ crypto/engine/eng_pkey.c \
+ crypto/engine/eng_table.c \
+ crypto/engine/tb_asnmth.c \
+ crypto/engine/tb_cipher.c \
+ crypto/engine/tb_dh.c \
+ crypto/engine/tb_digest.c \
+ crypto/engine/tb_dsa.c \
+ crypto/engine/tb_ecdh.c \
+ crypto/engine/tb_ecdsa.c \
+ crypto/engine/tb_pkmeth.c \
+ crypto/engine/tb_rand.c \
+ crypto/engine/tb_rsa.c \
+ crypto/engine/tb_store.c \
+ crypto/err/err.c \
+ crypto/err/err_all.c \
+ crypto/err/err_prn.c \
+ crypto/evp/bio_b64.c \
+ crypto/evp/bio_enc.c \
+ crypto/evp/bio_md.c \
+ crypto/evp/bio_ok.c \
+ crypto/evp/c_all.c \
+ crypto/evp/c_allc.c \
+ crypto/evp/c_alld.c \
+ crypto/evp/digest.c \
+ crypto/evp/e_aes.c \
+ crypto/evp/e_aes_cbc_hmac_sha1.c \
+ crypto/evp/e_bf.c \
+ crypto/evp/e_des.c \
+ crypto/evp/e_des3.c \
+ crypto/evp/e_null.c \
+ crypto/evp/e_old.c \
+ crypto/evp/e_rc2.c \
+ crypto/evp/e_rc4.c \
+ crypto/evp/e_rc4_hmac_md5.c \
+ crypto/evp/e_rc5.c \
+ crypto/evp/e_xcbc_d.c \
+ crypto/evp/encode.c \
+ crypto/evp/evp_acnf.c \
+ crypto/evp/evp_cnf.c \
+ crypto/evp/evp_enc.c \
+ crypto/evp/evp_err.c \
+ crypto/evp/evp_key.c \
+ crypto/evp/evp_lib.c \
+ crypto/evp/evp_pbe.c \
+ crypto/evp/evp_pkey.c \
+ crypto/evp/m_dss.c \
+ crypto/evp/m_dss1.c \
+ crypto/evp/m_ecdsa.c \
+ crypto/evp/m_md4.c \
+ crypto/evp/m_md5.c \
+ crypto/evp/m_mdc2.c \
+ crypto/evp/m_null.c \
+ crypto/evp/m_ripemd.c \
+ crypto/evp/m_sha1.c \
+ crypto/evp/m_sigver.c \
+ crypto/evp/m_wp.c \
+ crypto/evp/names.c \
+ crypto/evp/p5_crpt.c \
+ crypto/evp/p5_crpt2.c \
+ crypto/evp/p_dec.c \
+ crypto/evp/p_enc.c \
+ crypto/evp/p_lib.c \
+ crypto/evp/p_open.c \
+ crypto/evp/p_seal.c \
+ crypto/evp/p_sign.c \
+ crypto/evp/p_verify.c \
+ crypto/evp/pmeth_fn.c \
+ crypto/evp/pmeth_gn.c \
+ crypto/evp/pmeth_lib.c \
+ crypto/ex_data.c \
+ crypto/hmac/hm_ameth.c \
+ crypto/hmac/hm_pmeth.c \
+ crypto/hmac/hmac.c \
+ crypto/krb5/krb5_asn.c \
+ crypto/lhash/lh_stats.c \
+ crypto/lhash/lhash.c \
+ crypto/md4/md4_dgst.c \
+ crypto/md4/md4_one.c \
+ crypto/md5/md5_dgst.c \
+ crypto/md5/md5_one.c \
+ crypto/mem.c \
+ crypto/mem_clr.c \
+ crypto/mem_dbg.c \
+ crypto/modes/cbc128.c \
+ crypto/modes/ccm128.c \
+ crypto/modes/cfb128.c \
+ crypto/modes/ctr128.c \
+ crypto/modes/gcm128.c \
+ crypto/modes/ofb128.c \
+ crypto/modes/xts128.c \
+ crypto/o_dir.c \
+ crypto/o_init.c \
+ crypto/o_str.c \
+ crypto/o_time.c \
+ crypto/objects/o_names.c \
+ crypto/objects/obj_dat.c \
+ crypto/objects/obj_err.c \
+ crypto/objects/obj_lib.c \
+ crypto/objects/obj_xref.c \
+ crypto/ocsp/ocsp_asn.c \
+ crypto/ocsp/ocsp_cl.c \
+ crypto/ocsp/ocsp_err.c \
+ crypto/ocsp/ocsp_ext.c \
+ crypto/ocsp/ocsp_ht.c \
+ crypto/ocsp/ocsp_lib.c \
+ crypto/ocsp/ocsp_prn.c \
+ crypto/ocsp/ocsp_srv.c \
+ crypto/ocsp/ocsp_vfy.c \
+ crypto/pem/pem_all.c \
+ crypto/pem/pem_err.c \
+ crypto/pem/pem_info.c \
+ crypto/pem/pem_lib.c \
+ crypto/pem/pem_oth.c \
+ crypto/pem/pem_pk8.c \
+ crypto/pem/pem_pkey.c \
+ crypto/pem/pem_seal.c \
+ crypto/pem/pem_sign.c \
+ crypto/pem/pem_x509.c \
+ crypto/pem/pem_xaux.c \
+ crypto/pem/pvkfmt.c \
+ crypto/pkcs12/p12_add.c \
+ crypto/pkcs12/p12_asn.c \
+ crypto/pkcs12/p12_attr.c \
+ crypto/pkcs12/p12_crpt.c \
+ crypto/pkcs12/p12_crt.c \
+ crypto/pkcs12/p12_decr.c \
+ crypto/pkcs12/p12_init.c \
+ crypto/pkcs12/p12_key.c \
+ crypto/pkcs12/p12_kiss.c \
+ crypto/pkcs12/p12_mutl.c \
+ crypto/pkcs12/p12_npas.c \
+ crypto/pkcs12/p12_p8d.c \
+ crypto/pkcs12/p12_p8e.c \
+ crypto/pkcs12/p12_utl.c \
+ crypto/pkcs12/pk12err.c \
+ crypto/pkcs7/pk7_asn1.c \
+ crypto/pkcs7/pk7_attr.c \
+ crypto/pkcs7/pk7_doit.c \
+ crypto/pkcs7/pk7_lib.c \
+ crypto/pkcs7/pk7_mime.c \
+ crypto/pkcs7/pk7_smime.c \
+ crypto/pkcs7/pkcs7err.c \
+ crypto/pqueue/pqueue.c \
+ crypto/rand/md_rand.c \
+ crypto/rand/rand_egd.c \
+ crypto/rand/rand_err.c \
+ crypto/rand/rand_lib.c \
+ crypto/rand/rand_unix.c \
+ crypto/rand/rand_win.c \
+ crypto/rand/randfile.c \
+ crypto/rc2/rc2_cbc.c \
+ crypto/rc2/rc2_ecb.c \
+ crypto/rc2/rc2_skey.c \
+ crypto/rc2/rc2cfb64.c \
+ crypto/rc2/rc2ofb64.c \
+ crypto/rc4/rc4_enc.c \
+ crypto/rc4/rc4_skey.c \
+ crypto/rc4/rc4_utl.c \
+ crypto/ripemd/rmd_dgst.c \
+ crypto/ripemd/rmd_one.c \
+ crypto/rsa/rsa_ameth.c \
+ crypto/rsa/rsa_asn1.c \
+ crypto/rsa/rsa_chk.c \
+ crypto/rsa/rsa_crpt.c \
+ crypto/rsa/rsa_eay.c \
+ crypto/rsa/rsa_err.c \
+ crypto/rsa/rsa_gen.c \
+ crypto/rsa/rsa_lib.c \
+ crypto/rsa/rsa_none.c \
+ crypto/rsa/rsa_null.c \
+ crypto/rsa/rsa_oaep.c \
+ crypto/rsa/rsa_pk1.c \
+ crypto/rsa/rsa_pmeth.c \
+ crypto/rsa/rsa_prn.c \
+ crypto/rsa/rsa_pss.c \
+ crypto/rsa/rsa_saos.c \
+ crypto/rsa/rsa_sign.c \
+ crypto/rsa/rsa_ssl.c \
+ crypto/rsa/rsa_x931.c \
+ crypto/sha/sha1_one.c \
+ crypto/sha/sha1dgst.c \
+ crypto/sha/sha256.c \
+ crypto/sha/sha512.c \
+ crypto/sha/sha_dgst.c \
+ crypto/srp/srp_lib.c \
+ crypto/srp/srp_vfy.c \
+ crypto/stack/stack.c \
+ crypto/ts/ts_err.c \
+ crypto/txt_db/txt_db.c \
+ crypto/ui/ui_compat.c \
+ crypto/ui/ui_err.c \
+ crypto/ui/ui_lib.c \
+ crypto/ui/ui_openssl.c \
+ crypto/ui/ui_util.c \
+ crypto/uid.c \
+ crypto/x509/by_dir.c \
+ crypto/x509/by_file.c \
+ crypto/x509/x509_att.c \
+ crypto/x509/x509_cmp.c \
+ crypto/x509/x509_d2.c \
+ crypto/x509/x509_def.c \
+ crypto/x509/x509_err.c \
+ crypto/x509/x509_ext.c \
+ crypto/x509/x509_lu.c \
+ crypto/x509/x509_obj.c \
+ crypto/x509/x509_r2x.c \
+ crypto/x509/x509_req.c \
+ crypto/x509/x509_set.c \
+ crypto/x509/x509_trs.c \
+ crypto/x509/x509_txt.c \
+ crypto/x509/x509_v3.c \
+ crypto/x509/x509_vfy.c \
+ crypto/x509/x509_vpm.c \
+ crypto/x509/x509cset.c \
+ crypto/x509/x509name.c \
+ crypto/x509/x509rset.c \
+ crypto/x509/x509spki.c \
+ crypto/x509/x509type.c \
+ crypto/x509/x_all.c \
+ crypto/x509v3/pcy_cache.c \
+ crypto/x509v3/pcy_data.c \
+ crypto/x509v3/pcy_lib.c \
+ crypto/x509v3/pcy_map.c \
+ crypto/x509v3/pcy_node.c \
+ crypto/x509v3/pcy_tree.c \
+ crypto/x509v3/v3_akey.c \
+ crypto/x509v3/v3_akeya.c \
+ crypto/x509v3/v3_alt.c \
+ crypto/x509v3/v3_bcons.c \
+ crypto/x509v3/v3_bitst.c \
+ crypto/x509v3/v3_conf.c \
+ crypto/x509v3/v3_cpols.c \
+ crypto/x509v3/v3_crld.c \
+ crypto/x509v3/v3_enum.c \
+ crypto/x509v3/v3_extku.c \
+ crypto/x509v3/v3_genn.c \
+ crypto/x509v3/v3_ia5.c \
+ crypto/x509v3/v3_info.c \
+ crypto/x509v3/v3_int.c \
+ crypto/x509v3/v3_lib.c \
+ crypto/x509v3/v3_ncons.c \
+ crypto/x509v3/v3_ocsp.c \
+ crypto/x509v3/v3_pci.c \
+ crypto/x509v3/v3_pcia.c \
+ crypto/x509v3/v3_pcons.c \
+ crypto/x509v3/v3_pku.c \
+ crypto/x509v3/v3_pmaps.c \
+ crypto/x509v3/v3_prn.c \
+ crypto/x509v3/v3_purp.c \
+ crypto/x509v3/v3_skey.c \
+ crypto/x509v3/v3_sxnet.c \
+ crypto/x509v3/v3_utl.c \
+ crypto/x509v3/v3err.c \
+
+common_c_includes := \
+ . \
+ crypto \
+ crypto/asn1 \
+ crypto/evp \
+ crypto/modes \
+ include \
+ include/openssl \
+
+arm_c_flags := \
+ -DAES_ASM \
+ -DGHASH_ASM \
+ -DOPENSSL_BN_ASM_GF2m \
+ -DOPENSSL_BN_ASM_MONT \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+ -DSHA512_ASM \
+
+arm_src_files := \
+ crypto/aes/asm/aes-armv4.S \
+ crypto/bn/asm/armv4-gf2m.S \
+ crypto/bn/asm/armv4-mont.S \
+ crypto/modes/asm/ghash-armv4.S \
+ crypto/sha/asm/sha1-armv4-large.S \
+ crypto/sha/asm/sha256-armv4.S \
+ crypto/sha/asm/sha512-armv4.S \
+
+arm_exclude_files := \
+ crypto/aes/aes_core.c \
+
+x86_c_flags := \
+ -DAES_ASM \
+ -DDES_PTR \
+ -DDES_RISC1 \
+ -DDES_UNROLL \
+ -DGHASH_ASM \
+ -DMD5_ASM \
+ -DOPENSSL_BN_ASM_GF2m \
+ -DOPENSSL_BN_ASM_MONT \
+ -DOPENSSL_BN_ASM_PART_WORDS \
+ -DOPENSSL_CPUID_OBJ \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+ -DSHA512_ASM \
+
+x86_src_files := \
+ crypto/aes/asm/aes-586.S \
+ crypto/aes/asm/aesni-x86.S \
+ crypto/aes/asm/vpaes-x86.S \
+ crypto/bf/asm/bf-586.S \
+ crypto/bn/asm/bn-586.S \
+ crypto/bn/asm/co-586.S \
+ crypto/bn/asm/x86-gf2m.S \
+ crypto/bn/asm/x86-mont.S \
+ crypto/des/asm/crypt586.S \
+ crypto/des/asm/des-586.S \
+ crypto/md5/asm/md5-586.S \
+ crypto/modes/asm/ghash-x86.S \
+ crypto/sha/asm/sha1-586.S \
+ crypto/sha/asm/sha256-586.S \
+ crypto/sha/asm/sha512-586.S \
+ crypto/x86cpuid.S \
+
+x86_exclude_files := \
+ crypto/aes/aes_cbc.c \
+ crypto/aes/aes_core.c \
+ crypto/bf/bf_enc.c \
+ crypto/bn/bn_asm.c \
+ crypto/des/des_enc.c \
+ crypto/des/fcrypt_b.c \
+ crypto/mem_clr.c \
+
+x86_64_c_flags := \
+ -DAES_ASM \
+ -DDES_PTR \
+ -DDES_RISC1 \
+ -DDES_UNROLL \
+ -DGHASH_ASM \
+ -DMD5_ASM \
+ -DOPENSSL_BN_ASM_GF2m \
+ -DOPENSSL_BN_ASM_MONT \
+ -DOPENSSL_CPUID_OBJ \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+ -DSHA512_ASM \
+
+x86_64_src_files := \
+ crypto/aes/asm/aes-x86_64.S \
+ crypto/aes/asm/aesni-sha1-x86_64.S \
+ crypto/aes/asm/aesni-x86_64.S \
+ crypto/aes/asm/bsaes-x86_64.S \
+ crypto/aes/asm/vpaes-x86_64.S \
+ crypto/bn/asm/modexp512-x86_64.S \
+ crypto/bn/asm/x86_64-gcc.c \
+ crypto/bn/asm/x86_64-gf2m.S \
+ crypto/bn/asm/x86_64-mont.S \
+ crypto/bn/asm/x86_64-mont5.S \
+ crypto/md5/asm/md5-x86_64.S \
+ crypto/modes/asm/ghash-x86_64.S \
+ crypto/rc4/asm/rc4-md5-x86_64.S \
+ crypto/rc4/asm/rc4-x86_64.S \
+ crypto/sha/asm/sha1-x86_64.S \
+ crypto/sha/asm/sha256-x86_64.S \
+ crypto/sha/asm/sha512-x86_64.S \
+ crypto/x86_64cpuid.S \
+
+x86_64_exclude_files := \
+ crypto/aes/aes_cbc.c \
+ crypto/aes/aes_core.c \
+ crypto/mem_clr.c \
+ crypto/rc4/rc4_enc.c \
+
+mips_c_flags := \
+ -DAES_ASM \
+ -DOPENSSL_BN_ASM_MONT \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+
+mips_src_files := \
+ crypto/aes/asm/aes-mips.S \
+ crypto/bn/asm/bn-mips.S \
+ crypto/bn/asm/mips-mont.S \
+ crypto/sha/asm/sha1-mips.S \
+ crypto/sha/asm/sha256-mips.S \
+
+mips_exclude_files := \
+ crypto/aes/aes_core.c \
+ crypto/bn/bn_asm.c \
+
+target_arch := $(TARGET_ARCH)
+ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
+target_arch := unknown_arch
+endif
+
+target_c_flags := $(common_c_flags) $($(target_arch)_c_flags) $(local_c_flags)
+target_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+target_src_files := $(common_src_files) $($(target_arch)_src_files)
+target_src_files := $(filter-out $($(target_arch)_exclude_files), $(target_src_files))
+
+ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
+host_arch := x86
+else
+host_arch := unknown_arch
+endif
+
+host_c_flags := $(common_c_flags) $($(host_arch)_c_flags) $(local_c_flags)
+host_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+host_src_files := $(common_src_files) $($(host_arch)_src_files)
+host_src_files := $(filter-out $($(host_arch)_exclude_files), $(host_src_files))
+
+local_additional_dependencies += $(LOCAL_PATH)/Crypto-config.mk
+
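The arch-specific blocks above implement a substitution scheme: each architecture contributes optimized assembly via *_src_files and lists the generic C files it replaces in *_exclude_files, which filter-out then strips from the source list. A minimal sketch of the same pattern for a hypothetical architecture:

```
# Sketch: swapping a generic C implementation for an arch-specific .S file
newarch_src_files := crypto/aes/asm/aes-newarch.S  # hypothetical assembly source
newarch_exclude_files := crypto/aes/aes_core.c     # generic file it replaces

target_src_files := $(common_src_files) $(newarch_src_files)
target_src_files := $(filter-out $(newarch_exclude_files), $(target_src_files))
```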
diff --git a/Crypto.mk b/Crypto.mk
new file mode 100644
index 0000000..d2be80e
--- /dev/null
+++ b/Crypto.mk
@@ -0,0 +1,92 @@
+local_c_flags :=
+
+local_c_includes := $(log_c_includes) \
+ $(APP_PROJECT_PATH) \
+ $(NDK_PROJECT_PATH)/crypto \
+ $(NDK_PROJECT_PATH)/crypto/asn1 \
+ $(NDK_PROJECT_PATH)/crypto/evp \
+ $(NDK_PROJECT_PATH)/crypto/modes \
+ $(NDK_PROJECT_PATH)/include \
+ $(NDK_PROJECT_PATH)/include/openssl
+
+local_additional_dependencies := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Crypto.mk
+
+include $(LOCAL_PATH)/Crypto-config.mk
+
+#######################################
+# target static library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto_static
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_STATIC_LIBRARY)
+
+#######################################
+# target shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+LOCAL_LDFLAGS += -ldl
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_SHARED_LIBRARY)
+
+#######################################
+# host shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+LOCAL_SRC_FILES += $(host_src_files)
+LOCAL_CFLAGS += $(host_c_flags) -DPURIFY
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_LDLIBS += -ldl
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto-host
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+#include $(BUILD_HOST_SHARED_LIBRARY)
+
+########################################
+# host static library, which is used by some SDK tools.
+
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+LOCAL_SRC_FILES += $(host_src_files)
+LOCAL_CFLAGS += $(host_c_flags) -DPURIFY
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_LDLIBS += -ldl
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto_static
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+#include $(BUILD_HOST_STATIC_LIBRARY)
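A module elsewhere in the tree would consume the libraries built here in the usual way; a hedged sketch of such a consumer (module and source names are illustrative):

```
# Sketch: hypothetical native module linking against libcrypto built above
include $(CLEAR_VARS)
LOCAL_MODULE := myapp-native                 # hypothetical
LOCAL_SRC_FILES := myapp.c                   # hypothetical
LOCAL_C_INCLUDES := external/openssl/include
LOCAL_SHARED_LIBRARIES := libcrypto
include $(BUILD_SHARED_LIBRARY)
```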
diff --git a/README.android b/README.android
index e93fb00..a7c3cc1 100644
--- a/README.android
+++ b/README.android
@@ -23,7 +23,6 @@ The following steps are recommended for porting new OpenSSL versions.
2) Update the variables in openssl.config and openssl.version as appropriate.
At the very least you will need to update the openssl.version.
- Similarly update ThirdPartyProject.prop.
3) Run:
@@ -54,17 +53,23 @@ The following steps are recommended for porting new OpenSSL versions.
# Build and sync libcore tests
(croot && cd libcore && mm -j16 snod && adb remount && adb sync)
# Run tests from libcore
- (croot && vogar --classpath out/target/common/obj/JAVA_LIBRARIES/core-tests-support_intermediates/classes.jar --classpath out/target/common/obj/JAVA_LIBRARIES/core-tests_intermediates/classes.jar javax.net.ssl tests.api.javax.net)
+ (croot && vogar --classpath out/target/common/obj/JAVA_LIBRARIES/core-tests_intermediates/classes.jar javax.net.ssl tests.api.javax.net)
# Run tests from Harmony
- (croot && vogar --classpath harmony_tests.jar tests.api.java.math.BigIntegerTest org.apache.harmony.tests.java.math)
+ (croot && vogar --classpath out/target/common/obj/JAVA_LIBRARIES/apache-harmony-tests_intermediates/classes.jar tests.api.java.math.BigIntegerTest org.apache.harmony.tests.java.math)
# try an https website
adb shell am start https://online.citibank.com # confirm result in browser
The vogar tool can be found externally at http://code.google.com/p/vogar/
- Within Google it can be run with ~dalvik-prebuild/vogar/bin/vogar
- harmony_tests.jar is built from Subversion http://harmony.apache.org/
- Within Google it can be found at ~dalvik-prebuild/bin/harmony_tests.jar
+ Quick installation instructions (without rebuilding from source):
+ VOGAR=$HOME/vogar
+ svn co http://vogar.googlecode.com/svn/trunk/ $VOGAR
+ mkdir -p $VOGAR/build/
+ curl -o $VOGAR/build/vogar.jar https://vogar.googlecode.com/files/vogar.jar
+ PATH=$PATH:$VOGAR/bin
+
+ Within Google, you can find it under:
+ /home/dalvik-prebuild/vogar/bin/vogar
# You can also run openssl s_server as a test server on the device:
adb push ./android.testssl/CAss.cnf /sdcard/CAss.cnf
@@ -76,17 +81,7 @@ The following steps are recommended for porting new OpenSSL versions.
m -j16
-Optionally, check whether build flags (located in android-config.mk
-need to be updated. Doing this step will help ensure that the
-compiled library is appropriately optimized for speed and size. To
-update build flags:
-
-a) source openssl.config
-b) tar -zxf openssl-*.tar.gz
-c) cd openssl-*/
-d) ./Configure $CONFIGURE_ARGS
-e) examine Makefile and compare with ../android-config.mk
-f) modify ../openssl.config as appropriate and go to step 3) above.
-
-Alternatively, ."/import_openssl.sh import" now prints the
-post-Configure Makefile for review before deleting in on import.
+Optionally, check whether build flags (located in CONFIGURE_ARGS in
+openssl.config, plus some extras in android-config.mk) need to be updated.
+Doing this step will help ensure that the compiled library is appropriately
+optimized for speed and size.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4a22d32
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+These are the official Android OpenSSL sources, with the makefiles lightly patched to build as an external project with the Android NDK. Similar to https://github.com/eighthave/openssl-android but updated for Android (OpenSSL 1.0.1e) and NDK r8d.
+
+To build, run:
+```
+ndk-build -j $(getconf _NPROCESSORS_ONLN)
+```
+
+* Upstream: https://android.googlesource.com/platform/external/openssl
+  commit ID af3454c5e67108665d5f26a90aae1831e26d38b7 on the master branch
+
+* Makefile patches inspired by: https://github.com/eighthave/openssl-android
+
+
+To see the patches, run:
+
+```
+git remote add upstream https://android.googlesource.com/platform/external/openssl
+git fetch upstream
+git diff upstream/master
+```
+
+
+# Background
+Although Android ships with OpenSSL, it understandably doesn't provide libcrypto and libssl as part of its public API. Developers porting apps that use OpenSSL to Android therefore need a project like this.
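If you need to control which ABIs ndk-build produces, the standard NDK mechanism is an Application.mk alongside the project makefiles; a minimal sketch (the ABI list is illustrative; android-9 matches the LOCAL_SDK_VERSION := 9 used by the project makefiles):

```
# jni/Application.mk (sketch)
APP_ABI := armeabi-v7a x86
APP_PLATFORM := android-9
```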
diff --git a/Ssl-config.mk b/Ssl-config.mk
new file mode 100644
index 0000000..e14e4bb
--- /dev/null
+++ b/Ssl-config.mk
@@ -0,0 +1,128 @@
+# Auto-generated - DO NOT EDIT!
+# To regenerate, edit openssl.config, then run:
+# ./import_openssl.sh import /path/to/openssl-1.0.1e.tar.gz
+#
+# Before including this file, the local Android.mk must define the following
+# variables:
+#
+# local_c_flags
+# local_c_includes
+# local_additional_dependencies
+#
+# This script will define the following variables:
+#
+# target_c_flags
+# target_c_includes
+# target_src_files
+#
+# host_c_flags
+# host_c_includes
+# host_src_files
+#
+
+# Ensure these are empty.
+unknown_arch_c_flags :=
+unknown_arch_src_files :=
+unknown_arch_exclude_files :=
+
+
+common_c_flags :=
+
+common_src_files := \
+ ssl/bio_ssl.c \
+ ssl/d1_both.c \
+ ssl/d1_enc.c \
+ ssl/d1_lib.c \
+ ssl/d1_pkt.c \
+ ssl/d1_srtp.c \
+ ssl/kssl.c \
+ ssl/s23_clnt.c \
+ ssl/s23_lib.c \
+ ssl/s23_meth.c \
+ ssl/s23_pkt.c \
+ ssl/s23_srvr.c \
+ ssl/s2_clnt.c \
+ ssl/s2_enc.c \
+ ssl/s2_lib.c \
+ ssl/s2_meth.c \
+ ssl/s2_pkt.c \
+ ssl/s2_srvr.c \
+ ssl/s3_both.c \
+ ssl/s3_cbc.c \
+ ssl/s3_clnt.c \
+ ssl/s3_enc.c \
+ ssl/s3_lib.c \
+ ssl/s3_meth.c \
+ ssl/s3_pkt.c \
+ ssl/s3_srvr.c \
+ ssl/ssl_algs.c \
+ ssl/ssl_asn1.c \
+ ssl/ssl_cert.c \
+ ssl/ssl_ciph.c \
+ ssl/ssl_err.c \
+ ssl/ssl_err2.c \
+ ssl/ssl_lib.c \
+ ssl/ssl_rsa.c \
+ ssl/ssl_sess.c \
+ ssl/ssl_stat.c \
+ ssl/ssl_txt.c \
+ ssl/t1_clnt.c \
+ ssl/t1_enc.c \
+ ssl/t1_lib.c \
+ ssl/t1_meth.c \
+ ssl/t1_reneg.c \
+ ssl/t1_srvr.c \
+ ssl/tls_srp.c \
+
+common_c_includes := \
+ . \
+ crypto \
+ include \
+
+arm_c_flags :=
+
+arm_src_files :=
+
+arm_exclude_files :=
+
+x86_c_flags :=
+
+x86_src_files :=
+
+x86_exclude_files :=
+
+x86_64_c_flags :=
+
+x86_64_src_files :=
+
+x86_64_exclude_files :=
+
+mips_c_flags :=
+
+mips_src_files :=
+
+mips_exclude_files :=
+
+target_arch := $(TARGET_ARCH)
+ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
+target_arch := unknown_arch
+endif
+
+target_c_flags := $(common_c_flags) $($(target_arch)_c_flags) $(local_c_flags)
+target_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+target_src_files := $(common_src_files) $($(target_arch)_src_files)
+target_src_files := $(filter-out $($(target_arch)_exclude_files), $(target_src_files))
+
+ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
+host_arch := x86
+else
+host_arch := unknown_arch
+endif
+
+host_c_flags := $(common_c_flags) $($(host_arch)_c_flags) $(local_c_flags)
+host_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+host_src_files := $(common_src_files) $($(host_arch)_src_files)
+host_src_files := $(filter-out $($(host_arch)_exclude_files), $(host_src_files))
+
+local_additional_dependencies += $(LOCAL_PATH)/Ssl-config.mk
+
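Worth noting about the guard above: because the unknown_arch_* variables are defined empty at the top of the file, demoting target_arch to unknown_arch quietly yields a portable pure-C build with no arch-specific sources or excludes. The pattern in isolation, as a sketch:

```
# Sketch: unsupported combinations fall back to empty arch variables
target_arch := $(TARGET_ARCH)
ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
target_arch := unknown_arch  # unknown_arch_* are all empty => pure-C build
endif
```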
diff --git a/Ssl.mk b/Ssl.mk
new file mode 100644
index 0000000..546fc08
--- /dev/null
+++ b/Ssl.mk
@@ -0,0 +1,84 @@
+local_c_flags :=
+
+local_c_includes := $(log_c_includes) \
+ $(NDK_PROJECT_PATH) \
+ $(NDK_PROJECT_PATH)/crypto \
+ $(NDK_PROJECT_PATH)/crypto/asn1 \
+ $(NDK_PROJECT_PATH)/crypto/evp \
+ $(NDK_PROJECT_PATH)/include \
+ $(NDK_PROJECT_PATH)/include/openssl
+
+local_additional_dependencies := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Ssl.mk
+
+include $(LOCAL_PATH)/Ssl-config.mk
+
+#######################################
+# target static library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_SHARED_LIBRARIES = $(log_shared_libraries)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libssl_static
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_STATIC_LIBRARY)
+
+#######################################
+# target shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_SHARED_LIBRARIES += libcrypto $(log_shared_libraries)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libssl
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_SHARED_LIBRARY)
+
+#######################################
+# host shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SRC_FILES += $(host_src_files)
+LOCAL_CFLAGS += $(host_c_flags)
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_SHARED_LIBRARIES += libcrypto-host $(log_shared_libraries)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libssl-host
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+#include $(BUILD_HOST_SHARED_LIBRARY)
+
+#######################################
+# ssltest
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SRC_FILES:= ssl/ssltest.c
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_SHARED_LIBRARIES := libssl libcrypto $(log_shared_libraries)
+LOCAL_MODULE:= ssltest
+LOCAL_MODULE_TAGS := optional
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_EXECUTABLE)
diff --git a/ThirdPartyProject.prop b/ThirdPartyProject.prop
deleted file mode 100644
index 34ad609..0000000
--- a/ThirdPartyProject.prop
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright 2010 Google Inc. All Rights Reserved.
-#Fri Jul 16 10:03:09 PDT 2010
-currentVersion=1.0.0e
-version=1.0.0e
-isNative=true
-feedurl=http\://www.openssl.org/
-name=openssl
-keywords=openssl
-onDevice=true
-homepage=http\://www.openssl.org/
diff --git a/android-config.mk b/android-config.mk
index d76d6e3..d1bda99 100644
--- a/android-config.mk
+++ b/android-config.mk
@@ -1,17 +1,30 @@
#
-# These flags represent the build-time configuration of openssl for android
+# These flags represent the build-time configuration of OpenSSL for Android
#
-# They were pruned from the "Makefile" generated by running ./Configure from import_openssl.sh
+# The value of $(openssl_cflags) was pruned from the Makefile generated
+# by running ./Configure from import_openssl.sh.
#
+# This makefile performs minor but required patching for the Android build.
+#
+
+LOCAL_CFLAGS += $(openssl_cflags)
-# From CLFAG=
-LOCAL_CFLAGS += -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -DL_ENDIAN #-DTERMIO
+LOCAL_CFLAGS := $(filter-out -DTERMIO, $(LOCAL_CFLAGS))
-# From DEPFLAG=
-LOCAL_CFLAGS += -DOPENSSL_NO_CAMELLIA -DOPENSSL_NO_CAPIENG -DOPENSSL_NO_CAST -DOPENSSL_NO_CMS -DOPENSSL_NO_GMP -DOPENSSL_NO_IDEA -DOPENSSL_NO_JPAKE -DOPENSSL_NO_MD2 -DOPENSSL_NO_MDC2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_SHA0 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SEED -DOPENSSL_NO_STORE -DOPENSSL_NO_WHIRLPOOL
+ifeq ($(HOST_OS),windows)
+LOCAL_CFLAGS := $(filter-out -DDSO_DLFCN -DHAVE_DLFCN_H,$(LOCAL_CFLAGS))
+endif
-# Extra
-LOCAL_CFLAGS += -DOPENSSL_NO_HW -DOPENSSL_NO_ENGINE -DZLIB
+# Directories
+LOCAL_CFLAGS += \
+ -DOPENSSLDIR="\"/system/lib/ssl\"" \
+ -DENGINESDIR="\"/system/lib/ssl/engines\""
+
+# Intentionally excluded http://b/7079965
+LOCAL_CFLAGS := $(filter-out -DZLIB, $(LOCAL_CFLAGS))
# Debug
# LOCAL_CFLAGS += -DCIPHER_DEBUG
+
+# Add clang here when it works on host
+# LOCAL_CLANG := true
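The new $(openssl_cflags) consumed above is not defined in this file; per the top-level Android.mk it presumably comes from the generated build-config.mk, derived from CONFIGURE_ARGS in openssl.config. A hedged sketch of what such a definition might look like, using a subset of the flags formerly hard-coded here (the real generated file may differ):

```
# Sketch: build-config.mk presumably defines something along these lines
openssl_cflags := \
    -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -DL_ENDIAN \
    -DOPENSSL_NO_CAMELLIA -DOPENSSL_NO_IDEA -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5
```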
diff --git a/android.testssl/CAss.cnf b/android.testssl/CAss.cnf
index 1173c08..77c01c3 100644
--- a/android.testssl/CAss.cnf
+++ b/android.testssl/CAss.cnf
@@ -7,7 +7,7 @@ RANDFILE = /sdcard/android.testssl/.rnd
####################################################################
[ req ]
-default_bits = 512
+default_bits = 2048
default_keyfile = keySS.pem
distinguished_name = req_distinguished_name
encrypt_rsa_key = no
diff --git a/android.testssl/Uss.cnf b/android.testssl/Uss.cnf
index 56dcdd5..317ab6d 100644
--- a/android.testssl/Uss.cnf
+++ b/android.testssl/Uss.cnf
@@ -7,11 +7,11 @@ RANDFILE = /sdcard/android.testssl/.rnd
####################################################################
[ req ]
-default_bits = 512
+default_bits = 2048
default_keyfile = keySS.pem
distinguished_name = req_distinguished_name
encrypt_rsa_key = no
-default_md = md2
+default_md = sha256
[ req_distinguished_name ]
countryName = Country Name (2 letter code)
diff --git a/android.testssl/server2.pem b/android.testssl/server2.pem
index 8bb6641..a3927cf 100644
--- a/android.testssl/server2.pem
+++ b/android.testssl/server2.pem
@@ -1,376 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (1024 bit)
------BEGIN CERTIFICATE-----
-MIICLjCCAZcCAQEwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU0WhcNOTgwNjA5
-MTM1NzU0WjBkMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxJDAiBgNVBAMTG1NlcnZlciB0ZXN0IGNl
-cnQgKDEwMjQgYml0KTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAsxH1PBPm
-RkxrR11eV4bzNi4N9n11CI8nV29+ARlT1+qDe/mjVUvXlmsr1v/vf71G9GgqopSa
-6RXrICLVdk/FYYYzhPvl1M+OrjaXDFO8BzBAF1Lnz6c7aRZvGRJNrRSr2nZEkqDf
-JW9dY7r2VZEpD5QeuaRYUnuECkqeieB65GMCAwEAATANBgkqhkiG9w0BAQQFAAOB
-gQCWsOta6C0wiVzXz8wPmJKyTrurMlgUss2iSuW9366iwofZddsNg7FXniMzkIf6
-dp7jnmWZwKZ9cXsNUS2o4OL07qOk2HOywC0YsNZQsOBu1CBTYYkIefDiKFL1zQHh
-8lwwNd4NP+OE3NzUNkCfh4DnFfg9WHkXUlD5UpxNRJ4gJA==
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQCzEfU8E+ZGTGtHXV5XhvM2Lg32fXUIjydXb34BGVPX6oN7+aNV
-S9eWayvW/+9/vUb0aCqilJrpFesgItV2T8VhhjOE++XUz46uNpcMU7wHMEAXUufP
-pztpFm8ZEk2tFKvadkSSoN8lb11juvZVkSkPlB65pFhSe4QKSp6J4HrkYwIDAQAB
-AoGBAKy8jvb0Lzby8q11yNLf7+78wCVdYi7ugMHcYA1JVFK8+zb1WfSm44FLQo/0
-dSChAjgz36TTexeLODPYxleJndjVcOMVzsLJjSM8dLpXsTS4FCeMbhw2s2u+xqKY
-bbPWfk+HOTyJjfnkcC5Nbg44eOmruq0gSmBeUXVM5UntlTnxAkEA7TGCA3h7kx5E
-Bl4zl2pc3gPAGt+dyfk5Po9mGJUUXhF5p2zueGmYWW74TmOWB1kzt4QRdYMzFePq
-zfDNXEa1CwJBAMFErdY0xp0UJ13WwBbUTk8rujqQdHtjw0klhpbuKkjxu2hN0wwM
-6p0D9qxF7JHaghqVRI0fAW/EE0OzdHMR9QkCQQDNR26dMFXKsoPu+vItljj/UEGf
-QG7gERiQ4yxaFBPHgdpGo0kT31eh9x9hQGDkxTe0GNG/YSgCRvm8+C3TMcKXAkBD
-dhGn36wkUFCddMSAM4NSJ1VN8/Z0y5HzCmI8dM3VwGtGMUQlxKxwOl30LEQzdS5M
-0SWojNYXiT2gOBfBwtbhAkEAhafl5QEOIgUz+XazS/IlZ8goNKdDVfYgK3mHHjvv
-nY5G+AuGebdNkXJr4KSWxDcN+C2i47zuj4QXA16MAOandA==
------END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert #2
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
+-----BEGIN CERTIFICATE-----
+MIID6jCCAtKgAwIBAgIJALnu1NlVpZ60MA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZzELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxHDAaBgNVBAMME1Rlc3QgU2VydmVyIENlcnQg
+IzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDrdi7j9yctG+L4EjBy
+gjPmEqZzOJEQba26MoQGzglU7e5Xf59Rb/hgVQuKAoiZe7/R8rK4zJ4W7iXdXw0L
+qBpyG8B5aGKeI32w+A9TcBApoXXL2CrYQEQjZwUIpLlYBIi2NkJj3nVkq5dgl1gO
+ALiQ+W8jg3kzg5Ec9rimp9r93N8wsSL3awsafurmYCvOf7leHaMP1WJ/zDRGUNHG
+/WtDjXc8ZUG1+6EXU9Jc2Fs+2Omf7fcN0l00AK/wPg8OaNS0rKyGq9JdIT9FRGV1
+bXe/rx58FaE5CItdwCSYhJvF/O95LWQoxJXye5bCFLmvDTEyVq9FMSCptfsmbXjE
+ZGsXAgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJ
+YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud
+DgQWBBR52UaWWTKzZGDH/X4mWNcuqeQVazAfBgNVHSMEGDAWgBQ2w2yI55X+sL3s
+zj49hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEANBW+XYLlHBqVY/31ie+3gRlS
+LPfy4SIqn0t3RJjagT29MXprblBO2cbMO8VGjkQdKGpmMXjxbht2arOOUXRHX4n/
+XTyn/QHEf0bcwIITMReO3DZUPAEw8hSjn9xEOM0IRVOCP+mH5fi74QzzQaZVCyYg
+5VtLKdww/+sc0nCbKl2KWgDluriH0nfVx95qgW3mg9dhXRr0zmf1w2zkBHYpARYL
+Dew6Z8EE4tS3HJu8/qM6meWzNtrfonQ3eiiMxjZBxzV46jchBwa2z9XYhP6AmpPb
+oeTSzcQNbWsxaGYzWo46oLDUZmJOwSBawbS31bZNMCoPIY6ukoesCzFSsUKZww==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
+MIIEowIBAAKCAQEA63Yu4/cnLRvi+BIwcoIz5hKmcziREG2tujKEBs4JVO3uV3+f
+UW/4YFULigKImXu/0fKyuMyeFu4l3V8NC6gachvAeWhiniN9sPgPU3AQKaF1y9gq
+2EBEI2cFCKS5WASItjZCY951ZKuXYJdYDgC4kPlvI4N5M4ORHPa4pqfa/dzfMLEi
+92sLGn7q5mArzn+5Xh2jD9Vif8w0RlDRxv1rQ413PGVBtfuhF1PSXNhbPtjpn+33
+DdJdNACv8D4PDmjUtKyshqvSXSE/RURldW13v68efBWhOQiLXcAkmISbxfzveS1k
+KMSV8nuWwhS5rw0xMlavRTEgqbX7Jm14xGRrFwIDAQABAoIBAHLsTPihIfLnYIE5
+x4GsQQ5zXeBw5ITDM37ktwHnQDC+rIzyUl1aLD1AZRBoKinXd4lOTqLZ4/NHKx4A
+DYr58mZtWyUmqLOMmQVuHXTZBlp7XtYuXMMNovQwjQlp9LicBeoBU6gQ5PVMtubD
+F4xGF89Sn0cTHW3iMkqTtQ5KcR1j57OcJO0FEb1vPvk2MXI5ZyAatUYE7YacbEzd
+rg02uIwx3FqNSkuSI79uz4hMdV5TPtuhxx9nTwj9aLUhXFeZ0mn2PVgVzEnnMoJb
++znlsZDgzDlJqdaD744YGWh8Z3OEssB35KfzFcdOeO6yH8lmv2Zfznk7pNPT7LTb
+Lae9VgkCgYEA92p1qnAB3NtJtNcaW53i0S5WJgS1hxWKvUDx3lTB9s8X9fHpqL1a
+E94fDfWzp/hax6FefUKIvBOukPLQ6bYjTMiFoOHzVirghAIuIUoMI5VtLhwD1hKs
+Lr7l/dptMgKb1nZHyXoKHRBthsy3K4+udsPi8TzMvYElgEqyQIe/Rk0CgYEA86GL
+8HC6zLszzKERDPBxrboRmoFvVUCTQDhsfj1M8aR3nQ8V5LkdIJc7Wqm/Ggfk9QRf
+rJ8M2WUMlU5CNnCn/KCrKzCNZIReze3fV+HnKdbcXGLvgbHPrhnz8yYehUFG+RGq
+bVyDWRU94T38izy2s5qMYrMJWZEYyXncSPbfcPMCgYAtaXfxcZ+V5xYPQFARMtiX
+5nZfggvDoJuXgx0h3tK/N2HBfcaSdzbaYLG4gTmZggc/jwnl2dl5E++9oSPhUdIG
+3ONSFUbxsOsGr9PBvnKd8WZZyUCXAVRjPBzAzF+whzQNWCZy/5htnz9LN7YDI9s0
+5113Q96cheDZPFydZY0hHQKBgQDVbEhNukM5xCiNcu+f2SaMnLp9EjQ4h5g3IvaP
+5B16daw/Dw8LzcohWboqIxeAsze0GD/D1ZUJAEd0qBjC3g+a9BjefervCjKOzXng
+38mEUm+6EwVjJSQcjSmycEs+Sr/kwr/8i5WYvU32+jk4tFgMoC+o6tQe/Uesf68k
+z/dPVwKBgGbF7Vv1/3SmhlOy+zYyvJ0CrWtKxH9QP6tLIEgEpd8x7YTSuCH94yok
+kToMXYA3sWNPt22GbRDZ+rcp4c7HkDx6I6vpdP9aQEwJTp0EPy0sgWr2XwYmreIQ
+NFmkk8Itn9EY2R9VBaP7GLv5kvwxDdLAnmwGmzVtbmaVdxCaBwUk
-----END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8=
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----
diff --git a/android.testssl/testssl b/android.testssl/testssl
index 3f24b1d..5ff4860 100755
--- a/android.testssl/testssl
+++ b/android.testssl/testssl
@@ -70,15 +70,6 @@ $ssltest -client_auth $CA $extra || exit 1
echo test sslv2/sslv3 with both client and server authentication
$ssltest -server_auth -client_auth $CA $extra || exit 1
-echo test sslv2/sslv3 with both client and server authentication and small client buffers
-$ssltest -server_auth -client_auth -c_small_records $CA $extra || exit 1
-
-echo test sslv2/sslv3 with both client and server authentication and small server buffers
-$ssltest -server_auth -client_auth -s_small_records $CA $extra || exit 1
-
-echo test sslv2/sslv3 with both client and server authentication and small client and server buffers
-$ssltest -server_auth -client_auth -c_small_records -s_small_records $CA $extra || exit 1
-
echo test sslv2/sslv3 with both client and server authentication and handshake cutthrough
$ssltest -server_auth -client_auth -cutthrough $CA $extra || exit 1
@@ -112,8 +103,8 @@ echo test sslv2/sslv3 via BIO pair
$ssltest $extra || exit 1
if [ $dsa_cert = NO ]; then
- echo test sslv2/sslv3 w/o DHE via BIO pair
- $ssltest -bio_pair -no_dhe $extra || exit 1
+ echo 'test sslv2/sslv3 w/o (EC)DHE via BIO pair'
+ $ssltest -bio_pair -no_dhe -no_ecdhe $extra || exit 1
fi
echo test sslv2/sslv3 with 1024bit DHE via BIO pair
@@ -131,6 +122,23 @@ $ssltest -bio_pair -server_auth -client_auth $CA $extra || exit 1
echo test sslv2/sslv3 with both client and server authentication via BIO pair and app verify
$ssltest -bio_pair -server_auth -client_auth -app_verify $CA $extra || exit 1
+echo "Testing ciphersuites"
+for protocol in TLSv1.2 SSLv3; do
+ echo "Testing ciphersuites for $protocol"
+ for cipher in `adb shell /system/bin/openssl ciphers "RSA+$protocol" | tr ':' ' '`; do
+ echo "Testing $cipher"
+ prot=""
+ if [ $protocol = "SSLv3" ] ; then
+ prot="-ssl3"
+ fi
+ $ssltest -cipher $cipher $prot
+ if [ $? -ne 0 ] ; then
+ echo "Failed $cipher"
+ exit 1
+ fi
+ done
+done
+
#############################################################################
if [ `adb shell /system/bin/openssl no-dh` = no-dh ]; then
@@ -143,8 +151,8 @@ fi
if [ `adb shell /system/bin/openssl no-rsa` = no-rsa ]; then
echo skipping RSA tests
else
- echo test tls1 with 1024bit RSA, no DHE, multiple handshakes
- adb shell /system/bin/ssltest -v -bio_pair -tls1 -cert /sdcard/android.testssl/server2.pem -no_dhe -num 10 -f -time $extra || exit 1
+ echo 'test tls1 with 1024bit RSA, no (EC)DHE, multiple handshakes'
+ adb shell /system/bin/ssltest -v -bio_pair -tls1 -cert /sdcard/android.testssl/server2.pem -no_dhe -no_ecdhe -num 10 -f -time $extra || exit 1
if [ `adb shell /system/bin/openssl no-dh` = no-dh ]; then
echo skipping RSA+DHE tests
@@ -160,4 +168,14 @@ $ssltest -tls1 -cipher PSK -psk abc123 $extra || exit 1
echo test tls1 with PSK via BIO pair
$ssltest -bio_pair -tls1 -cipher PSK -psk abc123 $extra || exit 1
+if [ `adb shell /system/bin/openssl no-srp` = no-srp ]; then
+ echo skipping SRP tests
+else
+ echo test tls1 with SRP
+ $ssltest -tls1 -cipher SRP -srpuser test -srppass abc123 || exit 1
+
+ echo test tls1 with SRP via BIO pair
+ $ssltest -bio_pair -tls1 -cipher SRP -srpuser test -srppass abc123 || exit 1
+fi
+
exit 0
diff --git a/ant.properties b/ant.properties
new file mode 100644
index 0000000..b0971e8
--- /dev/null
+++ b/ant.properties
@@ -0,0 +1,17 @@
+# This file is used to override default values used by the Ant build system.
+#
+# This file must be checked into Version Control Systems, as it is
+# integral to the build system of your project.
+
+# This file is only used by the Ant script.
+
+# You can use this to override default values such as
+# 'source.dir' for the location of your java source folder and
+# 'out.dir' for the location of your output folder.
+
+# You can also use it to define how the release builds are signed by declaring
+# the following properties:
+# 'key.store' for the location of your keystore and
+# 'key.alias' for the name of the key to use.
+# The password will be asked during the build when you use the 'release' target.
+
diff --git a/apps/Android.mk b/apps/Android.mk
deleted file mode 100644
index 20cc5a9..0000000
--- a/apps/Android.mk
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2006 The Android Open Source Project
-
-LOCAL_PATH:= $(call my-dir)
-
-local_src_files:= \
- app_rand.c \
- apps.c \
- asn1pars.c \
- ca.c \
- ciphers.c \
- crl.c \
- crl2p7.c \
- dgst.c \
- dh.c \
- dhparam.c \
- dsa.c \
- dsaparam.c \
- ecparam.c \
- ec.c \
- enc.c \
- engine.c \
- errstr.c \
- gendh.c \
- gendsa.c \
- genpkey.c \
- genrsa.c \
- nseq.c \
- ocsp.c \
- openssl.c \
- passwd.c \
- pkcs12.c \
- pkcs7.c \
- pkcs8.c \
- pkey.c \
- pkeyparam.c \
- pkeyutl.c \
- prime.c \
- rand.c \
- req.c \
- rsa.c \
- rsautl.c \
- s_cb.c \
- s_client.c \
- s_server.c \
- s_socket.c \
- s_time.c \
- sess_id.c \
- smime.c \
- speed.c \
- spkac.c \
- verify.c \
- version.c \
- x509.c
-
-local_shared_libraries := \
- libssl \
- libcrypto
-
-local_c_includes := \
- external/openssl \
- external/openssl/include
-
-local_cflags := -DMONOLITH
-
-# These flags omit whole features from the commandline "openssl".
-# However, portions of these features are actually turned on.
-local_cflags += -DOPENSSL_NO_DTLS1
-
-include $(CLEAR_VARS)
-LOCAL_MODULE:= openssl
-LOCAL_MODULE_TAGS := optional
-LOCAL_SRC_FILES := $(local_src_files)
-LOCAL_SHARED_LIBRARIES := $(local_shared_libraries)
-LOCAL_C_INCLUDES := $(local_c_includes)
-LOCAL_CFLAGS := $(local_cflags)
-include $(LOCAL_PATH)/../android-config.mk
-include $(BUILD_EXECUTABLE)
-
-include $(CLEAR_VARS)
-LOCAL_MODULE:= openssl
-LOCAL_MODULE_TAGS := optional
-LOCAL_SRC_FILES := $(local_src_files)
-LOCAL_SHARED_LIBRARIES := $(local_shared_libraries)
-LOCAL_C_INCLUDES := $(local_c_includes)
-LOCAL_CFLAGS := $(local_cflags)
-include $(LOCAL_PATH)/../android-config.mk
-include $(BUILD_HOST_EXECUTABLE)
diff --git a/apps/apps.c b/apps/apps.c
index 38e6197..1096eee 100644
--- a/apps/apps.c
+++ b/apps/apps.c
@@ -109,7 +109,7 @@
*
*/
-#ifndef _POSIX_C_SOURCE
+#if !defined(_POSIX_C_SOURCE) && defined(OPENSSL_SYS_VMS)
#define _POSIX_C_SOURCE 2 /* On VMS, you need to define this to get
the declaration of fileno(). The value
2 is to make sure no function defined
@@ -1215,7 +1215,8 @@ STACK_OF(X509) *load_certs(BIO *err, const char *file, int format,
const char *pass, ENGINE *e, const char *desc)
{
STACK_OF(X509) *certs;
- load_certs_crls(err, file, format, pass, e, desc, &certs, NULL);
+ if (!load_certs_crls(err, file, format, pass, e, desc, &certs, NULL))
+ return NULL;
return certs;
}
@@ -1223,7 +1224,8 @@ STACK_OF(X509_CRL) *load_crls(BIO *err, const char *file, int format,
const char *pass, ENGINE *e, const char *desc)
{
STACK_OF(X509_CRL) *crls;
- load_certs_crls(err, file, format, pass, e, desc, NULL, &crls);
+ if (!load_certs_crls(err, file, format, pass, e, desc, NULL, &crls))
+ return NULL;
return crls;
}
@@ -2130,7 +2132,7 @@ X509_NAME *parse_name(char *subject, long chtype, int multirdn)
X509_NAME *n = NULL;
int nid;
- if (!buf || !ne_types || !ne_values)
+ if (!buf || !ne_types || !ne_values || !mval)
{
BIO_printf(bio_err, "malloc error\n");
goto error;
@@ -2234,6 +2236,7 @@ X509_NAME *parse_name(char *subject, long chtype, int multirdn)
OPENSSL_free(ne_values);
OPENSSL_free(ne_types);
OPENSSL_free(buf);
+ OPENSSL_free(mval);
return n;
error:
@@ -2242,6 +2245,8 @@ X509_NAME *parse_name(char *subject, long chtype, int multirdn)
OPENSSL_free(ne_values);
if (ne_types)
OPENSSL_free(ne_types);
+ if (mval)
+ OPENSSL_free(mval);
if (buf)
OPENSSL_free(buf);
return NULL;
@@ -2256,6 +2261,7 @@ int args_verify(char ***pargs, int *pargc,
int purpose = 0, depth = -1;
char **oldargs = *pargs;
char *arg = **pargs, *argn = (*pargs)[1];
+ time_t at_time = 0;
if (!strcmp(arg, "-policy"))
{
if (!argn)
@@ -2308,6 +2314,27 @@ int args_verify(char ***pargs, int *pargc,
}
(*pargs)++;
}
+ else if (strcmp(arg,"-attime") == 0)
+ {
+ if (!argn)
+ *badarg = 1;
+ else
+ {
+ long timestamp;
+ /* interpret the -attime argument as seconds since
+ * Epoch */
+ if (sscanf(argn, "%li", &timestamp) != 1)
+ {
+ BIO_printf(bio_err,
+ "Error parsing timestamp %s\n",
+ argn);
+ *badarg = 1;
+ }
+ /* on some platforms time_t may be a float */
+ at_time = (time_t) timestamp;
+ }
+ (*pargs)++;
+ }
else if (!strcmp(arg, "-ignore_critical"))
flags |= X509_V_FLAG_IGNORE_CRITICAL;
else if (!strcmp(arg, "-issuer_checks"))
@@ -2362,6 +2389,9 @@ int args_verify(char ***pargs, int *pargc,
if (depth >= 0)
X509_VERIFY_PARAM_set_depth(*pm, depth);
+ if (at_time)
+ X509_VERIFY_PARAM_set_time(*pm, at_time);
+
end:
(*pargs)++;
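
For context (not part of the patch): -attime feeds X509_VERIFY_PARAM_set_time(), which pins certificate-validity checks to a fixed instant instead of the current clock. A minimal sketch of the same API, where the helper name verify_at is ours:

#include <time.h>
#include <openssl/x509.h>
#include <openssl/x509_vfy.h>

/* Verify cert against store as if the clock read 'when'. */
static int verify_at(X509_STORE *store, X509 *cert, time_t when)
	{
	int ok = 0;
	X509_STORE_CTX *ctx = X509_STORE_CTX_new();
	if (ctx == NULL)
		return 0;
	if (X509_STORE_CTX_init(ctx, store, cert, NULL))
		{
		/* the library-side equivalent of passing "-attime <when>" */
		X509_VERIFY_PARAM_set_time(X509_STORE_CTX_get0_param(ctx), when);
		ok = (X509_verify_cert(ctx) == 1);
		}
	X509_STORE_CTX_free(ctx);
	return ok;
	}
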
@@ -2693,6 +2723,50 @@ void jpake_server_auth(BIO *out, BIO *conn, const char *secret)
#endif
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+/* next_protos_parse parses a comma separated list of strings into a string
+ * in a format suitable for passing to SSL_CTX_set_next_protos_advertised.
+ * outlen: (output) set to the length of the resulting buffer on success.
+ * in: a NUL terminated string like "abc,def,ghi"
+ *
+ * returns: a malloced buffer or NULL on failure.
+ */
+unsigned char *next_protos_parse(unsigned short *outlen, const char *in)
+ {
+ size_t len;
+ unsigned char *out;
+ size_t i, start = 0;
+
+ len = strlen(in);
+ if (len >= 65535)
+ return NULL;
+
+ out = OPENSSL_malloc(strlen(in) + 1);
+ if (!out)
+ return NULL;
+
+ for (i = 0; i <= len; ++i)
+ {
+ if (i == len || in[i] == ',')
+ {
+ if (i - start > 255)
+ {
+ OPENSSL_free(out);
+ return NULL;
+ }
+ out[start] = i - start;
+ start = i + 1;
+ }
+ else
+ out[i+1] = in[i];
+ }
+
+ *outlen = len + 1;
+ return out;
+ }
+#endif /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
+
/*
* Platform-specific sections
*/
@@ -3018,46 +3092,3 @@ int raw_write_stdout(const void *buf,int siz)
int raw_write_stdout(const void *buf,int siz)
{ return write(fileno(stdout),buf,siz); }
#endif
-
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-/* next_protos_parse parses a comma separated list of strings into a string
- * in a format suitable for passing to SSL_CTX_set_next_protos_advertised.
- * outlen: (output) set to the length of the resulting buffer on success.
- * in: a NUL terminated string like "abc,def,ghi"
- *
- * returns: a malloced buffer or NULL on failure.
- */
-unsigned char *next_protos_parse(unsigned short *outlen, const char *in)
- {
- size_t len;
- unsigned char *out;
- size_t i, start = 0;
-
- len = strlen(in);
- if (len >= 65535)
- return NULL;
-
- out = OPENSSL_malloc(strlen(in) + 1);
- if (!out)
- return NULL;
-
- for (i = 0; i <= len; ++i)
- {
- if (i == len || in[i] == ',')
- {
- if (i - start > 255)
- {
- OPENSSL_free(out);
- return NULL;
- }
- out[start] = i - start;
- start = i + 1;
- }
- else
- out[i+1] = in[i];
- }
-
- *outlen = len + 1;
- return out;
- }
-#endif /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
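
For reference (not part of the patch): the buffer next_protos_parse() builds is simply each protocol name prefixed by its one-byte length, concatenated. A short usage sketch with made-up protocol names:

unsigned short wirelen;
unsigned char *wire = next_protos_parse(&wirelen, "http/1.1,spdy/3");
/* wire now holds 0x08 "http/1.1" 0x06 "spdy/3" and wirelen == 16;
 * no single name may exceed 255 bytes, the whole list must stay under 64kB */
if (wire != NULL)
	OPENSSL_free(wire);
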
diff --git a/apps/apps.h b/apps/apps.h
index 42072ec..c1ca99d 100644
--- a/apps/apps.h
+++ b/apps/apps.h
@@ -317,6 +317,12 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in);
int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value);
int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx,
const char *algname, ENGINE *e, int do_param);
+int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts);
+int do_X509_REQ_sign(BIO *err, X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts);
+int do_X509_CRL_sign(BIO *err, X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts);
#ifndef OPENSSL_NO_PSK
extern char *psk_key;
#endif
@@ -325,6 +331,10 @@ void jpake_client_auth(BIO *out, BIO *conn, const char *secret);
void jpake_server_auth(BIO *out, BIO *conn, const char *secret);
#endif
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+unsigned char *next_protos_parse(unsigned short *outlen, const char *in);
+#endif /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
+
#define FORMAT_UNDEF 0
#define FORMAT_ASN1 1
#define FORMAT_TEXT 2
@@ -357,8 +367,7 @@ int raw_write_stdout(const void *,int);
#define TM_START 0
#define TM_STOP 1
double app_tminterval (int stop,int usertime);
-#endif
-#ifndef OPENSSL_NO_NEXTPROTONEG
-unsigned char *next_protos_parse(unsigned short *outlen, const char *in);
+#define OPENSSL_NO_SSL_INTERN
+
#endif
diff --git a/apps/ca.c b/apps/ca.c
index 6b8b0ef..1cf50e0 100644
--- a/apps/ca.c
+++ b/apps/ca.c
@@ -197,26 +197,30 @@ extern int EF_ALIGNMENT;
static void lookup_fail(const char *name, const char *tag);
static int certify(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
- const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,CA_DB *db,
+ const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+ STACK_OF(CONF_VALUE) *policy,CA_DB *db,
BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn, char *startdate,
char *enddate, long days, int batch, char *ext_sect, CONF *conf,
int verbose, unsigned long certopt, unsigned long nameopt,
int default_op, int ext_copy, int selfsign);
static int certify_cert(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
- const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,
+ const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+ STACK_OF(CONF_VALUE) *policy,
CA_DB *db, BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn,
char *startdate, char *enddate, long days, int batch,
char *ext_sect, CONF *conf,int verbose, unsigned long certopt,
unsigned long nameopt, int default_op, int ext_copy,
ENGINE *e);
static int certify_spkac(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
- const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,
+ const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+ STACK_OF(CONF_VALUE) *policy,
CA_DB *db, BIGNUM *serial,char *subj,unsigned long chtype, int multirdn, int email_dn,
char *startdate, char *enddate, long days, char *ext_sect,
CONF *conf, int verbose, unsigned long certopt,
unsigned long nameopt, int default_op, int ext_copy);
static void write_new_certificate(BIO *bp, X509 *x, int output_der, int notext);
static int do_body(X509 **xret, EVP_PKEY *pkey, X509 *x509, const EVP_MD *dgst,
+ STACK_OF(OPENSSL_STRING) *sigopts,
STACK_OF(CONF_VALUE) *policy, CA_DB *db, BIGNUM *serial,char *subj,unsigned long chtype, int multirdn,
int email_dn, char *startdate, char *enddate, long days, int batch,
int verbose, X509_REQ *req, char *ext_sect, CONF *conf,
@@ -311,6 +315,7 @@ int MAIN(int argc, char **argv)
const EVP_MD *dgst=NULL;
STACK_OF(CONF_VALUE) *attribs=NULL;
STACK_OF(X509) *cert_sk=NULL;
+ STACK_OF(OPENSSL_STRING) *sigopts = NULL;
#undef BSIZE
#define BSIZE 256
MS_STATIC char buf[3][BSIZE];
@@ -435,6 +440,15 @@ EF_ALIGNMENT=0;
if (--argc < 1) goto bad;
outdir= *(++argv);
}
+ else if (strcmp(*argv,"-sigopt") == 0)
+ {
+ if (--argc < 1)
+ goto bad;
+ if (!sigopts)
+ sigopts = sk_OPENSSL_STRING_new_null();
+ if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv)))
+ goto bad;
+ }
else if (strcmp(*argv,"-notext") == 0)
notext=1;
else if (strcmp(*argv,"-batch") == 0)
@@ -1170,8 +1184,9 @@ EF_ALIGNMENT=0;
if (spkac_file != NULL)
{
total++;
- j=certify_spkac(&x,spkac_file,pkey,x509,dgst,attribs,db,
- serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,extensions,
+ j=certify_spkac(&x,spkac_file,pkey,x509,dgst,sigopts,
+ attribs,db, serial,subj,chtype,multirdn,
+ email_dn,startdate,enddate,days,extensions,
conf,verbose,certopt,nameopt,default_op,ext_copy);
if (j < 0) goto err;
if (j > 0)
@@ -1194,7 +1209,8 @@ EF_ALIGNMENT=0;
if (ss_cert_file != NULL)
{
total++;
- j=certify_cert(&x,ss_cert_file,pkey,x509,dgst,attribs,
+ j=certify_cert(&x,ss_cert_file,pkey,x509,dgst,sigopts,
+ attribs,
db,serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,batch,
extensions,conf,verbose, certopt, nameopt,
default_op, ext_copy, e);
@@ -1214,7 +1230,7 @@ EF_ALIGNMENT=0;
if (infile != NULL)
{
total++;
- j=certify(&x,infile,pkey,x509p,dgst,attribs,db,
+ j=certify(&x,infile,pkey,x509p,dgst,sigopts, attribs,db,
serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,batch,
extensions,conf,verbose, certopt, nameopt,
default_op, ext_copy, selfsign);
@@ -1234,7 +1250,7 @@ EF_ALIGNMENT=0;
 for (i=0; i<sk_X509_num(cert_sk); i++)
diff --git a/apps/ciphers.c b/apps/ciphers.c
--- a/apps/ciphers.c
+++ b/apps/ciphers.c
- unsigned long id=c->id;
+ unsigned long id = SSL_CIPHER_get_id(c);
int id0 = (int)(id >> 24);
int id1 = (int)((id >> 16) & 0xffL);
int id2 = (int)((id >> 8) & 0xffL);
diff --git a/apps/client.pem b/apps/client.pem
index 307910e..e7a47a7 100644
--- a/apps/client.pem
+++ b/apps/client.pem
@@ -1,24 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Client test cert (512 bit)
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Client Cert
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
-----BEGIN CERTIFICATE-----
-MIIB6TCCAVICAQIwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU2WhcNOTgwNjA5
-MTM1NzU2WjBjMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxIzAhBgNVBAMTGkNsaWVudCB0ZXN0IGNl
-cnQgKDUxMiBiaXQpMFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBALtv55QyzG6i2Plw
-Z1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexmq/R4KedLjFEIYjocDui+IXs62NNt
-XrT8odkCAwEAATANBgkqhkiG9w0BAQQFAAOBgQBwtMmI7oGUG8nKmftQssATViH5
-NRRtoEw07DxJp/LfatHdrhqQB73eGdL5WILZJXk46Xz2e9WMSUjVCSYhdKxtflU3
-UR2Ajv1Oo0sTNdfz0wDqJNirLNtzyhhsaq8qMTrLwXrCP31VxBiigFSQSUFnZyTE
-9TKwhS4GlwbtCfxSKQ==
+MIID5zCCAs+gAwIBAgIJALnu1NlVpZ6yMA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZDELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxGTAXBgNVBAMMEFRlc3QgQ2xpZW50IENlcnQw
+ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC0ranbHRLcLVqN+0BzcZpY
++yOLqxzDWT1LD9eW1stC4NzXX9/DCtSIVyN7YIHdGLrIPr64IDdXXaMRzgZ2rOKs
+lmHCAiFpO/ja99gGCJRxH0xwQatqAULfJVHeUhs7OEGOZc2nWifjqKvGfNTilP7D
+nwi69ipQFq9oS19FmhwVHk2wg7KZGHI1qDyG04UrfCZMRitvS9+UVhPpIPjuiBi2
+x3/FZIpL5gXJvvFK6xHY63oq2asyzBATntBgnP4qJFWWcvRx24wF1PnZabxuVoL2
+bPnQ/KvONDrw3IdqkKhYNTul7jEcu3OlcZIMw+7DiaKJLAzKb/bBF5gm/pwW6As9
+AgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJYIZI
+AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW
+BBSZHKyLoTh7Mb409Zn/mK1ceSDAjDAfBgNVHSMEGDAWgBQ2w2yI55X+sL3szj49
+hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEAD0mL7PtPYgCEuDyOQSbLpeND5hVS
+curxQdGnrJ6Acrhodb7E9ccATokeb0PLx6HBLQUicxhTZIQ9FbO43YkQcOU6C3BB
+IlwskqmtN6+VmrQzNolHCDzvxNZs9lYL2VbGPGqVRyjZeHpoAlf9cQr8PgDb4d4b
+vUx2KAhHQvV2nkmYvKyXcgnRuHggumF87mkxidriGAEFwH4qfOqetUg64WyxP7P2
+QLipm04SyQa7ONtIApfVXgHcE42Py4/f4arzCzMjKe3VyhGkS7nsT55X/fWgTaRm
+CQPkO+H94P958WTvQDt77bQ+D3IvYaVvfil8n6HJMOJfFT0LJuSUbpSXJg==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIIBOwIBAAJBALtv55QyzG6i2PlwZ1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexm
-q/R4KedLjFEIYjocDui+IXs62NNtXrT8odkCAwEAAQJAbwXq0vJ/+uyEvsNgxLko
-/V86mGXQ/KrSkeKlL0r4ENxjcyeMAGoKu6J9yMY7+X9+Zm4nxShNfTsf/+Freoe1
-HQIhAPOSm5Q1YI+KIsII2GeVJx1U69+wnd71OasIPakS1L1XAiEAxQAW+J3/JWE0
-ftEYakbhUOKL8tD1OaFZS71/5GdG7E8CIQCefUMmySSvwd6kC0VlATSWbW+d+jp/
-nWmM1KvqnAo5uQIhALqEADu5U1Wvt8UN8UDGBRPQulHWNycuNV45d3nnskWPAiAw
-ueTyr6WsZ5+SD8g/Hy3xuvF3nPmJRH+rwvVihlcFOg==
+MIIEpQIBAAKCAQEAtK2p2x0S3C1ajftAc3GaWPsji6scw1k9Sw/XltbLQuDc11/f
+wwrUiFcje2CB3Ri6yD6+uCA3V12jEc4GdqzirJZhwgIhaTv42vfYBgiUcR9McEGr
+agFC3yVR3lIbOzhBjmXNp1on46irxnzU4pT+w58IuvYqUBavaEtfRZocFR5NsIOy
+mRhyNag8htOFK3wmTEYrb0vflFYT6SD47ogYtsd/xWSKS+YFyb7xSusR2Ot6Ktmr
+MswQE57QYJz+KiRVlnL0cduMBdT52Wm8blaC9mz50PyrzjQ68NyHapCoWDU7pe4x
+HLtzpXGSDMPuw4miiSwMym/2wReYJv6cFugLPQIDAQABAoIBAAZOyc9MhIwLSU4L
+p4RgQvM4UVVe8/Id+3XTZ8NsXExJbWxXfIhiqGjaIfL8u4vsgRjcl+v1s/jo2/iT
+KMab4o4D8gXD7UavQVDjtjb/ta79WL3SjRl2Uc9YjjMkyq6WmDNQeo2NKDdafCTB
+1uzSJtLNipB8Z53ELPuHJhxX9QMHrMnuha49riQgXZ7buP9iQrHJFhImBjSzbxJx
+L+TI6rkyLSf9Wi0Pd3L27Ob3QWNfNRYNSeTE+08eSRChkur5W0RuXAcuAICdQlCl
+LBvWO/LmmvbzCqiDcgy/TliSb6CGGwgiNG7LJZmlkYNj8laGwalNlYZs3UrVv6NO
+Br2loAECgYEA2kvCvPGj0Dg/6g7WhXDvAkEbcaL1tSeCxBbNH+6HS2UWMWvyTtCn
+/bbD519QIdkvayy1QjEf32GV/UjUVmlULMLBcDy0DGjtL3+XpIhLKWDNxN1v1/ai
+1oz23ZJCOgnk6K4qtFtlRS1XtynjA+rBetvYvLP9SKeFrnpzCgaA2r0CgYEA0+KX
+1ACXDTNH5ySX3kMjSS9xdINf+OOw4CvPHFwbtc9aqk2HePlEsBTz5I/W3rKwXva3
+NqZ/bRqVVeZB/hHKFywgdUQk2Uc5z/S7Lw70/w1HubNTXGU06Ngb6zOFAo/o/TwZ
+zTP1BMIKSOB6PAZPS3l+aLO4FRIRotfFhgRHOoECgYEAmiZbqt8cJaJDB/5YYDzC
+mp3tSk6gIb936Q6M5VqkMYp9pIKsxhk0N8aDCnTU+kIK6SzWBpr3/d9Ecmqmfyq7
+5SvWO3KyVf0WWK9KH0abhOm2BKm2HBQvI0DB5u8sUx2/hsvOnjPYDISbZ11t0MtK
+u35Zy89yMYcSsIYJjG/ROCUCgYEAgI2P9G5PNxEP5OtMwOsW84Y3Xat/hPAQFlI+
+HES+AzbFGWJkeT8zL2nm95tVkFP1sggZ7Kxjz3w7cpx7GX0NkbWSE9O+T51pNASV
+tN1sQ3p5M+/a+cnlqgfEGJVvc7iAcXQPa3LEi5h2yPR49QYXAgG6cifn3dDSpmwn
+SUI7PQECgYEApGCIIpSRPLAEHTGmP87RBL1smurhwmy2s/pghkvUkWehtxg0sGHh
+kuaqDWcskogv+QC0sVdytiLSz8G0DwcEcsHK1Fkyb8A+ayiw6jWJDo2m9+IF4Fww
+1Te6jFPYDESnbhq7+TLGgHGhtwcu5cnb4vSuYXGXKupZGzoLOBbv1Zw=
-----END RSA PRIVATE KEY-----
diff --git a/apps/cms.c b/apps/cms.c
index d29a884..5f77f8f 100644
--- a/apps/cms.c
+++ b/apps/cms.c
@@ -136,6 +136,7 @@ int MAIN(int argc, char **argv)
char *engine=NULL;
#endif
unsigned char *secret_key = NULL, *secret_keyid = NULL;
+ unsigned char *pwri_pass = NULL, *pwri_tmp = NULL;
size_t secret_keylen = 0, secret_keyidlen = 0;
ASN1_OBJECT *econtent_type = NULL;
@@ -232,6 +233,8 @@ int MAIN(int argc, char **argv)
else if (!strcmp(*args,"-camellia256"))
cipher = EVP_camellia_256_cbc();
#endif
+ else if (!strcmp (*args, "-debug_decrypt"))
+ flags |= CMS_DEBUG_DECRYPT;
else if (!strcmp (*args, "-text"))
flags |= CMS_TEXT;
else if (!strcmp (*args, "-nointern"))
@@ -326,6 +329,13 @@ int MAIN(int argc, char **argv)
}
secret_keyidlen = (size_t)ltmp;
}
+ else if (!strcmp(*args,"-pwri_password"))
+ {
+ if (!args[1])
+ goto argerr;
+ args++;
+ pwri_pass = (unsigned char *)*args;
+ }
else if (!strcmp(*args,"-econtent_type"))
{
if (!args[1])
@@ -559,7 +569,7 @@ int MAIN(int argc, char **argv)
else if (operation == SMIME_DECRYPT)
{
- if (!recipfile && !keyfile && !secret_key)
+ if (!recipfile && !keyfile && !secret_key && !pwri_pass)
{
BIO_printf(bio_err, "No recipient certificate or key specified\n");
badarg = 1;
@@ -567,7 +577,7 @@ int MAIN(int argc, char **argv)
}
else if (operation == SMIME_ENCRYPT)
{
- if (!*args && !secret_key)
+ if (!*args && !secret_key && !pwri_pass)
{
BIO_printf(bio_err, "No recipient(s) certificate(s) specified\n");
badarg = 1;
@@ -618,7 +628,7 @@ int MAIN(int argc, char **argv)
BIO_printf (bio_err, "-certsout file certificate output file\n");
BIO_printf (bio_err, "-signer file signer certificate file\n");
BIO_printf (bio_err, "-recip file recipient certificate file for decryption\n");
- BIO_printf (bio_err, "-skeyid use subject key identifier\n");
+ BIO_printf (bio_err, "-keyid use subject key identifier\n");
BIO_printf (bio_err, "-in file input file\n");
BIO_printf (bio_err, "-inform arg input format SMIME (default), PEM or DER\n");
BIO_printf (bio_err, "-inkey file input private key (if not signer or recipient)\n");
@@ -917,6 +927,17 @@ int MAIN(int argc, char **argv)
secret_key = NULL;
secret_keyid = NULL;
}
+ if (pwri_pass)
+ {
+ pwri_tmp = (unsigned char *)BUF_strdup((char *)pwri_pass);
+ if (!pwri_tmp)
+ goto end;
+ if (!CMS_add0_recipient_password(cms,
+ -1, NID_undef, NID_undef,
+ pwri_tmp, -1, NULL))
+ goto end;
+ pwri_tmp = NULL;
+ }
if (!(flags & CMS_STREAM))
{
if (!CMS_final(cms, in, NULL, flags))
@@ -1020,6 +1041,8 @@ int MAIN(int argc, char **argv)
ret = 4;
if (operation == SMIME_DECRYPT)
{
+ if (flags & CMS_DEBUG_DECRYPT)
+ CMS_decrypt(cms, NULL, NULL, NULL, NULL, flags);
if (secret_key)
{
@@ -1043,6 +1066,16 @@ int MAIN(int argc, char **argv)
}
}
+ if (pwri_pass)
+ {
+ if (!CMS_decrypt_set1_password(cms, pwri_pass, -1))
+ {
+ BIO_puts(bio_err,
+ "Error decrypting CMS using password\n");
+ goto end;
+ }
+ }
+
if (!CMS_decrypt(cms, NULL, NULL, indata, out, flags))
{
BIO_printf(bio_err, "Error decrypting CMS structure\n");
@@ -1167,6 +1200,8 @@ int MAIN(int argc, char **argv)
OPENSSL_free(secret_key);
if (secret_keyid)
OPENSSL_free(secret_keyid);
+ if (pwri_tmp)
+ OPENSSL_free(pwri_tmp);
if (econtent_type)
ASN1_OBJECT_free(econtent_type);
if (rr)
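
A rough sketch of the password-recipient flow that -pwri_password enables; the cipher choice and helper name here are ours, the rest mirrors the patch. Note the add0 convention: on success the CMS structure owns pass, which is why the app BUF_strdup()s the password first.

#include <openssl/cms.h>
#include <openssl/evp.h>
#include <openssl/objects.h>

static CMS_ContentInfo *encrypt_to_password(BIO *in, unsigned char *pass)
	{
	CMS_ContentInfo *cms = CMS_encrypt(NULL, in, EVP_aes_128_cbc(),
					CMS_BINARY | CMS_PARTIAL);
	if (cms == NULL)
		return NULL;
	/* -1/NID_undef select the default iteration count, key-wrap
	 * and PBE algorithms, exactly as the patch does */
	if (!CMS_add0_recipient_password(cms, -1, NID_undef, NID_undef,
					pass, -1, NULL)
	    || !CMS_final(cms, in, NULL, CMS_BINARY))
		{
		CMS_ContentInfo_free(cms);
		return NULL;
		}
	return cms;
	}

Decryption mirrors it: call CMS_decrypt_set1_password(cms, pass, -1) first, then CMS_decrypt(cms, NULL, NULL, NULL, out, 0) as usual.
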
diff --git a/apps/dgst.c b/apps/dgst.c
index 9bf38ce..81bd870 100644
--- a/apps/dgst.c
+++ b/apps/dgst.c
@@ -127,6 +127,7 @@ int MAIN(int argc, char **argv)
#endif
char *hmac_key=NULL;
char *mac_name=NULL;
+ int non_fips_allow = 0;
STACK_OF(OPENSSL_STRING) *sigopts = NULL, *macopts = NULL;
apps_startup();
@@ -215,6 +216,10 @@ int MAIN(int argc, char **argv)
out_bin = 1;
else if (strcmp(*argv,"-d") == 0)
debug=1;
+ else if (!strcmp(*argv,"-fips-fingerprint"))
+ hmac_key = "etaonrishdlcupfm";
+ else if (strcmp(*argv,"-non-fips-allow") == 0)
+ non_fips_allow=1;
else if (!strcmp(*argv,"-hmac"))
{
if (--argc < 1)
@@ -395,6 +400,13 @@ int MAIN(int argc, char **argv)
goto end;
}
+ if (non_fips_allow)
+ {
+ EVP_MD_CTX *md_ctx;
+ BIO_get_md_ctx(bmd,&md_ctx);
+ EVP_MD_CTX_set_flags(md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ }
+
if (hmac_key)
{
sigkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, e,
diff --git a/apps/dhparam.c b/apps/dhparam.c
index b47097c..1297d6f 100644
--- a/apps/dhparam.c
+++ b/apps/dhparam.c
@@ -332,7 +332,6 @@ int MAIN(int argc, char **argv)
BIO_printf(bio_err,"This is going to take a long time\n");
if(!dh || !DH_generate_parameters_ex(dh, num, g, &cb))
{
- if(dh) DH_free(dh);
ERR_print_errors(bio_err);
goto end;
}
diff --git a/apps/dsaparam.c b/apps/dsaparam.c
index fe72c1d..683d513 100644
--- a/apps/dsaparam.c
+++ b/apps/dsaparam.c
@@ -326,6 +326,7 @@ int MAIN(int argc, char **argv)
goto end;
}
#endif
+ ERR_print_errors(bio_err);
BIO_printf(bio_err,"Error, DSA key generation failed\n");
goto end;
}
@@ -429,13 +430,19 @@ int MAIN(int argc, char **argv)
assert(need_rand);
if ((dsakey=DSAparams_dup(dsa)) == NULL) goto end;
- if (!DSA_generate_key(dsakey)) goto end;
+ if (!DSA_generate_key(dsakey))
+ {
+ ERR_print_errors(bio_err);
+ DSA_free(dsakey);
+ goto end;
+ }
if (outformat == FORMAT_ASN1)
i=i2d_DSAPrivateKey_bio(out,dsakey);
else if (outformat == FORMAT_PEM)
i=PEM_write_bio_DSAPrivateKey(out,dsakey,NULL,NULL,0,NULL,NULL);
else {
BIO_printf(bio_err,"bad output format specified for outfile\n");
+ DSA_free(dsakey);
goto end;
}
DSA_free(dsakey);
diff --git a/apps/enc.c b/apps/enc.c
index 076225c..719acc3 100644
--- a/apps/enc.c
+++ b/apps/enc.c
@@ -129,6 +129,7 @@ int MAIN(int argc, char **argv)
char *engine = NULL;
#endif
const EVP_MD *dgst=NULL;
+ int non_fips_allow = 0;
apps_startup();
@@ -281,6 +282,8 @@ int MAIN(int argc, char **argv)
if (--argc < 1) goto bad;
md= *(++argv);
}
+ else if (strcmp(*argv,"-non-fips-allow") == 0)
+ non_fips_allow = 1;
else if ((argv[0][0] == '-') &&
((c=EVP_get_cipherbyname(&(argv[0][1]))) != NULL))
{
@@ -589,6 +592,11 @@ int MAIN(int argc, char **argv)
*/
BIO_get_cipher_ctx(benc, &ctx);
+
+ if (non_fips_allow)
+ EVP_CIPHER_CTX_set_flags(ctx,
+ EVP_CIPH_FLAG_NON_FIPS_ALLOW);
+
if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, enc))
{
BIO_printf(bio_err, "Error setting cipher %s\n",
diff --git a/apps/genrsa.c b/apps/genrsa.c
index 37e9310..ece114c 100644
--- a/apps/genrsa.c
+++ b/apps/genrsa.c
@@ -78,7 +78,7 @@
#include <openssl/x509.h>
#include <openssl/pem.h>
-#define DEFBITS 512
+#define DEFBITS 1024
#undef PROG
#define PROG genrsa_main
diff --git a/apps/ocsp.c b/apps/ocsp.c
index 01847df..83c5a76 100644
--- a/apps/ocsp.c
+++ b/apps/ocsp.c
@@ -617,7 +617,7 @@ int MAIN(int argc, char **argv)
BIO_printf (bio_err, "-ndays n number of days before next update\n");
BIO_printf (bio_err, "-resp_key_id identify reponse by signing certificate key ID\n");
BIO_printf (bio_err, "-nrequest n number of requests to accept (default unlimited)\n");
- BIO_printf (bio_err, "- use specified digest in the request");
+ BIO_printf (bio_err, "- use specified digest in the request\n");
goto end;
}
diff --git a/apps/openssl.c b/apps/openssl.c
index 1068957..33b1655 100644
--- a/apps/openssl.c
+++ b/apps/openssl.c
@@ -129,6 +129,9 @@
#include "progs.h"
#include "s_apps.h"
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
/* The LHASH callbacks ("hash" & "cmp") have been replaced by functions with the
* base prototypes (we cast each variable inside the function to the required
@@ -310,6 +313,19 @@ int main(int Argc, char *ARGV[])
CRYPTO_set_locking_callback(lock_dbg_cb);
}
+ if(getenv("OPENSSL_FIPS")) {
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode_set(1)) {
+ ERR_load_crypto_strings();
+ ERR_print_errors(BIO_new_fp(stderr,BIO_NOCLOSE));
+ EXIT(1);
+ }
+#else
+ fprintf(stderr, "FIPS mode not supported.\n");
+ EXIT(1);
+#endif
+ }
+
apps_startup();
/* Lets load up our environment a little */
diff --git a/apps/openssl.cnf b/apps/openssl.cnf
index 9d2cd5b..18760c6 100644
--- a/apps/openssl.cnf
+++ b/apps/openssl.cnf
@@ -145,7 +145,7 @@ localityName = Locality Name (eg, city)
organizationalUnitName = Organizational Unit Name (eg, section)
#organizationalUnitName_default =
-commonName = Common Name (eg, YOUR name)
+commonName = Common Name (e.g. server FQDN or YOUR name)
commonName_max = 64
emailAddress = Email Address
diff --git a/apps/progs.h b/apps/progs.h
index 728bb6d..dd2298b 100644
--- a/apps/progs.h
+++ b/apps/progs.h
@@ -46,6 +46,7 @@ extern int engine_main(int argc,char *argv[]);
extern int ocsp_main(int argc,char *argv[]);
extern int prime_main(int argc,char *argv[]);
extern int ts_main(int argc,char *argv[]);
+extern int srp_main(int argc,char *argv[]);
#define FUNC_TYPE_GENERAL 1
#define FUNC_TYPE_MD 2
@@ -149,6 +150,9 @@ FUNCTION functions[] = {
#if 0 /* ANDROID */
{FUNC_TYPE_GENERAL,"ts",ts_main},
#endif
+#ifndef OPENSSL_NO_SRP
+ {FUNC_TYPE_GENERAL,"srp",srp_main},
+#endif
#ifndef OPENSSL_NO_MD2
{FUNC_TYPE_MD,"md2",dgst_main},
#endif
diff --git a/apps/progs.pl b/apps/progs.pl
index de6fdea..39ca8f7 100644
--- a/apps/progs.pl
+++ b/apps/progs.pl
@@ -51,6 +51,8 @@
{ print "#ifndef OPENSSL_NO_CMS\n${str}#endif\n"; }
elsif ( ($_ =~ /^ocsp$/))
{ print "#ifndef OPENSSL_NO_OCSP\n${str}#endif\n"; }
+ elsif ( ($_ =~ /^srp$/))
+ { print "#ifndef OPENSSL_NO_SRP\n${str}#endif\n"; }
else
{ print $str; }
}
diff --git a/apps/req.c b/apps/req.c
index 820cd18..8552658 100644
--- a/apps/req.c
+++ b/apps/req.c
@@ -165,7 +165,7 @@ int MAIN(int argc, char **argv)
EVP_PKEY_CTX *genctx = NULL;
const char *keyalg = NULL;
char *keyalgstr = NULL;
- STACK_OF(OPENSSL_STRING) *pkeyopts = NULL;
+ STACK_OF(OPENSSL_STRING) *pkeyopts = NULL, *sigopts = NULL;
EVP_PKEY *pkey=NULL;
int i=0,badops=0,newreq=0,verbose=0,pkey_type=-1;
long newkey = -1;
@@ -310,6 +310,15 @@ int MAIN(int argc, char **argv)
if (!pkeyopts || !sk_OPENSSL_STRING_push(pkeyopts, *(++argv)))
goto bad;
}
+ else if (strcmp(*argv,"-sigopt") == 0)
+ {
+ if (--argc < 1)
+ goto bad;
+ if (!sigopts)
+ sigopts = sk_OPENSSL_STRING_new_null();
+ if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv)))
+ goto bad;
+ }
else if (strcmp(*argv,"-batch") == 0)
batch=1;
else if (strcmp(*argv,"-newhdr") == 0)
@@ -858,8 +867,9 @@ int MAIN(int argc, char **argv)
extensions);
goto end;
}
-
- if (!(i=X509_sign(x509ss,pkey,digest)))
+
+ i=do_X509_sign(bio_err, x509ss, pkey, digest, sigopts);
+ if (!i)
{
ERR_print_errors(bio_err);
goto end;
@@ -883,7 +893,8 @@ int MAIN(int argc, char **argv)
req_exts);
goto end;
}
- if (!(i=X509_REQ_sign(req,pkey,digest)))
+ i=do_X509_REQ_sign(bio_err, req, pkey, digest, sigopts);
+ if (!i)
{
ERR_print_errors(bio_err);
goto end;
@@ -1084,6 +1095,8 @@ int MAIN(int argc, char **argv)
EVP_PKEY_CTX_free(genctx);
if (pkeyopts)
sk_OPENSSL_STRING_free(pkeyopts);
+ if (sigopts)
+ sk_OPENSSL_STRING_free(sigopts);
#ifndef OPENSSL_NO_ENGINE
if (gen_eng)
ENGINE_free(gen_eng);
@@ -1756,3 +1769,68 @@ static int genpkey_cb(EVP_PKEY_CTX *ctx)
#endif
return 1;
}
+
+static int do_sign_init(BIO *err, EVP_MD_CTX *ctx, EVP_PKEY *pkey,
+ const EVP_MD *md, STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ EVP_PKEY_CTX *pkctx = NULL;
+ int i;
+ EVP_MD_CTX_init(ctx);
+ if (!EVP_DigestSignInit(ctx, &pkctx, md, NULL, pkey))
+ return 0;
+ for (i = 0; i < sk_OPENSSL_STRING_num(sigopts); i++)
+ {
+ char *sigopt = sk_OPENSSL_STRING_value(sigopts, i);
+ if (pkey_ctrl_string(pkctx, sigopt) <= 0)
+ {
+ BIO_printf(err, "parameter error \"%s\"\n", sigopt);
+ ERR_print_errors(bio_err);
+ return 0;
+ }
+ }
+ return 1;
+ }
+
+int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ int rv;
+ EVP_MD_CTX mctx;
+ EVP_MD_CTX_init(&mctx);
+ rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+ if (rv > 0)
+ rv = X509_sign_ctx(x, &mctx);
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv > 0 ? 1 : 0;
+ }
+
+
+int do_X509_REQ_sign(BIO *err, X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ int rv;
+ EVP_MD_CTX mctx;
+ EVP_MD_CTX_init(&mctx);
+ rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+ if (rv > 0)
+ rv = X509_REQ_sign_ctx(x, &mctx);
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv > 0 ? 1 : 0;
+ }
+
+
+
+int do_X509_CRL_sign(BIO *err, X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ int rv;
+ EVP_MD_CTX mctx;
+ EVP_MD_CTX_init(&mctx);
+ rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+ if (rv > 0)
+ rv = X509_CRL_sign_ctx(x, &mctx);
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv > 0 ? 1 : 0;
+ }
+
+
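
Usage sketch (option values illustrative): with do_X509_sign() the -sigopt strings reach EVP_PKEY_CTX_ctrl_str() via pkey_ctrl_string(), so an RSA-PSS self-signature can be requested like this:

STACK_OF(OPENSSL_STRING) *sigopts = sk_OPENSSL_STRING_new_null();
sk_OPENSSL_STRING_push(sigopts, "rsa_padding_mode:pss");
sk_OPENSSL_STRING_push(sigopts, "rsa_pss_saltlen:-1"); /* -1: salt = digest length */
if (!do_X509_sign(bio_err, x509ss, pkey, EVP_sha256(), sigopts))
	ERR_print_errors(bio_err);
sk_OPENSSL_STRING_free(sigopts); /* frees the stack only, not the literals */

On the command line the same request reads: openssl req -x509 -sigopt rsa_padding_mode:pss -sigopt rsa_pss_saltlen:-1 ...
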
diff --git a/apps/s_cb.c b/apps/s_cb.c
index c4f5512..84c3b44 100644
--- a/apps/s_cb.c
+++ b/apps/s_cb.c
@@ -237,8 +237,8 @@ int set_cert_stuff(SSL_CTX *ctx, char *cert_file, char *key_file)
/* If we are using DSA, we can copy the parameters from
* the private key */
-
-
+
+
/* Now we know that a key and cert have been set against
* the SSL context */
if (!SSL_CTX_check_private_key(ctx))
@@ -357,6 +357,12 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
case TLS1_VERSION:
str_version = "TLS 1.0 ";
break;
+ case TLS1_1_VERSION:
+ str_version = "TLS 1.1 ";
+ break;
+ case TLS1_2_VERSION:
+ str_version = "TLS 1.2 ";
+ break;
case DTLS1_VERSION:
str_version = "DTLS 1.0 ";
break;
@@ -430,6 +436,8 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
if (version == SSL3_VERSION ||
version == TLS1_VERSION ||
+ version == TLS1_1_VERSION ||
+ version == TLS1_2_VERSION ||
version == DTLS1_VERSION ||
version == DTLS1_BAD_VER)
{
@@ -549,6 +557,9 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
case 114:
str_details2 = " bad_certificate_hash_value";
break;
+ case 115:
+ str_details2 = " unknown_psk_identity";
+ break;
}
}
}
@@ -597,6 +608,26 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
}
}
}
+
+#ifndef OPENSSL_NO_HEARTBEATS
+ if (content_type == 24) /* Heartbeat */
+ {
+ str_details1 = ", Heartbeat";
+
+ if (len > 0)
+ {
+ switch (((const unsigned char*)buf)[0])
+ {
+ case 1:
+ str_details1 = ", HeartbeatRequest";
+ break;
+ case 2:
+ str_details1 = ", HeartbeatResponse";
+ break;
+ }
+ }
+ }
+#endif
}
BIO_printf(bio, "%s %s%s [length %04lx]%s%s\n", str_write_p, str_version, str_content_type, (unsigned long)len, str_details1, str_details2);
@@ -657,6 +688,22 @@ void MS_CALLBACK tlsext_cb(SSL *s, int client_server, int type,
extname = "status request";
break;
+ case TLSEXT_TYPE_user_mapping:
+ extname = "user mapping";
+ break;
+
+ case TLSEXT_TYPE_client_authz:
+ extname = "client authz";
+ break;
+
+ case TLSEXT_TYPE_server_authz:
+ extname = "server authz";
+ break;
+
+ case TLSEXT_TYPE_cert_type:
+ extname = "cert type";
+ break;
+
case TLSEXT_TYPE_elliptic_curves:
extname = "elliptic curves";
break;
@@ -665,12 +712,28 @@ void MS_CALLBACK tlsext_cb(SSL *s, int client_server, int type,
extname = "EC point formats";
break;
+ case TLSEXT_TYPE_srp:
+ extname = "SRP";
+ break;
+
+ case TLSEXT_TYPE_signature_algorithms:
+ extname = "signature algorithms";
+ break;
+
+ case TLSEXT_TYPE_use_srtp:
+ extname = "use SRTP";
+ break;
+
+ case TLSEXT_TYPE_heartbeat:
+ extname = "heartbeat";
+ break;
+
case TLSEXT_TYPE_session_ticket:
- extname = "server ticket";
+ extname = "session ticket";
break;
- case TLSEXT_TYPE_renegotiate:
- extname = "renegotiate";
+ case TLSEXT_TYPE_renegotiate:
+ extname = "renegotiation info";
break;
#ifdef TLSEXT_TYPE_opaque_prf_input
@@ -678,6 +741,11 @@ void MS_CALLBACK tlsext_cb(SSL *s, int client_server, int type,
extname = "opaque PRF input";
break;
#endif
+#ifdef TLSEXT_TYPE_next_proto_neg
+ case TLSEXT_TYPE_next_proto_neg:
+ extname = "next protocol";
+ break;
+#endif
default:
extname = "unknown";
diff --git a/apps/s_client.c b/apps/s_client.c
index b951513..cb1efcd 100644
--- a/apps/s_client.c
+++ b/apps/s_client.c
@@ -163,6 +163,9 @@ typedef unsigned int u_int;
#include <openssl/rand.h>
#include <openssl/ocsp.h>
#include <openssl/bn.h>
+#ifndef OPENSSL_NO_SRP
+#include <openssl/srp.h>
+#endif
#include "s_apps.h"
#include "timeouts.h"
@@ -203,6 +206,9 @@ static int c_status_req=0;
static int c_msg=0;
static int c_showcerts=0;
+static char *keymatexportlabel=NULL;
+static int keymatexportlen=20;
+
static void sc_usage(void);
static void print_stuff(BIO *berr,SSL *con,int full);
#ifndef OPENSSL_NO_TLSEXT
@@ -315,13 +321,22 @@ static void sc_usage(void)
# ifndef OPENSSL_NO_JPAKE
BIO_printf(bio_err," -jpake arg - JPAKE secret to use\n");
# endif
+#endif
+#ifndef OPENSSL_NO_SRP
+ BIO_printf(bio_err," -srpuser user - SRP authentification for 'user'\n");
+ BIO_printf(bio_err," -srppass arg - password for 'user'\n");
+ BIO_printf(bio_err," -srp_lateuser - SRP username into second ClientHello message\n");
+ BIO_printf(bio_err," -srp_moregroups - Tolerate other than the known g N values.\n");
+ BIO_printf(bio_err," -srp_strength int - minimal mength in bits for N (default %d).\n",SRP_MINIMAL_N);
#endif
BIO_printf(bio_err," -ssl2 - just use SSLv2\n");
BIO_printf(bio_err," -ssl3 - just use SSLv3\n");
+ BIO_printf(bio_err," -tls1_2 - just use TLSv1.2\n");
+ BIO_printf(bio_err," -tls1_1 - just use TLSv1.1\n");
BIO_printf(bio_err," -tls1 - just use TLSv1\n");
BIO_printf(bio_err," -dtls1 - just use DTLSv1\n");
BIO_printf(bio_err," -mtu - set the link layer MTU\n");
- BIO_printf(bio_err," -no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n");
+ BIO_printf(bio_err," -no_tls1_2/-no_tls1_1/-no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n");
BIO_printf(bio_err," -bugs - Switch on all SSL implementation bug workarounds\n");
BIO_printf(bio_err," -serverpref - Use server's cipher preferences (only SSLv2)\n");
BIO_printf(bio_err," -cipher - preferred cipher to use, use the 'openssl ciphers'\n");
@@ -344,10 +359,16 @@ static void sc_usage(void)
BIO_printf(bio_err," -no_ticket - disable use of RFC4507bis session tickets\n");
# ifndef OPENSSL_NO_NEXTPROTONEG
BIO_printf(bio_err," -nextprotoneg arg - enable NPN extension, considering named protocols supported (comma-separated list)\n");
+ BIO_printf(bio_err," -alpn arg - enable ALPN extension, considering named protocols supported (comma-separated list)\n");
# endif
- BIO_printf(bio_err," -cutthrough - enable 1-RTT full-handshake for strong ciphers\n");
#endif
+ BIO_printf(bio_err," -cutthrough - enable 1-RTT full-handshake for strong ciphers\n");
BIO_printf(bio_err," -legacy_renegotiation - enable use of legacy renegotiation (dangerous)\n");
+#ifndef OPENSSL_NO_SRTP
+ BIO_printf(bio_err," -use_srtp profiles - Offer SRTP key management with a colon-separated profile list\n");
+#endif
+ BIO_printf(bio_err," -keymatexport label - Export keying material using label\n");
+ BIO_printf(bio_err," -keymatexportlen len - Export len bytes of keying material (default 20)\n");
}
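
Sketch of what the -keymatexport plumbing ultimately calls: SSL_export_keying_material(), the RFC 5705 exporter over the session secrets. The label value below is our example; RFC 5705 reserves the "EXPERIMENTAL" prefix for private use.

unsigned char keymat[20];
const char *label = "EXPERIMENTAL-my-label"; /* example label */
if (SSL_export_keying_material(con, keymat, sizeof(keymat),
			label, strlen(label), NULL, 0, 0) == 1)
	{
	/* keymat holds sizeof(keymat) bytes bound to this TLS session */
	}
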
#ifndef OPENSSL_NO_TLSEXT
@@ -371,6 +392,124 @@ static int MS_CALLBACK ssl_servername_cb(SSL *s, int *ad, void *arg)
return SSL_TLSEXT_ERR_OK;
}
+#ifndef OPENSSL_NO_SRP
+
+/* This is a context that we pass to all callbacks */
+typedef struct srp_arg_st
+ {
+ char *srppassin;
+ char *srplogin;
+ int msg; /* copy from c_msg */
+ int debug; /* copy from c_debug */
+ int amp; /* allow more groups */
+ int strength /* minimal size for N */ ;
+ } SRP_ARG;
+
+#define SRP_NUMBER_ITERATIONS_FOR_PRIME 64
+
+static int srp_Verify_N_and_g(BIGNUM *N, BIGNUM *g)
+ {
+ BN_CTX *bn_ctx = BN_CTX_new();
+ BIGNUM *p = BN_new();
+ BIGNUM *r = BN_new();
+ int ret =
+ g != NULL && N != NULL && bn_ctx != NULL && BN_is_odd(N) &&
+ BN_is_prime_ex(N, SRP_NUMBER_ITERATIONS_FOR_PRIME, bn_ctx, NULL) &&
+ p != NULL && BN_rshift1(p, N) &&
+
+ /* p = (N-1)/2 */
+ BN_is_prime_ex(p, SRP_NUMBER_ITERATIONS_FOR_PRIME, bn_ctx, NULL) &&
+ r != NULL &&
+
+ /* verify g^((N-1)/2) == -1 (mod N) */
+ BN_mod_exp(r, g, p, N, bn_ctx) &&
+ BN_add_word(r, 1) &&
+ BN_cmp(r, N) == 0;
+
+ if(r)
+ BN_free(r);
+ if(p)
+ BN_free(p);
+ if(bn_ctx)
+ BN_CTX_free(bn_ctx);
+ return ret;
+ }
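+
+/* In math terms, the three conditions ANDed above are:
+ *     N prime,   p = (N-1)/2 prime,   g^p == -1 (mod N).
+ * N is therefore a safe prime, and because g^p is -1 rather than +1,
+ * the order of g divides 2p but not p: g generates the whole
+ * multiplicative group mod N (barring the degenerate g = N-1).
+ */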
+
+/* This callback is used here for two purposes:
+ - extended debugging
+ - running some primality tests on unknown groups
+ The callback is only called for a non-default group.
+
+ An application does not need the callback at all if
+ only the standard groups are used. In real-life situations,
+ client and server already share well-known groups,
+ so there is no need to verify them.
+ Furthermore, should a server actually propose a group that
+ is not one of those defined in RFC 5054, it is more appropriate
+ to add the group to a static list and then compare, since
+ primality tests are rather CPU-consuming.
+*/
+
+static int MS_CALLBACK ssl_srp_verify_param_cb(SSL *s, void *arg)
+ {
+ SRP_ARG *srp_arg = (SRP_ARG *)arg;
+ BIGNUM *N = NULL, *g = NULL;
+ if (!(N = SSL_get_srp_N(s)) || !(g = SSL_get_srp_g(s)))
+ return 0;
+ if (srp_arg->debug || srp_arg->msg || srp_arg->amp == 1)
+ {
+ BIO_printf(bio_err, "SRP parameters:\n");
+ BIO_printf(bio_err,"\tN="); BN_print(bio_err,N);
+ BIO_printf(bio_err,"\n\tg="); BN_print(bio_err,g);
+ BIO_printf(bio_err,"\n");
+ }
+
+ if (SRP_check_known_gN_param(g,N))
+ return 1;
+
+ if (srp_arg->amp == 1)
+ {
+ if (srp_arg->debug)
+ BIO_printf(bio_err, "SRP param N and g are not known params, going to check deeper.\n");
+
+/* The srp_moregroups option is a real debugging feature.
+ Implementors should rather add the value to the known ones.
+ The minimal size has already been tested.
+*/
+ if (BN_num_bits(g) <= BN_BITS && srp_Verify_N_and_g(N,g))
+ return 1;
+ }
+ BIO_printf(bio_err, "SRP param N and g rejected.\n");
+ return 0;
+ }
+
+#define PWD_STRLEN 1024
+
+static char * MS_CALLBACK ssl_give_srp_client_pwd_cb(SSL *s, void *arg)
+ {
+ SRP_ARG *srp_arg = (SRP_ARG *)arg;
+ char *pass = (char *)OPENSSL_malloc(PWD_STRLEN+1);
+ PW_CB_DATA cb_tmp;
+ int l;
+
+ cb_tmp.password = (char *)srp_arg->srppassin;
+ cb_tmp.prompt_info = "SRP user";
+ if ((l = password_callback(pass, PWD_STRLEN, 0, &cb_tmp))<0)
+ {
+ BIO_printf (bio_err, "Can't read Password\n");
+ OPENSSL_free(pass);
+ return NULL;
+ }
+ *(pass+l)= '\0';
+
+ return pass;
+ }
+
+#endif
+#ifndef OPENSSL_NO_SRTP
+ char *srtp_profiles = NULL;
+#endif
+
# ifndef OPENSSL_NO_NEXTPROTONEG
/* This the context that we pass to next_proto_cb */
typedef struct tlsextnextprotoctx_st {
@@ -422,6 +561,9 @@ int MAIN(int argc, char **argv)
{
unsigned int off=0, clr=0;
SSL *con=NULL;
+#ifndef OPENSSL_NO_KRB5
+ KSSL_CTX *kctx;
+#endif
int s,k,width,state=0;
char *cbuf=NULL,*sbuf=NULL,*mbuf=NULL;
int cbuf_len,cbuf_off;
@@ -470,6 +612,7 @@ int MAIN(int argc, char **argv)
{NULL,0};
# ifndef OPENSSL_NO_NEXTPROTONEG
const char *next_proto_neg_in = NULL;
+ const char *alpn_in = NULL;
# endif
#endif
char *sess_in = NULL;
@@ -481,14 +624,13 @@ int MAIN(int argc, char **argv)
#ifndef OPENSSL_NO_JPAKE
char *jpake_secret = NULL;
#endif
+#ifndef OPENSSL_NO_SRP
+ char * srppass = NULL;
+ int srp_lateuser = 0;
+ SRP_ARG srp_arg = {NULL,NULL,0,0,0,1024};
+#endif
-#if !defined(OPENSSL_NO_SSL2) && !defined(OPENSSL_NO_SSL3)
meth=SSLv23_client_method();
-#elif !defined(OPENSSL_NO_SSL3)
- meth=SSLv3_client_method();
-#elif !defined(OPENSSL_NO_SSL2)
- meth=SSLv2_client_method();
-#endif
apps_startup();
c_Pause=0;
@@ -623,13 +765,44 @@ int MAIN(int argc, char **argv)
psk_key=*(++argv);
for (j = 0; j < strlen(psk_key); j++)
{
- if (isxdigit((int)psk_key[j]))
+ if (isxdigit((unsigned char)psk_key[j]))
continue;
BIO_printf(bio_err,"Not a hex number '%s'\n",*argv);
goto bad;
}
}
#endif
+#ifndef OPENSSL_NO_SRP
+ else if (strcmp(*argv,"-srpuser") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srp_arg.srplogin= *(++argv);
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srppass") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srppass= *(++argv);
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srp_strength") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srp_arg.strength=atoi(*(++argv));
+ BIO_printf(bio_err,"SRP minimal length for N is %d\n",srp_arg.strength);
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srp_lateuser") == 0)
+ {
+ srp_lateuser= 1;
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srp_moregroups") == 0)
+ {
+ srp_arg.amp=1;
+ meth=TLSv1_client_method();
+ }
+#endif
#ifndef OPENSSL_NO_SSL2
else if (strcmp(*argv,"-ssl2") == 0)
meth=SSLv2_client_method();
@@ -639,6 +812,10 @@ int MAIN(int argc, char **argv)
meth=SSLv3_client_method();
#endif
#ifndef OPENSSL_NO_TLS1
+ else if (strcmp(*argv,"-tls1_2") == 0)
+ meth=TLSv1_2_client_method();
+ else if (strcmp(*argv,"-tls1_1") == 0)
+ meth=TLSv1_1_client_method();
else if (strcmp(*argv,"-tls1") == 0)
meth=TLSv1_client_method();
#endif
@@ -687,6 +864,10 @@ int MAIN(int argc, char **argv)
if (--argc < 1) goto bad;
CAfile= *(++argv);
}
+ else if (strcmp(*argv,"-no_tls1_2") == 0)
+ off|=SSL_OP_NO_TLSv1_2;
+ else if (strcmp(*argv,"-no_tls1_1") == 0)
+ off|=SSL_OP_NO_TLSv1_1;
else if (strcmp(*argv,"-no_tls1") == 0)
off|=SSL_OP_NO_TLSv1;
else if (strcmp(*argv,"-no_ssl3") == 0)
@@ -704,6 +885,11 @@ int MAIN(int argc, char **argv)
if (--argc < 1) goto bad;
next_proto_neg_in = *(++argv);
}
+ else if (strcmp(*argv,"-alpn") == 0)
+ {
+ if (--argc < 1) goto bad;
+ alpn_in = *(++argv);
+ }
# endif
#endif
else if (strcmp(*argv,"-cutthrough") == 0)
@@ -774,7 +960,25 @@ int MAIN(int argc, char **argv)
jpake_secret = *++argv;
}
#endif
- else
+#ifndef OPENSSL_NO_SRTP
+ else if (strcmp(*argv,"-use_srtp") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srtp_profiles = *(++argv);
+ }
+#endif
+ else if (strcmp(*argv,"-keymatexport") == 0)
+ {
+ if (--argc < 1) goto bad;
+ keymatexportlabel= *(++argv);
+ }
+ else if (strcmp(*argv,"-keymatexportlen") == 0)
+ {
+ if (--argc < 1) goto bad;
+ keymatexportlen=atoi(*(++argv));
+ if (keymatexportlen == 0) goto bad;
+ }
+ else
{
BIO_printf(bio_err,"unknown option %s\n",*argv);
badop=1;
@@ -800,14 +1004,13 @@ int MAIN(int argc, char **argv)
goto end;
}
psk_identity = "JPAKE";
+ if (cipher)
+ {
+ BIO_printf(bio_err, "JPAKE sets cipher to PSK\n");
+ goto end;
+ }
+ cipher = "PSK";
}
-
- if (cipher)
- {
- BIO_printf(bio_err, "JPAKE sets cipher to PSK\n");
- goto end;
- }
- cipher = "PSK";
#endif
OpenSSL_add_ssl_algorithms();
@@ -901,6 +1104,14 @@ int MAIN(int argc, char **argv)
}
}
+#ifndef OPENSSL_NO_SRP
+ if(!app_passwd(bio_err, srppass, NULL, &srp_arg.srppassin, NULL))
+ {
+ BIO_printf(bio_err, "Error getting password\n");
+ goto end;
+ }
+#endif
+
ctx=SSL_CTX_new(meth);
if (ctx == NULL)
{
@@ -936,6 +1147,10 @@ int MAIN(int argc, char **argv)
BIO_printf(bio_c_out, "PSK key given or JPAKE in use, setting client callback\n");
SSL_CTX_set_psk_client_callback(ctx, psk_client_cb);
}
+#endif
+#ifndef OPENSSL_NO_SRTP
+ if (srtp_profiles != NULL)
+ SSL_CTX_set_tlsext_use_srtp(ctx, srtp_profiles);
#endif
if (bugs)
SSL_CTX_set_options(ctx,SSL_OP_ALL|off);
@@ -949,6 +1164,25 @@ int MAIN(int argc, char **argv)
*/
if (socket_type == SOCK_DGRAM) SSL_CTX_set_read_ahead(ctx, 1);
+#if !defined(OPENSSL_NO_TLSEXT)
+# if !defined(OPENSSL_NO_NEXTPROTONEG)
+ if (next_proto.data)
+ SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto);
+# endif
+ if (alpn_in)
+ {
+ unsigned short alpn_len;
+ unsigned char *alpn = next_protos_parse(&alpn_len, alpn_in);
+
+ if (alpn == NULL)
+ {
+ BIO_printf(bio_err, "Error parsing -alpn argument\n");
+ goto end;
+ }
+ SSL_CTX_set_alpn_protos(ctx, alpn, alpn_len);
+ }
+#endif
+
/* Enable handshake cutthrough for client connections using
* strong ciphers. */
if (cutthrough)
@@ -958,11 +1192,6 @@ int MAIN(int argc, char **argv)
SSL_CTX_set_mode(ctx, ssl_mode);
}
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
- if (next_proto.data)
- SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto);
-#endif
-
if (state) SSL_CTX_set_info_callback(ctx,apps_ssl_info_callback);
if (cipher != NULL)
if(!SSL_CTX_set_cipher_list(ctx,cipher)) {
@@ -994,6 +1223,24 @@ int MAIN(int argc, char **argv)
SSL_CTX_set_tlsext_servername_callback(ctx, ssl_servername_cb);
SSL_CTX_set_tlsext_servername_arg(ctx, &tlsextcbp);
}
+#ifndef OPENSSL_NO_SRP
+ if (srp_arg.srplogin)
+ {
+ if (!srp_lateuser && !SSL_CTX_set_srp_username(ctx, srp_arg.srplogin))
+ {
+ BIO_printf(bio_err,"Unable to set SRP username\n");
+ goto end;
+ }
+ srp_arg.msg = c_msg;
+ srp_arg.debug = c_debug ;
+ SSL_CTX_set_srp_cb_arg(ctx,&srp_arg);
+ SSL_CTX_set_srp_client_pwd_callback(ctx, ssl_give_srp_client_pwd_cb);
+ SSL_CTX_set_srp_strength(ctx, srp_arg.strength);
+ if (c_msg || c_debug || srp_arg.amp == 0)
+ SSL_CTX_set_srp_verify_param_callback(ctx, ssl_srp_verify_param_cb);
+ }
+
+#endif
#endif
con=SSL_new(ctx);
@@ -1032,9 +1279,10 @@ int MAIN(int argc, char **argv)
}
#endif
#ifndef OPENSSL_NO_KRB5
- if (con && (con->kssl_ctx = kssl_ctx_new()) != NULL)
+ if (con && (kctx = kssl_ctx_new()) != NULL)
{
- kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVER, host);
+ SSL_set0_kssl_ctx(con, kctx);
+ kssl_ctx_setstring(kctx, KSSL_SERVER, host);
}
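+ /* SSL_set0_kssl_ctx() follows OpenSSL's set0 naming convention,
+ * transferring ownership of kctx to the SSL object; the accessor
+ * replaces direct pokes at the con->kssl_ctx field. */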
#endif /* OPENSSL_NO_KRB5 */
/* SSL_set_cipher_list(con,"RC4-MD5"); */
@@ -1066,7 +1314,7 @@ int MAIN(int argc, char **argv)
}
}
#endif
- if (c_Pause & 0x01) con->debug=1;
+ if (c_Pause & 0x01) SSL_set_debug(con, 1);
if ( SSL_version(con) == DTLS1_VERSION)
{
@@ -1115,7 +1363,7 @@ int MAIN(int argc, char **argv)
if (c_debug)
{
- con->debug=1;
+ SSL_set_debug(con, 1);
BIO_set_callback(sbio,bio_dump_callback);
BIO_set_callback_arg(sbio,(char *)bio_c_out);
}
@@ -1649,6 +1897,14 @@ printf("read=%d pending=%d peek=%d\n",k,SSL_pending(con),SSL_peek(con,zbuf,10240
SSL_renegotiate(con);
cbuf_len=0;
}
+#ifndef OPENSSL_NO_HEARTBEATS
+ else if ((!c_ign_eof) && (cbuf[0] == 'B'))
+ {
+ BIO_printf(bio_err,"HEARTBEATING\n");
+ SSL_heartbeat(con);
+ cbuf_len=0;
+ }
+#endif
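+ /* Typing 'B' on stdin sends one RFC 6520 heartbeat request via
+ * SSL_heartbeat(); the peer should echo the payload back, making
+ * this a simple interactive test of the extension. */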
else
{
cbuf_len=i;
@@ -1676,6 +1932,10 @@ printf("read=%d pending=%d peek=%d\n",k,SSL_pending(con),SSL_peek(con,zbuf,10240
print_stuff(bio_c_out,con,1);
SSL_free(con);
}
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+ if (next_proto.data)
+ OPENSSL_free(next_proto.data);
+#endif
if (ctx != NULL) SSL_CTX_free(ctx);
if (cert)
X509_free(cert);
@@ -1683,6 +1943,8 @@ printf("read=%d pending=%d peek=%d\n",k,SSL_pending(con),SSL_peek(con,zbuf,10240
EVP_PKEY_free(key);
if (pass)
OPENSSL_free(pass);
+ if (vpm)
+ X509_VERIFY_PARAM_free(vpm);
if (cbuf != NULL) { OPENSSL_cleanse(cbuf,BUFSIZZ); OPENSSL_free(cbuf); }
if (sbuf != NULL) { OPENSSL_cleanse(sbuf,BUFSIZZ); OPENSSL_free(sbuf); }
if (mbuf != NULL) { OPENSSL_cleanse(mbuf,BUFSIZZ); OPENSSL_free(mbuf); }
@@ -1710,6 +1972,7 @@ static void print_stuff(BIO *bio, SSL *s, int full)
#ifndef OPENSSL_NO_COMP
const COMP_METHOD *comp, *expansion;
#endif
+ unsigned char *exportedkeymat;
if (full)
{
@@ -1800,7 +2063,7 @@ static void print_stuff(BIO *bio, SSL *s, int full)
BIO_number_read(SSL_get_rbio(s)),
BIO_number_written(SSL_get_wbio(s)));
}
- BIO_printf(bio,((s->hit)?"---\nReused, ":"---\nNew, "));
+ BIO_printf(bio,(SSL_cache_hit(s)?"---\nReused, ":"---\nNew, "));
c=SSL_get_current_cipher(s);
BIO_printf(bio,"%s, Cipher is %s\n",
SSL_CIPHER_get_version(c),
@@ -1822,8 +2085,21 @@ static void print_stuff(BIO *bio, SSL *s, int full)
BIO_printf(bio,"Expansion: %s\n",
expansion ? SSL_COMP_get_name(expansion) : "NONE");
#endif
+
+#ifdef SSL_DEBUG
+ {
+ /* Print out local port of connection: useful for debugging */
+ int sock;
+ struct sockaddr_in ladd;
+ socklen_t ladd_size = sizeof(ladd);
+ sock = SSL_get_fd(s);
+ getsockname(sock, (struct sockaddr *)&ladd, &ladd_size);
+ BIO_printf(bio_c_out, "LOCAL PORT is %u\n", ntohs(ladd.sin_port));
+ }
+#endif
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+#if !defined(OPENSSL_NO_TLSEXT)
+# if !defined(OPENSSL_NO_NEXTPROTONEG)
if (next_proto.status != -1) {
const unsigned char *proto;
unsigned int proto_len;
@@ -1832,9 +2108,60 @@ static void print_stuff(BIO *bio, SSL *s, int full)
BIO_write(bio, proto, proto_len);
BIO_write(bio, "\n", 1);
}
+ {
+ const unsigned char *proto;
+ unsigned int proto_len;
+ SSL_get0_alpn_selected(s, &proto, &proto_len);
+ if (proto_len > 0)
+ {
+ BIO_printf(bio, "ALPN protocol: ");
+ BIO_write(bio, proto, proto_len);
+ BIO_write(bio, "\n", 1);
+ }
+ else
+ BIO_printf(bio, "No ALPN negotiated\n");
+ }
+# endif
#endif
+#ifndef OPENSSL_NO_SRTP
+ {
+ SRTP_PROTECTION_PROFILE *srtp_profile=SSL_get_selected_srtp_profile(s);
+
+ if(srtp_profile)
+ BIO_printf(bio,"SRTP Extension negotiated, profile=%s\n",
+ srtp_profile->name);
+ }
+#endif
+
SSL_SESSION_print(bio,SSL_get_session(s));
+ if (keymatexportlabel != NULL)
+ {
+ BIO_printf(bio, "Keying material exporter:\n");
+ BIO_printf(bio, " Label: '%s'\n", keymatexportlabel);
+ BIO_printf(bio, " Length: %i bytes\n", keymatexportlen);
+ exportedkeymat = OPENSSL_malloc(keymatexportlen);
+ if (exportedkeymat != NULL)
+ {
+ if (!SSL_export_keying_material(s, exportedkeymat,
+ keymatexportlen,
+ keymatexportlabel,
+ strlen(keymatexportlabel),
+ NULL, 0, 0))
+ {
+ BIO_printf(bio, " Error\n");
+ }
+ else
+ {
+ BIO_printf(bio, " Keying material: ");
+ for (i=0; i<keymatexportlen; i++)
+ BIO_printf(bio, "%02X", exportedkeymat[i]);
+ BIO_printf(bio, "\n");
+ }
+ OPENSSL_free(exportedkeymat);
+ }
+ }
diff --git a/apps/s_server.c b/apps/s_server.c
--- a/apps/s_server.c
+++ b/apps/s_server.c
#endif
+#ifndef OPENSSL_NO_SRP
+#include <openssl/srp.h>
+#endif
#include "s_apps.h"
#include "timeouts.h"
@@ -290,6 +293,9 @@ static int cert_status_cb(SSL *s, void *arg);
static int s_msg=0;
static int s_quiet=0;
+static char *keymatexportlabel=NULL;
+static int keymatexportlen=20;
+
static int hack=0;
#ifndef OPENSSL_NO_ENGINE
static char *engine_id=NULL;
@@ -302,6 +308,7 @@ static long socket_mtu;
static int cert_chain = 0;
#endif
+
#ifndef OPENSSL_NO_PSK
static char *psk_identity="Client_identity";
char *psk_key=NULL; /* by default PSK is not used */
@@ -369,6 +376,52 @@ static unsigned int psk_server_cb(SSL *ssl, const char *identity,
}
#endif
+#ifndef OPENSSL_NO_SRP
+/* This is a context that we pass to callbacks */
+typedef struct srpsrvparm_st
+ {
+ char *login;
+ SRP_VBASE *vb;
+ SRP_user_pwd *user;
+ } srpsrvparm;
+
+/* This callback pretends to require some asynchronous logic in order to obtain
+ a verifier. When the callback is called for a new connection we return
+ with a negative value. This will provoke the accept etc. to return with
+ SSL_ERROR_WANT_X509_LOOKUP. The main logic then reinvokes the suspended
+ call (which would normally occur after a worker has finished) and we
+ set the user parameters.
+*/
+static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
+ {
+ srpsrvparm *p = (srpsrvparm *)arg;
+ if (p->login == NULL && p->user == NULL )
+ {
+ p->login = SSL_get_srp_username(s);
+ BIO_printf(bio_err, "SRP username = \"%s\"\n", p->login);
+ return (-1) ;
+ }
+
+ if (p->user == NULL)
+ {
+ BIO_printf(bio_err, "User %s doesn't exist\n", p->login);
+ return SSL3_AL_FATAL;
+ }
+ if (SSL_set_srp_server_param(s, p->user->N, p->user->g, p->user->s, p->user->v,
+ p->user->info) < 0)
+ {
+ *ad = SSL_AD_INTERNAL_ERROR;
+ return SSL3_AL_FATAL;
+ }
+ BIO_printf(bio_err, "SRP parameters set: username = \"%s\" info=\"%s\" \n", p->login,p->user->info);
+ /* need to check whether there are memory leaks */
+ p->user = NULL;
+ p->login = NULL;
+ return SSL_ERROR_NONE;
+ }
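+/* A negative return above suspends the handshake: SSL_accept() fails
+ * and SSL_get_error() reports SSL_ERROR_WANT_X509_LOOKUP. The main
+ * loop then loads the verifier with SRP_VBASE_get_by_user() and calls
+ * SSL_accept() again, re-entering this callback with p->user set. */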
+
+#endif
+
#ifdef MONOLITH
static void s_server_init(void)
{
@@ -455,9 +508,15 @@ static void sv_usage(void)
# ifndef OPENSSL_NO_JPAKE
BIO_printf(bio_err," -jpake arg - JPAKE secret to use\n");
# endif
+#endif
+#ifndef OPENSSL_NO_SRP
+ BIO_printf(bio_err," -srpvfile file - The verifier file for SRP\n");
+ BIO_printf(bio_err," -srpuserseed string - A seed string for a default user salt.\n");
#endif
BIO_printf(bio_err," -ssl2 - Just talk SSLv2\n");
BIO_printf(bio_err," -ssl3 - Just talk SSLv3\n");
+ BIO_printf(bio_err," -tls1_2 - Just talk TLSv1.2\n");
+ BIO_printf(bio_err," -tls1_1 - Just talk TLSv1.1\n");
BIO_printf(bio_err," -tls1 - Just talk TLSv1\n");
BIO_printf(bio_err," -dtls1 - Just talk DTLSv1\n");
BIO_printf(bio_err," -timeout - Enable timeouts\n");
@@ -466,6 +525,8 @@ static void sv_usage(void)
BIO_printf(bio_err," -no_ssl2 - Just disable SSLv2\n");
BIO_printf(bio_err," -no_ssl3 - Just disable SSLv3\n");
BIO_printf(bio_err," -no_tls1 - Just disable TLSv1\n");
+ BIO_printf(bio_err," -no_tls1_1 - Just disable TLSv1.1\n");
+ BIO_printf(bio_err," -no_tls1_2 - Just disable TLSv1.2\n");
#ifndef OPENSSL_NO_DH
BIO_printf(bio_err," -no_dhe - Disable ephemeral DH\n");
#endif
@@ -495,7 +556,12 @@ static void sv_usage(void)
# ifndef OPENSSL_NO_NEXTPROTONEG
BIO_printf(bio_err," -nextprotoneg arg - set the advertised protocols for the NPN extension (comma-separated list)\n");
# endif
+# ifndef OPENSSL_NO_SRTP
+ BIO_printf(bio_err," -use_srtp profiles - Offer SRTP key management with a colon-separated profile list\n");
+# endif
#endif
+ BIO_printf(bio_err," -keymatexport label - Export keying material using label\n");
+ BIO_printf(bio_err," -keymatexportlen len - Export len bytes of keying material (default 20)\n");
}
static int local_argc=0;
@@ -846,7 +912,9 @@ static int next_proto_cb(SSL *s, const unsigned char **data, unsigned int *len,
return SSL_TLSEXT_ERR_OK;
}
-# endif /* ndef OPENSSL_NO_NPN */
+# endif /* ndef OPENSSL_NO_NEXTPROTONEG */
+
+
#endif
int MAIN(int, char **);
@@ -854,6 +922,12 @@ int MAIN(int, char **);
#ifndef OPENSSL_NO_JPAKE
static char *jpake_secret = NULL;
#endif
+#ifndef OPENSSL_NO_SRP
+ static srpsrvparm srp_callback_parm;
+#endif
+#ifndef OPENSSL_NO_SRTP
+static char *srtp_profiles = NULL;
+#endif
int MAIN(int argc, char *argv[])
{
@@ -885,8 +959,6 @@ int MAIN(int argc, char *argv[])
#ifndef OPENSSL_NO_TLSEXT
EVP_PKEY *s_key2 = NULL;
X509 *s_cert2 = NULL;
-#endif
-#ifndef OPENSSL_NO_TLSEXT
tlsextctx tlsextcbp = {NULL, NULL, SSL_TLSEXT_ERR_ALERT_WARNING};
# ifndef OPENSSL_NO_NEXTPROTONEG
const char *next_proto_neg_in = NULL;
@@ -897,13 +969,11 @@ int MAIN(int argc, char *argv[])
/* by default do not send a PSK identity hint */
static char *psk_identity_hint=NULL;
#endif
-#if !defined(OPENSSL_NO_SSL2) && !defined(OPENSSL_NO_SSL3)
- meth=SSLv23_server_method();
-#elif !defined(OPENSSL_NO_SSL3)
- meth=SSLv3_server_method();
-#elif !defined(OPENSSL_NO_SSL2)
- meth=SSLv2_server_method();
+#ifndef OPENSSL_NO_SRP
+ char *srpuserseed = NULL;
+ char *srp_verifier_file = NULL;
#endif
+ meth=SSLv23_server_method();
local_argc=argc;
local_argv=argv;
@@ -1128,12 +1198,26 @@ int MAIN(int argc, char *argv[])
psk_key=*(++argv);
for (i=0; i<strlen(psk_key); i++)
{
if (isxdigit((int)psk_key[i]))
continue;
BIO_printf(bio_err,"Not a hex number '%s'\n",*argv);
goto bad;
}
}
#endif
+#ifndef OPENSSL_NO_SRP
+ else if (strcmp(*argv, "-srpvfile") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srp_verifier_file = *(++argv);
+ meth = TLSv1_server_method();
+ }
+ else if (strcmp(*argv, "-srpuserseed") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srpuserseed = *(++argv);
+ meth = TLSv1_server_method();
+ }
+#endif
#ifndef OPENSSL_NO_KRB5
- if ((con->kssl_ctx = kssl_ctx_new()) != NULL)
+ if ((kctx = kssl_ctx_new()) != NULL)
{
- kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVICE,
- KRB5SVC);
- kssl_ctx_setstring(con->kssl_ctx, KSSL_KEYTAB,
- KRB5KEYTAB);
+ SSL_set0_kssl_ctx(con, kctx);
+ kssl_ctx_setstring(kctx, KSSL_SERVICE, KRB5SVC);
+ kssl_ctx_setstring(kctx, KSSL_KEYTAB, KRB5KEYTAB);
}
#endif /* OPENSSL_NO_KRB5 */
if(context)
@@ -1924,7 +2068,7 @@ static int sv_body(char *hostname, int s, unsigned char *context)
if (s_debug)
{
- con->debug=1;
+ SSL_set_debug(con, 1);
BIO_set_callback(SSL_get_rbio(con),bio_dump_callback);
BIO_set_callback_arg(SSL_get_rbio(con),(char *)bio_s_out);
}
@@ -2053,6 +2197,16 @@ static int sv_body(char *hostname, int s, unsigned char *context)
goto err;
}
+#ifndef OPENSSL_NO_HEARTBEATS
+ if ((buf[0] == 'B') &&
+ ((buf[1] == '\n') || (buf[1] == '\r')))
+ {
+ BIO_printf(bio_err,"HEARTBEATING\n");
+ SSL_heartbeat(con);
+ i=0;
+ continue;
+ }
+#endif
if ((buf[0] == 'r') &&
((buf[1] == '\n') || (buf[1] == '\r')))
{
@@ -2096,6 +2250,18 @@ static int sv_body(char *hostname, int s, unsigned char *context)
{ static count=0; if (++count == 100) { count=0; SSL_renegotiate(con); } }
#endif
k=SSL_write(con,&(buf[l]),(unsigned int)i);
+#ifndef OPENSSL_NO_SRP
+ while (SSL_get_error(con,k) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP renego during write\n");
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ k=SSL_write(con,&(buf[l]),(unsigned int)i);
+ }
+#endif
switch (SSL_get_error(con,k))
{
case SSL_ERROR_NONE:
@@ -2143,6 +2309,18 @@ static int sv_body(char *hostname, int s, unsigned char *context)
{
again:
i=SSL_read(con,(char *)buf,bufsize);
+#ifndef OPENSSL_NO_SRP
+ while (SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP renego during read\n");
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ i=SSL_read(con,(char *)buf,bufsize);
+ }
+#endif
switch (SSL_get_error(con,i))
{
case SSL_ERROR_NONE:
@@ -2155,7 +2333,6 @@ static int sv_body(char *hostname, int s, unsigned char *context)
break;
case SSL_ERROR_WANT_WRITE:
case SSL_ERROR_WANT_READ:
- case SSL_ERROR_WANT_X509_LOOKUP:
BIO_printf(bio_s_out,"Read BLOCK\n");
break;
case SSL_ERROR_SYSCALL:
@@ -2210,12 +2387,30 @@ static int init_ssl_connection(SSL *con)
X509 *peer;
long verify_error;
MS_STATIC char buf[BUFSIZ];
+#ifndef OPENSSL_NO_KRB5
+ char *client_princ;
+#endif
#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
const unsigned char *next_proto_neg;
unsigned next_proto_neg_len;
#endif
+ unsigned char *exportedkeymat;
+
- if ((i=SSL_accept(con)) <= 0)
+ i=SSL_accept(con);
+#ifndef OPENSSL_NO_SRP
+ while (i <= 0 && SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP during accept %s\n",srp_callback_parm.login);
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ i=SSL_accept(con);
+ }
+#endif
+ if (i <= 0)
{
if (BIO_sock_should_retry(i))
{
@@ -2253,6 +2448,7 @@ static int init_ssl_connection(SSL *con)
BIO_printf(bio_s_out,"Shared ciphers:%s\n",buf);
str=SSL_CIPHER_get_name(SSL_get_current_cipher(con));
BIO_printf(bio_s_out,"CIPHER is %s\n",(str != NULL)?str:"(NONE)");
+
#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
SSL_get0_next_proto_negotiated(con, &next_proto_neg, &next_proto_neg_len);
if (next_proto_neg)
@@ -2262,19 +2458,60 @@ static int init_ssl_connection(SSL *con)
BIO_printf(bio_s_out, "\n");
}
#endif
- if (con->hit) BIO_printf(bio_s_out,"Reused session-id\n");
+#ifndef OPENSSL_NO_SRTP
+ {
+ SRTP_PROTECTION_PROFILE *srtp_profile
+ = SSL_get_selected_srtp_profile(con);
+
+ if(srtp_profile)
+ BIO_printf(bio_s_out,"SRTP Extension negotiated, profile=%s\n",
+ srtp_profile->name);
+ }
+#endif
+ if (SSL_cache_hit(con)) BIO_printf(bio_s_out,"Reused session-id\n");
if (SSL_ctrl(con,SSL_CTRL_GET_FLAGS,0,NULL) &
TLS1_FLAGS_TLS_PADDING_BUG)
- BIO_printf(bio_s_out,"Peer has incorrect TLSv1 block padding\n");
+ BIO_printf(bio_s_out,
+ "Peer has incorrect TLSv1 block padding\n");
#ifndef OPENSSL_NO_KRB5
- if (con->kssl_ctx->client_princ != NULL)
+ client_princ = kssl_ctx_get0_client_princ(SSL_get0_kssl_ctx(con));
+ if (client_princ != NULL)
{
BIO_printf(bio_s_out,"Kerberos peer principal is %s\n",
- con->kssl_ctx->client_princ);
+ client_princ);
}
#endif /* OPENSSL_NO_KRB5 */
BIO_printf(bio_s_out, "Secure Renegotiation IS%s supported\n",
SSL_get_secure_renegotiation_support(con) ? "" : " NOT");
+ if (keymatexportlabel != NULL)
+ {
+ BIO_printf(bio_s_out, "Keying material exporter:\n");
+ BIO_printf(bio_s_out, " Label: '%s'\n", keymatexportlabel);
+ BIO_printf(bio_s_out, " Length: %i bytes\n",
+ keymatexportlen);
+ exportedkeymat = OPENSSL_malloc(keymatexportlen);
+ if (exportedkeymat != NULL)
+ {
+ if (!SSL_export_keying_material(con, exportedkeymat,
+ keymatexportlen,
+ keymatexportlabel,
+ strlen(keymatexportlabel),
+ NULL, 0, 0))
+ {
+ BIO_printf(bio_s_out, " Error\n");
+ }
+ else
+ {
+ BIO_printf(bio_s_out, " Keying material: ");
+ for (i=0; i<keymatexportlen; i++)
+ BIO_printf(bio_s_out, "%02X", exportedkeymat[i]);
+ BIO_printf(bio_s_out, "\n");
+ }
+ OPENSSL_free(exportedkeymat);
+ }
+ }
#ifndef OPENSSL_NO_KRB5
- if ((con->kssl_ctx = kssl_ctx_new()) != NULL)
+ if ((kctx = kssl_ctx_new()) != NULL)
{
- kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVICE, KRB5SVC);
- kssl_ctx_setstring(con->kssl_ctx, KSSL_KEYTAB, KRB5KEYTAB);
+ kssl_ctx_setstring(kctx, KSSL_SERVICE, KRB5SVC);
+ kssl_ctx_setstring(kctx, KSSL_KEYTAB, KRB5KEYTAB);
}
#endif /* OPENSSL_NO_KRB5 */
if(context) SSL_set_session_id_context(con, context,
@@ -2382,7 +2625,7 @@ static int www_body(char *hostname, int s, unsigned char *context)
if (s_debug)
{
- con->debug=1;
+ SSL_set_debug(con, 1);
BIO_set_callback(SSL_get_rbio(con),bio_dump_callback);
BIO_set_callback_arg(SSL_get_rbio(con),(char *)bio_s_out);
}
@@ -2397,7 +2640,18 @@ static int www_body(char *hostname, int s, unsigned char *context)
if (hack)
{
i=SSL_accept(con);
-
+#ifndef OPENSSL_NO_SRP
+ while (i <= 0 && SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP during accept %s\n",srp_callback_parm.login);
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ i=SSL_accept(con);
+ }
+#endif
switch (SSL_get_error(con,i))
{
case SSL_ERROR_NONE:
@@ -2465,6 +2719,11 @@ static int www_body(char *hostname, int s, unsigned char *context)
}
BIO_puts(io,"\n");
+ BIO_printf(io,
+ "Secure Renegotiation IS%s supported\n",
+ SSL_get_secure_renegotiation_support(con) ?
+ "" : " NOT");
+
/* The following is evil and should not really
* be done */
BIO_printf(io,"Ciphers supported in s_server binary\n");
@@ -2503,7 +2762,7 @@ static int www_body(char *hostname, int s, unsigned char *context)
}
BIO_puts(io,"\n");
}
- BIO_printf(io,((con->hit)
+ BIO_printf(io,(SSL_cache_hit(con)
?"---\nReused, "
:"---\nNew, "));
c=SSL_get_current_cipher(con);
diff --git a/apps/s_socket.c b/apps/s_socket.c
index c08544a..380efdb 100644
--- a/apps/s_socket.c
+++ b/apps/s_socket.c
@@ -238,11 +238,10 @@ int init_client(int *sock, char *host, int port, int type)
{
unsigned char ip[4];
+ memset(ip, '\0', sizeof ip);
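+ /* Defensive zero-fill: ip[] never carries stack garbage forward,
+ * and memory checkers stay quiet about it. */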
if (!host_ip(host,&(ip[0])))
- {
- return(0);
- }
- return(init_client_ip(sock,ip,port,type));
+ return 0;
+ return init_client_ip(sock,ip,port,type);
}
static int init_client_ip(int *sock, unsigned char ip[4], int port, int type)
diff --git a/apps/server.pem b/apps/server.pem
index 56248e5..d0fc265 100644
--- a/apps/server.pem
+++ b/apps/server.pem
@@ -1,369 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (512 bit)
------BEGIN CERTIFICATE-----
-MIIB6TCCAVICAQYwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNMDAxMDE2MjIzMTAzWhcNMDMwMTE0
-MjIzMTAzWjBjMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxIzAhBgNVBAMTGlNlcnZlciB0ZXN0IGNl
-cnQgKDUxMiBiaXQpMFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBAJ+zw4Qnlf8SMVIP
-Fe9GEcStgOY2Ww/dgNdhjeD8ckUJNP5VZkVDTGiXav6ooKXfX3j/7tdkuD8Ey2//
-Kv7+ue0CAwEAATANBgkqhkiG9w0BAQQFAAOBgQCT0grFQeZaqYb5EYfk20XixZV4
-GmyAbXMftG1Eo7qGiMhYzRwGNWxEYojf5PZkYZXvSqZ/ZXHXa4g59jK/rJNnaVGM
-k+xIX8mxQvlV0n5O9PIha5BX5teZnkHKgL8aKKLKW1BK7YTngsfSzzaeame5iKfz
-itAE+OjGF+PFKbwX8Q==
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIIBPAIBAAJBAJ+zw4Qnlf8SMVIPFe9GEcStgOY2Ww/dgNdhjeD8ckUJNP5VZkVD
-TGiXav6ooKXfX3j/7tdkuD8Ey2//Kv7+ue0CAwEAAQJAN6W31vDEP2DjdqhzCDDu
-OA4NACqoiFqyblo7yc2tM4h4xMbC3Yx5UKMN9ZkCtX0gzrz6DyF47bdKcWBzNWCj
-gQIhANEoojVt7hq+SQ6MCN6FTAysGgQf56Q3TYoJMoWvdiXVAiEAw3e3rc+VJpOz
-rHuDo6bgpjUAAXM+v3fcpsfZSNO6V7kCIQCtbVjanpUwvZkMI9by02oUk9taki3b
-PzPfAfNPYAbCJQIhAJXNQDWyqwn/lGmR11cqY2y9nZ1+5w3yHGatLrcDnQHxAiEA
-vnlEGo8K85u+KwIOimM48ZG8oTk7iFdkqLJR1utT3aU=
------END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
+-----BEGIN CERTIFICATE-----
+MIID5zCCAs+gAwIBAgIJALnu1NlVpZ6zMA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZDELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxGTAXBgNVBAMMEFRlc3QgU2VydmVyIENlcnQw
+ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDzhPOSNtyyRspmeuUpxfNJ
+KCLTuf7g3uQ4zu4iHOmRO5TQci+HhVlLZrHF9XqFXcIP0y4pWDbMSGuiorUmzmfi
+R7bfSdI/+qIQt8KXRH6HNG1t8ou0VSvWId5TS5Dq/er5ODUr9OaaDva7EquHIcMv
+vPQGuI+OEAcnleVCy9HVEIySrO4P3CNIicnGkwwiAud05yUAq/gPXBC1hTtmlPD7
+TVcGVSEiJdvzqqlgv02qedGrkki6GY4S7GjZxrrf7Foc2EP+51LJzwLQx3/JfrCU
+41NEWAsu/Sl0tQabXESN+zJ1pDqoZ3uHMgpQjeGiE0olr+YcsSW/tJmiU9OiAr8R
+AgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJYIZI
+AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW
+BBSCvM8AABPR9zklmifnr9LvIBturDAfBgNVHSMEGDAWgBQ2w2yI55X+sL3szj49
+hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEAqb1NV0B0/pbpK9Z4/bNjzPQLTRLK
+WnSNm/Jh5v0GEUOE/Beg7GNjNrmeNmqxAlpqWz9qoeoFZax+QBpIZYjROU3TS3fp
+yLsrnlr0CDQ5R7kCCDGa8dkXxemmpZZLbUCpW2Uoy8sAA4JjN9OtsZY7dvUXFgJ7
+vVNTRnI01ghknbtD+2SxSQd3CWF6QhcRMAzZJ1z1cbbwGDDzfvGFPzJ+Sq+zEPds
+xoVLLSetCiBc+40ZcDS5dV98h9XD7JMTQfxzA7mNGv73JoZJA6nFgj+ADSlJsY/t
+JBv+z1iQRueoh9Qeee+ZbRifPouCB8FDx+AltvHTANdAq0t/K3o+pplMVA==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
+MIIEpAIBAAKCAQEA84TzkjbcskbKZnrlKcXzSSgi07n+4N7kOM7uIhzpkTuU0HIv
+h4VZS2axxfV6hV3CD9MuKVg2zEhroqK1Js5n4ke230nSP/qiELfCl0R+hzRtbfKL
+tFUr1iHeU0uQ6v3q+Tg1K/Tmmg72uxKrhyHDL7z0BriPjhAHJ5XlQsvR1RCMkqzu
+D9wjSInJxpMMIgLndOclAKv4D1wQtYU7ZpTw+01XBlUhIiXb86qpYL9NqnnRq5JI
+uhmOEuxo2ca63+xaHNhD/udSyc8C0Md/yX6wlONTRFgLLv0pdLUGm1xEjfsydaQ6
+qGd7hzIKUI3hohNKJa/mHLElv7SZolPTogK/EQIDAQABAoIBAADq9FwNtuE5IRQn
+zGtO4q7Y5uCzZ8GDNYr9RKp+P2cbuWDbvVAecYq2NV9QoIiWJOAYZKklOvekIju3
+r0UZLA0PRiIrTg6NrESx3JrjWDK8QNlUO7CPTZ39/K+FrmMkV9lem9yxjJjyC34D
+AQB+YRTx+l14HppjdxNwHjAVQpIx/uO2F5xAMuk32+3K+pq9CZUtrofe1q4Agj9R
+5s8mSy9pbRo9kW9wl5xdEotz1LivFOEiqPUJTUq5J5PeMKao3vdK726XI4Z455Nm
+W2/MA0YV0ug2FYinHcZdvKM6dimH8GLfa3X8xKRfzjGjTiMSwsdjgMa4awY3tEHH
+674jhAECgYEA/zqMrc0zsbNk83sjgaYIug5kzEpN4ic020rSZsmQxSCerJTgNhmg
+utKSCt0Re09Jt3LqG48msahX8ycqDsHNvlEGPQSbMu9IYeO3Wr3fAm75GEtFWePY
+BhM73I7gkRt4s8bUiUepMG/wY45c5tRF23xi8foReHFFe9MDzh8fJFECgYEA9EFX
+4qAik1pOJGNei9BMwmx0I0gfVEIgu0tzeVqT45vcxbxr7RkTEaDoAG6PlbWP6D9a
+WQNLp4gsgRM90ZXOJ4up5DsAWDluvaF4/omabMA+MJJ5kGZ0gCj5rbZbKqUws7x8
+bp+6iBfUPJUbcqNqFmi/08Yt7vrDnMnyMw2A/sECgYEAiiuRMxnuzVm34hQcsbhH
+6ymVqf7j0PW2qK0F4H1ocT9qhzWFd+RB3kHWrCjnqODQoI6GbGr/4JepHUpre1ex
+4UEN5oSS3G0ru0rC3U4C59dZ5KwDHFm7ffZ1pr52ljfQDUsrjjIMRtuiwNK2OoRa
+WSsqiaL+SDzSB+nBmpnAizECgYBdt/y6rerWUx4MhDwwtTnel7JwHyo2MDFS6/5g
+n8qC2Lj6/fMDRE22w+CA2esp7EJNQJGv+b27iFpbJEDh+/Lf5YzIT4MwVskQ5bYB
+JFcmRxUVmf4e09D7o705U/DjCgMH09iCsbLmqQ38ONIRSHZaJtMDtNTHD1yi+jF+
+OT43gQKBgQC/2OHZoko6iRlNOAQ/tMVFNq7fL81GivoQ9F1U0Qr+DH3ZfaH8eIkX
+xT0ToMPJUzWAn8pZv0snA0um6SIgvkCuxO84OkANCVbttzXImIsL7pFzfcwV/ERK
+UM6j0ZuSMFOCr/lGPAoOQU0fskidGEHi1/kW+suSr28TqsyYZpwBDQ==
-----END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8=
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----
diff --git a/apps/server2.pem b/apps/server2.pem
index 8bb6641..a3927cf 100644
--- a/apps/server2.pem
+++ b/apps/server2.pem
@@ -1,376 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (1024 bit)
------BEGIN CERTIFICATE-----
-MIICLjCCAZcCAQEwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU0WhcNOTgwNjA5
-MTM1NzU0WjBkMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxJDAiBgNVBAMTG1NlcnZlciB0ZXN0IGNl
-cnQgKDEwMjQgYml0KTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAsxH1PBPm
-RkxrR11eV4bzNi4N9n11CI8nV29+ARlT1+qDe/mjVUvXlmsr1v/vf71G9GgqopSa
-6RXrICLVdk/FYYYzhPvl1M+OrjaXDFO8BzBAF1Lnz6c7aRZvGRJNrRSr2nZEkqDf
-JW9dY7r2VZEpD5QeuaRYUnuECkqeieB65GMCAwEAATANBgkqhkiG9w0BAQQFAAOB
-gQCWsOta6C0wiVzXz8wPmJKyTrurMlgUss2iSuW9366iwofZddsNg7FXniMzkIf6
-dp7jnmWZwKZ9cXsNUS2o4OL07qOk2HOywC0YsNZQsOBu1CBTYYkIefDiKFL1zQHh
-8lwwNd4NP+OE3NzUNkCfh4DnFfg9WHkXUlD5UpxNRJ4gJA==
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQCzEfU8E+ZGTGtHXV5XhvM2Lg32fXUIjydXb34BGVPX6oN7+aNV
-S9eWayvW/+9/vUb0aCqilJrpFesgItV2T8VhhjOE++XUz46uNpcMU7wHMEAXUufP
-pztpFm8ZEk2tFKvadkSSoN8lb11juvZVkSkPlB65pFhSe4QKSp6J4HrkYwIDAQAB
-AoGBAKy8jvb0Lzby8q11yNLf7+78wCVdYi7ugMHcYA1JVFK8+zb1WfSm44FLQo/0
-dSChAjgz36TTexeLODPYxleJndjVcOMVzsLJjSM8dLpXsTS4FCeMbhw2s2u+xqKY
-bbPWfk+HOTyJjfnkcC5Nbg44eOmruq0gSmBeUXVM5UntlTnxAkEA7TGCA3h7kx5E
-Bl4zl2pc3gPAGt+dyfk5Po9mGJUUXhF5p2zueGmYWW74TmOWB1kzt4QRdYMzFePq
-zfDNXEa1CwJBAMFErdY0xp0UJ13WwBbUTk8rujqQdHtjw0klhpbuKkjxu2hN0wwM
-6p0D9qxF7JHaghqVRI0fAW/EE0OzdHMR9QkCQQDNR26dMFXKsoPu+vItljj/UEGf
-QG7gERiQ4yxaFBPHgdpGo0kT31eh9x9hQGDkxTe0GNG/YSgCRvm8+C3TMcKXAkBD
-dhGn36wkUFCddMSAM4NSJ1VN8/Z0y5HzCmI8dM3VwGtGMUQlxKxwOl30LEQzdS5M
-0SWojNYXiT2gOBfBwtbhAkEAhafl5QEOIgUz+XazS/IlZ8goNKdDVfYgK3mHHjvv
-nY5G+AuGebdNkXJr4KSWxDcN+C2i47zuj4QXA16MAOandA==
------END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert #2
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
+-----BEGIN CERTIFICATE-----
+MIID6jCCAtKgAwIBAgIJALnu1NlVpZ60MA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZzELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxHDAaBgNVBAMME1Rlc3QgU2VydmVyIENlcnQg
+IzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDrdi7j9yctG+L4EjBy
+gjPmEqZzOJEQba26MoQGzglU7e5Xf59Rb/hgVQuKAoiZe7/R8rK4zJ4W7iXdXw0L
+qBpyG8B5aGKeI32w+A9TcBApoXXL2CrYQEQjZwUIpLlYBIi2NkJj3nVkq5dgl1gO
+ALiQ+W8jg3kzg5Ec9rimp9r93N8wsSL3awsafurmYCvOf7leHaMP1WJ/zDRGUNHG
+/WtDjXc8ZUG1+6EXU9Jc2Fs+2Omf7fcN0l00AK/wPg8OaNS0rKyGq9JdIT9FRGV1
+bXe/rx58FaE5CItdwCSYhJvF/O95LWQoxJXye5bCFLmvDTEyVq9FMSCptfsmbXjE
+ZGsXAgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJ
+YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud
+DgQWBBR52UaWWTKzZGDH/X4mWNcuqeQVazAfBgNVHSMEGDAWgBQ2w2yI55X+sL3s
+zj49hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEANBW+XYLlHBqVY/31ie+3gRlS
+LPfy4SIqn0t3RJjagT29MXprblBO2cbMO8VGjkQdKGpmMXjxbht2arOOUXRHX4n/
+XTyn/QHEf0bcwIITMReO3DZUPAEw8hSjn9xEOM0IRVOCP+mH5fi74QzzQaZVCyYg
+5VtLKdww/+sc0nCbKl2KWgDluriH0nfVx95qgW3mg9dhXRr0zmf1w2zkBHYpARYL
+Dew6Z8EE4tS3HJu8/qM6meWzNtrfonQ3eiiMxjZBxzV46jchBwa2z9XYhP6AmpPb
+oeTSzcQNbWsxaGYzWo46oLDUZmJOwSBawbS31bZNMCoPIY6ukoesCzFSsUKZww==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
+MIIEowIBAAKCAQEA63Yu4/cnLRvi+BIwcoIz5hKmcziREG2tujKEBs4JVO3uV3+f
+UW/4YFULigKImXu/0fKyuMyeFu4l3V8NC6gachvAeWhiniN9sPgPU3AQKaF1y9gq
+2EBEI2cFCKS5WASItjZCY951ZKuXYJdYDgC4kPlvI4N5M4ORHPa4pqfa/dzfMLEi
+92sLGn7q5mArzn+5Xh2jD9Vif8w0RlDRxv1rQ413PGVBtfuhF1PSXNhbPtjpn+33
+DdJdNACv8D4PDmjUtKyshqvSXSE/RURldW13v68efBWhOQiLXcAkmISbxfzveS1k
+KMSV8nuWwhS5rw0xMlavRTEgqbX7Jm14xGRrFwIDAQABAoIBAHLsTPihIfLnYIE5
+x4GsQQ5zXeBw5ITDM37ktwHnQDC+rIzyUl1aLD1AZRBoKinXd4lOTqLZ4/NHKx4A
+DYr58mZtWyUmqLOMmQVuHXTZBlp7XtYuXMMNovQwjQlp9LicBeoBU6gQ5PVMtubD
+F4xGF89Sn0cTHW3iMkqTtQ5KcR1j57OcJO0FEb1vPvk2MXI5ZyAatUYE7YacbEzd
+rg02uIwx3FqNSkuSI79uz4hMdV5TPtuhxx9nTwj9aLUhXFeZ0mn2PVgVzEnnMoJb
++znlsZDgzDlJqdaD744YGWh8Z3OEssB35KfzFcdOeO6yH8lmv2Zfznk7pNPT7LTb
+Lae9VgkCgYEA92p1qnAB3NtJtNcaW53i0S5WJgS1hxWKvUDx3lTB9s8X9fHpqL1a
+E94fDfWzp/hax6FefUKIvBOukPLQ6bYjTMiFoOHzVirghAIuIUoMI5VtLhwD1hKs
+Lr7l/dptMgKb1nZHyXoKHRBthsy3K4+udsPi8TzMvYElgEqyQIe/Rk0CgYEA86GL
+8HC6zLszzKERDPBxrboRmoFvVUCTQDhsfj1M8aR3nQ8V5LkdIJc7Wqm/Ggfk9QRf
+rJ8M2WUMlU5CNnCn/KCrKzCNZIReze3fV+HnKdbcXGLvgbHPrhnz8yYehUFG+RGq
+bVyDWRU94T38izy2s5qMYrMJWZEYyXncSPbfcPMCgYAtaXfxcZ+V5xYPQFARMtiX
+5nZfggvDoJuXgx0h3tK/N2HBfcaSdzbaYLG4gTmZggc/jwnl2dl5E++9oSPhUdIG
+3ONSFUbxsOsGr9PBvnKd8WZZyUCXAVRjPBzAzF+whzQNWCZy/5htnz9LN7YDI9s0
+5113Q96cheDZPFydZY0hHQKBgQDVbEhNukM5xCiNcu+f2SaMnLp9EjQ4h5g3IvaP
+5B16daw/Dw8LzcohWboqIxeAsze0GD/D1ZUJAEd0qBjC3g+a9BjefervCjKOzXng
+38mEUm+6EwVjJSQcjSmycEs+Sr/kwr/8i5WYvU32+jk4tFgMoC+o6tQe/Uesf68k
+z/dPVwKBgGbF7Vv1/3SmhlOy+zYyvJ0CrWtKxH9QP6tLIEgEpd8x7YTSuCH94yok
+kToMXYA3sWNPt22GbRDZ+rcp4c7HkDx6I6vpdP9aQEwJTp0EPy0sgWr2XwYmreIQ
+NFmkk8Itn9EY2R9VBaP7GLv5kvwxDdLAnmwGmzVtbmaVdxCaBwUk
-----END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8=
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----
diff --git a/apps/sess_id.c b/apps/sess_id.c
index b99179f..b16686c 100644
--- a/apps/sess_id.c
+++ b/apps/sess_id.c
@@ -90,6 +90,7 @@ int MAIN(int, char **);
int MAIN(int argc, char **argv)
{
SSL_SESSION *x=NULL;
+ X509 *peer = NULL;
int ret=1,i,num,badops=0;
BIO *out=NULL;
int informat,outformat;
@@ -163,16 +164,17 @@ int MAIN(int argc, char **argv)
ERR_load_crypto_strings();
x=load_sess_id(infile,informat);
if (x == NULL) { goto end; }
+ peer = SSL_SESSION_get0_peer(x);
if(context)
{
- x->sid_ctx_length=strlen(context);
- if(x->sid_ctx_length > SSL_MAX_SID_CTX_LENGTH)
+ size_t ctx_len = strlen(context);
+ if(ctx_len > SSL_MAX_SID_CTX_LENGTH)
{
BIO_printf(bio_err,"Context too long\n");
goto end;
}
- memcpy(x->sid_ctx,context,x->sid_ctx_length);
+ SSL_SESSION_set1_id_context(x, (unsigned char *)context, ctx_len);
}
#ifdef undef
@@ -231,10 +233,10 @@ int MAIN(int argc, char **argv)
if (cert)
{
- if (x->peer == NULL)
+ if (peer == NULL)
BIO_puts(out,"No certificate present\n");
else
- X509_print(out,x->peer);
+ X509_print(out,peer);
}
}
@@ -253,12 +255,12 @@ int MAIN(int argc, char **argv)
goto end;
}
}
- else if (!noout && (x->peer != NULL)) /* just print the certificate */
+ else if (!noout && (peer != NULL)) /* just print the certificate */
{
if (outformat == FORMAT_ASN1)
- i=(int)i2d_X509_bio(out,x->peer);
+ i=(int)i2d_X509_bio(out,peer);
else if (outformat == FORMAT_PEM)
- i=PEM_write_bio_X509(out,x->peer);
+ i=PEM_write_bio_X509(out,peer);
else {
BIO_printf(bio_err,"bad output format specified for outfile\n");
goto end;
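
The sess_id changes above stop reaching into SSL_SESSION internals
(x->peer, x->sid_ctx) and use the SSL_SESSION_get0_peer() and
SSL_SESSION_set1_id_context() accessors instead; the tool behaves the
same. A quick way to exercise the modified paths (host and file names
are examples only):

  $ openssl s_client -connect example.com:443 -sess_out sess.pem </dev/null
  $ openssl sess_id -in sess.pem -text -noout -cert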
diff --git a/apps/speed.c b/apps/speed.c
index b3c5442..9c251eb 100644
--- a/apps/speed.c
+++ b/apps/speed.c
@@ -108,8 +108,14 @@
#include <signal.h>
#endif
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__CYGWIN__)
#include <windows.h>
+# if defined(__CYGWIN__) && !defined(_WIN32)
+ /* <windows.h> should define _WIN32, which normally is mutually
+ * exclusive with __CYGWIN__, but if it didn't... */
+# define _WIN32
+ /* this is done because Cygwin alarm() fails sometimes. */
+# endif
#endif
#include <openssl/bn.h>
@@ -183,6 +189,25 @@
#ifndef OPENSSL_NO_ECDH
#include <openssl/ecdh.h>
#endif
+#include <openssl/modes.h>
+
+#ifdef OPENSSL_FIPS
+#ifdef OPENSSL_DOING_MAKEDEPEND
+#undef AES_set_encrypt_key
+#undef AES_set_decrypt_key
+#undef DES_set_key_unchecked
+#endif
+#define BF_set_key private_BF_set_key
+#define CAST_set_key private_CAST_set_key
+#define idea_set_encrypt_key private_idea_set_encrypt_key
+#define SEED_set_key private_SEED_set_key
+#define RC2_set_key private_RC2_set_key
+#define RC4_set_key private_RC4_set_key
+#define DES_set_key_unchecked private_DES_set_key_unchecked
+#define AES_set_encrypt_key private_AES_set_encrypt_key
+#define AES_set_decrypt_key private_AES_set_decrypt_key
+#define Camellia_set_key private_Camellia_set_key
+#endif
#ifndef HAVE_FORK
# if defined(OPENSSL_SYS_VMS) || defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_MACINTOSH_CLASSIC) || defined(OPENSSL_SYS_OS2) || defined(OPENSSL_SYS_NETWARE)
@@ -214,7 +239,7 @@ static void print_result(int alg,int run_no,int count,double time_used);
static int do_multi(int multi);
#endif
-#define ALGOR_NUM 29
+#define ALGOR_NUM 30
#define SIZE_NUM 5
#define RSA_NUM 4
#define DSA_NUM 3
@@ -229,7 +254,7 @@ static const char *names[ALGOR_NUM]={
"aes-128 cbc","aes-192 cbc","aes-256 cbc",
"camellia-128 cbc","camellia-192 cbc","camellia-256 cbc",
"evp","sha256","sha512","whirlpool",
- "aes-128 ige","aes-192 ige","aes-256 ige"};
+ "aes-128 ige","aes-192 ige","aes-256 ige","ghash" };
static double results[ALGOR_NUM][SIZE_NUM];
static int lengths[SIZE_NUM]={16,64,256,1024,8*1024};
#ifndef OPENSSL_NO_RSA
@@ -273,9 +298,12 @@ static SIGRETTYPE sig_done(int sig)
#if defined(_WIN32)
-#define SIGALRM
+#if !defined(SIGALRM)
+# define SIGALRM
+#endif
static unsigned int lapse,schlock;
-static void alarm(unsigned int secs) { lapse = secs*1000; }
+static void alarm_win32(unsigned int secs) { lapse = secs*1000; }
+#define alarm alarm_win32
static DWORD WINAPI sleepy(VOID *arg)
{
@@ -469,6 +497,7 @@ int MAIN(int argc, char **argv)
#define D_IGE_128_AES 26
#define D_IGE_192_AES 27
#define D_IGE_256_AES 28
+#define D_GHASH 29
double d=0.0;
long c[ALGOR_NUM][SIZE_NUM];
#define R_DSA_512 0
@@ -894,6 +923,10 @@ int MAIN(int argc, char **argv)
doit[D_CBC_192_AES]=1;
doit[D_CBC_256_AES]=1;
}
+ else if (strcmp(*argv,"ghash") == 0)
+ {
+ doit[D_GHASH]=1;
+ }
else
#endif
#ifndef OPENSSL_NO_CAMELLIA
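
The hunks above add a "ghash" benchmark to speed: ALGOR_NUM grows to 30
and the new D_GHASH slot times GHASH, the GF(2^128) universal hash that
AES-GCM uses for authentication, through the interface declared in
openssl/modes.h. Once built, the new mode is exercised with:

  $ openssl speed ghash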
@@ -1264,6 +1297,7 @@ int MAIN(int argc, char **argv)
c[D_IGE_128_AES][0]=count;
c[D_IGE_192_AES][0]=count;
c[D_IGE_256_AES][0]=count;
+ c[D_GHASH][0]=count;
+ for (i=1; i<SIZE_NUM; i++)
diff --git a/apps/srp.c b/apps/srp.c
new file mode 100644
--- /dev/null
+++ b/apps/srp.c
+#include <openssl/opensslconf.h>
+
+#ifndef OPENSSL_NO_SRP
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <openssl/conf.h>
+#include <openssl/bio.h>
+#include <openssl/err.h>
+#include <openssl/txt_db.h>
+#include <openssl/buffer.h>
+#include <openssl/srp.h>
+
+#include "apps.h"
+
+#undef PROG
+#define PROG srp_main
+
+#define BASE_SECTION "srp"
+#define CONFIG_FILE "openssl.cnf"
+
+#define ENV_RANDFILE "RANDFILE"
+
+#define ENV_DATABASE "srpvfile"
+#define ENV_DEFAULT_SRP "default_srp"
+
+static char *srp_usage[]={
+"usage: srp [args] [user] \n",
+"\n",
+" -verbose Talk alot while doing things\n",
+" -config file A config file\n",
+" -name arg The particular srp definition to use\n",
+" -srpvfile arg The srp verifier file name\n",
+" -add add an user and srp verifier\n",
+" -modify modify the srp verifier of an existing user\n",
+" -delete delete user from verifier file\n",
+" -list list user\n",
+" -gn arg g and N values to be used for new verifier\n",
+" -userinfo arg additional info to be set for user\n",
+" -passin arg input file pass phrase source\n",
+" -passout arg output file pass phrase source\n",
+#ifndef OPENSSL_NO_ENGINE
+" -engine e - use engine e, possibly a hardware device.\n",
+#endif
+NULL
+};
+
+#ifdef EFENCE
+extern int EF_PROTECT_FREE;
+extern int EF_PROTECT_BELOW;
+extern int EF_ALIGNMENT;
+#endif
+
+static CONF *conf=NULL;
+static char *section=NULL;
+
+#define VERBOSE if (verbose)
+#define VVERBOSE if (verbose>1)
+
+
+int MAIN(int, char **);
+
+static int get_index(CA_DB *db, char* id, char type)
+ {
+ char ** pp;
+ int i;
+ if (id == NULL) return -1;
+ if (type == DB_SRP_INDEX)
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data,i);
+ if (pp[DB_srptype][0] == DB_SRP_INDEX && !strcmp(id,pp[DB_srpid]))
+ return i;
+ }
+ else for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data,i);
+
+ if (pp[DB_srptype][0] != DB_SRP_INDEX && !strcmp(id,pp[DB_srpid]))
+ return i;
+ }
+
+ return -1 ;
+ }
+
+static void print_entry(CA_DB *db, BIO *bio, int indx, int verbose, char *s)
+ {
+ if (indx >= 0 && verbose)
+ {
+ int j;
+ char **pp = sk_OPENSSL_PSTRING_value(db->db->data, indx);
+ BIO_printf(bio, "%s \"%s\"\n", s, pp[DB_srpid]);
+ for (j = 0; j < DB_NUMBER; j++)
+ {
+ BIO_printf(bio_err," %d = \"%s\"\n", j, pp[j]);
+ }
+ }
+ }
+
+static void print_index(CA_DB *db, BIO *bio, int indexindex, int verbose)
+ {
+ print_entry(db, bio, indexindex, verbose, "g N entry") ;
+ }
+
+static void print_user(CA_DB *db, BIO *bio, int userindex, int verbose)
+ {
+ if (verbose > 0)
+ {
+ char **pp = sk_OPENSSL_PSTRING_value(db->db->data,userindex);
+
+ if (pp[DB_srptype][0] != 'I')
+ {
+ print_entry(db, bio, userindex, verbose, "User entry");
+ print_entry(db, bio, get_index(db, pp[DB_srpgN], 'I'), verbose, "g N entry");
+ }
+
+ }
+ }
+
+static int update_index(CA_DB *db, BIO *bio, char **row)
+ {
+ char ** irow;
+ int i;
+
+ if ((irow=(char **)OPENSSL_malloc(sizeof(char *)*(DB_NUMBER+1))) == NULL)
+ {
+ BIO_printf(bio_err,"Memory allocation failure\n");
+ return 0;
+ }
+
+ for (i=0; i<DB_NUMBER; i++)
+ irow[i]=row[i];
+ irow[DB_NUMBER]=NULL;
+
+ if (!TXT_DB_insert(db->db,irow))
+ {
+ BIO_printf(bio,"failed to update srpvfile\n");
+ BIO_printf(bio,"TXT_DB error number %ld\n",db->db->error);
+ OPENSSL_free(irow);
+ return 0;
+ }
+ return 1;
+ }
+
+static void lookup_fail(const char *name, char *tag)
+ {
+ BIO_printf(bio_err,"variable lookup failed for %s::%s\n",name,tag);
+ }
+
+
+static char *srp_verify_user(const char *user, const char *srp_verifier,
+ char *srp_usersalt, const char *g, const char *N,
+ const char *passin, BIO *bio, int verbose)
+ {
+ char password[1024];
+ PW_CB_DATA cb_tmp;
+ char *verifier = NULL;
+ char *gNid = NULL;
+
+ cb_tmp.prompt_info = user;
+ cb_tmp.password = passin;
+
+ if (password_callback(password, 1024, 0, &cb_tmp) >0)
+ {
+ VERBOSE BIO_printf(bio,"Validating\n user=\"%s\"\n srp_verifier=\"%s\"\n srp_usersalt=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,srp_verifier,srp_usersalt, g, N);
+ BIO_printf(bio, "Pass %s\n", password);
+
+ if (!(gNid=SRP_create_verifier(user, password, &srp_usersalt, &verifier, N, g)))
+ {
+ BIO_printf(bio, "Internal error validating SRP verifier\n");
+ }
+ else
+ {
+ if (strcmp(verifier, srp_verifier))
+ gNid = NULL;
+ OPENSSL_free(verifier);
+ }
+ }
+ return gNid;
+ }
+
+static char *srp_create_user(char *user, char **srp_verifier,
+ char **srp_usersalt, char *g, char *N,
+ char *passout, BIO *bio, int verbose)
+ {
+ char password[1024];
+ PW_CB_DATA cb_tmp;
+ char *gNid = NULL;
+ char *salt = NULL;
+ cb_tmp.prompt_info = user;
+ cb_tmp.password = passout;
+
+ if (password_callback(password,1024,1,&cb_tmp) >0)
+ {
+ VERBOSE BIO_printf(bio,"Creating\n user=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,g,N);
+ if (!(gNid =SRP_create_verifier(user, password, &salt, srp_verifier, N, g)))
+ {
+ BIO_printf(bio,"Internal error creating SRP verifier\n");
+ }
+ else
+ *srp_usersalt = salt;
+ VVERBOSE BIO_printf(bio,"gNid=%s salt =\"%s\"\n verifier =\"%s\"\n", gNid,salt, *srp_verifier);
+
+ }
+ return gNid;
+ }
+
+int MAIN(int argc, char **argv)
+ {
+ int add_user = 0;
+ int list_user= 0;
+ int delete_user= 0;
+ int modify_user= 0;
+ char * user = NULL;
+
+ char *passargin = NULL, *passargout = NULL;
+ char *passin = NULL, *passout = NULL;
+ char * gN = NULL;
+ int gNindex = -1;
+ char ** gNrow = NULL;
+ int maxgN = -1;
+
+ char * userinfo = NULL;
+
+ int badops=0;
+ int ret=1;
+ int errors=0;
+ int verbose=0;
+ int doupdatedb=0;
+ char *configfile=NULL;
+ char *dbfile=NULL;
+ CA_DB *db=NULL;
+ char **pp ;
+ int i;
+ long errorline = -1;
+ char *randfile=NULL;
+#ifndef OPENSSL_NO_ENGINE
+ char *engine = NULL;
+#endif
+ char *tofree=NULL;
+ DB_ATTR db_attr;
+
+#ifdef EFENCE
+EF_PROTECT_FREE=1;
+EF_PROTECT_BELOW=1;
+EF_ALIGNMENT=0;
+#endif
+
+ apps_startup();
+
+ conf = NULL;
+ section = NULL;
+
+ if (bio_err == NULL)
+ if ((bio_err=BIO_new(BIO_s_file())) != NULL)
+ BIO_set_fp(bio_err,stderr,BIO_NOCLOSE|BIO_FP_TEXT);
+
+ argc--;
+ argv++;
+ while (argc >= 1 && badops == 0)
+ {
+ if (strcmp(*argv,"-verbose") == 0)
+ verbose++;
+ else if (strcmp(*argv,"-config") == 0)
+ {
+ if (--argc < 1) goto bad;
+ configfile= *(++argv);
+ }
+ else if (strcmp(*argv,"-name") == 0)
+ {
+ if (--argc < 1) goto bad;
+ section= *(++argv);
+ }
+ else if (strcmp(*argv,"-srpvfile") == 0)
+ {
+ if (--argc < 1) goto bad;
+ dbfile= *(++argv);
+ }
+ else if (strcmp(*argv,"-add") == 0)
+ add_user=1;
+ else if (strcmp(*argv,"-delete") == 0)
+ delete_user=1;
+ else if (strcmp(*argv,"-modify") == 0)
+ modify_user=1;
+ else if (strcmp(*argv,"-list") == 0)
+ list_user=1;
+ else if (strcmp(*argv,"-gn") == 0)
+ {
+ if (--argc < 1) goto bad;
+ gN= *(++argv);
+ }
+ else if (strcmp(*argv,"-userinfo") == 0)
+ {
+ if (--argc < 1) goto bad;
+ userinfo= *(++argv);
+ }
+ else if (strcmp(*argv,"-passin") == 0)
+ {
+ if (--argc < 1) goto bad;
+ passargin= *(++argv);
+ }
+ else if (strcmp(*argv,"-passout") == 0)
+ {
+ if (--argc < 1) goto bad;
+ passargout= *(++argv);
+ }
+#ifndef OPENSSL_NO_ENGINE
+ else if (strcmp(*argv,"-engine") == 0)
+ {
+ if (--argc < 1) goto bad;
+ engine= *(++argv);
+ }
+#endif
+
+ else if (**argv == '-')
+ {
+bad:
+ BIO_printf(bio_err,"unknown option %s\n",*argv);
+ badops=1;
+ break;
+ }
+ else
+ break;
+
+ argc--;
+ argv++;
+ }
+
+ if (dbfile && configfile)
+ {
+ BIO_printf(bio_err,"-dbfile and -configfile cannot be specified together.\n");
+ badops = 1;
+ }
+ if (add_user+delete_user+modify_user+list_user != 1)
+ {
+ BIO_printf(bio_err,"Exactly one of the options -add, -delete, -modify -list must be specified.\n");
+ badops = 1;
+ }
+ if (add_user+delete_user+modify_user == 1 && argc <= 0)
+ {
+ BIO_printf(bio_err,"Need at least one user for options -add, -delete, -modify. \n");
+ badops = 1;
+ }
+ if ((passin || passout) && argc != 1 )
+ {
+ BIO_printf(bio_err,"-passin, -passout arguments only valid with one user.\n");
+ badops = 1;
+ }
+
+ if (badops)
+ {
+ for (pp=srp_usage; (*pp != NULL); pp++)
+ BIO_printf(bio_err,"%s",*pp);
+
+ BIO_printf(bio_err," -rand file%cfile%c...\n", LIST_SEPARATOR_CHAR, LIST_SEPARATOR_CHAR);
+ BIO_printf(bio_err," load the file (or the files in the directory) into\n");
+ BIO_printf(bio_err," the random number generator\n");
+ goto err;
+ }
+
+ ERR_load_crypto_strings();
+
+#ifndef OPENSSL_NO_ENGINE
+ setup_engine(bio_err, engine, 0);
+#endif
+
+ if(!app_passwd(bio_err, passargin, passargout, &passin, &passout))
+ {
+ BIO_printf(bio_err, "Error getting passwords\n");
+ goto err;
+ }
+
+ if (!dbfile)
+ {
+
+
+ /*****************************************************************/
+ tofree=NULL;
+ if (configfile == NULL) configfile = getenv("OPENSSL_CONF");
+ if (configfile == NULL) configfile = getenv("SSLEAY_CONF");
+ if (configfile == NULL)
+ {
+ const char *s=X509_get_default_cert_area();
+ size_t len;
+
+#ifdef OPENSSL_SYS_VMS
+ len = strlen(s)+sizeof(CONFIG_FILE);
+ tofree=OPENSSL_malloc(len);
+ strcpy(tofree,s);
+#else
+ len = strlen(s)+sizeof(CONFIG_FILE)+1;
+ tofree=OPENSSL_malloc(len);
+ BUF_strlcpy(tofree,s,len);
+ BUF_strlcat(tofree,"/",len);
+#endif
+ BUF_strlcat(tofree,CONFIG_FILE,len);
+ configfile=tofree;
+ }
+
+ VERBOSE BIO_printf(bio_err,"Using configuration from %s\n",configfile);
+ conf = NCONF_new(NULL);
+ if (NCONF_load(conf,configfile,&errorline) <= 0)
+ {
+ if (errorline <= 0)
+ BIO_printf(bio_err,"error loading the config file '%s'\n",
+ configfile);
+ else
+ BIO_printf(bio_err,"error on line %ld of config file '%s'\n"
+ ,errorline,configfile);
+ goto err;
+ }
+ if(tofree)
+ {
+ OPENSSL_free(tofree);
+ tofree = NULL;
+ }
+
+ if (!load_config(bio_err, conf))
+ goto err;
+
+ /* Lets get the config section we are using */
+ if (section == NULL)
+ {
+ VERBOSE BIO_printf(bio_err,"trying to read " ENV_DEFAULT_SRP " in \" BASE_SECTION \"\n");
+
+ section=NCONF_get_string(conf,BASE_SECTION,ENV_DEFAULT_SRP);
+ if (section == NULL)
+ {
+ lookup_fail(BASE_SECTION,ENV_DEFAULT_SRP);
+ goto err;
+ }
+ }
+
+ if (randfile == NULL && conf)
+ randfile = NCONF_get_string(conf, BASE_SECTION, "RANDFILE");
+
+
+ VERBOSE BIO_printf(bio_err,"trying to read " ENV_DATABASE " in section \"%s\"\n",section);
+
+ if ((dbfile=NCONF_get_string(conf,section,ENV_DATABASE)) == NULL)
+ {
+ lookup_fail(section,ENV_DATABASE);
+ goto err;
+ }
+
+ }
+ if (randfile == NULL)
+ ERR_clear_error();
+ else
+ app_RAND_load_file(randfile, bio_err, 0);
+
+ VERBOSE BIO_printf(bio_err,"Trying to read SRP verifier file \"%s\"\n",dbfile);
+
+ db = load_index(dbfile, &db_attr);
+ if (db == NULL) goto err;
+
+ /* Lets check some fields */
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data, i);
+
+ if (pp[DB_srptype][0] == DB_SRP_INDEX)
+ {
+ maxgN = i;
+ if (gNindex < 0 && gN != NULL && !strcmp(gN, pp[DB_srpid]))
+ gNindex = i;
+
+ print_index(db, bio_err, i, verbose > 1);
+ }
+ }
+
+ VERBOSE BIO_printf(bio_err, "Database initialised\n");
+
+ if (gNindex >= 0)
+ {
+ gNrow = sk_OPENSSL_PSTRING_value(db->db->data,gNindex);
+ print_entry(db, bio_err, gNindex, verbose > 1, "Default g and N");
+ }
+ else if (maxgN > 0 && !SRP_get_default_gN(gN))
+ {
+ BIO_printf(bio_err, "No g and N value for index \"%s\"\n", gN);
+ goto err;
+ }
+ else
+ {
+ VERBOSE BIO_printf(bio_err, "Database has no g N information.\n");
+ gNrow = NULL;
+ }
+
+
+ VVERBOSE BIO_printf(bio_err,"Starting user processing\n");
+
+ if (argc > 0)
+ user = *(argv++) ;
+
+ while (list_user || user)
+ {
+ int userindex = -1;
+ if (user)
+ VVERBOSE BIO_printf(bio_err, "Processing user \"%s\"\n", user);
+ if ((userindex = get_index(db, user, 'U')) >= 0)
+ {
+ print_user(db, bio_err, userindex, (verbose > 0) || list_user);
+ }
+
+ if (list_user)
+ {
+ if (user == NULL)
+ {
+ BIO_printf(bio_err,"List all users\n");
+
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ print_user(db,bio_err, i, 1);
+ }
+ list_user = 0;
+ }
+ else if (userindex < 0)
+ {
+ BIO_printf(bio_err, "user \"%s\" does not exist, ignored. t\n",
+ user);
+ errors++;
+ }
+ }
+ else if (add_user)
+ {
+ if (userindex >= 0)
+ {
+ /* reactivation of a new user */
+ char **row = sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+ BIO_printf(bio_err, "user \"%s\" reactivated.\n", user);
+ row[DB_srptype][0] = 'V';
+
+ doupdatedb = 1;
+ }
+ else
+ {
+ char *row[DB_NUMBER] ; char *gNid;
+ row[DB_srpverifier] = NULL;
+ row[DB_srpsalt] = NULL;
+ row[DB_srpinfo] = NULL;
+ if (!(gNid = srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:gN,gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose)))
+ {
+ BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned .\n", user);
+ errors++;
+ goto err;
+ }
+ row[DB_srpid] = BUF_strdup(user);
+ row[DB_srptype] = BUF_strdup("v");
+ row[DB_srpgN] = BUF_strdup(gNid);
+
+ if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] ||
+ (userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo)))) ||
+ !update_index(db, bio_err, row))
+ {
+ if (row[DB_srpid]) OPENSSL_free(row[DB_srpid]);
+ if (row[DB_srpgN]) OPENSSL_free(row[DB_srpgN]);
+ if (row[DB_srpinfo]) OPENSSL_free(row[DB_srpinfo]);
+ if (row[DB_srptype]) OPENSSL_free(row[DB_srptype]);
+ if (row[DB_srpverifier]) OPENSSL_free(row[DB_srpverifier]);
+ if (row[DB_srpsalt]) OPENSSL_free(row[DB_srpsalt]);
+ goto err;
+ }
+ doupdatedb = 1;
+ }
+ }
+ else if (modify_user)
+ {
+ if (userindex < 0)
+ {
+ BIO_printf(bio_err,"user \"%s\" does not exist, operation ignored.\n",user);
+ errors++;
+ }
+ else
+ {
+
+ char **row = sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+ char type = row[DB_srptype][0];
+ if (type == 'v')
+ {
+ BIO_printf(bio_err,"user \"%s\" already updated, operation ignored.\n",user);
+ errors++;
+ }
+ else
+ {
+ char *gNid;
+
+ if (row[DB_srptype][0] == 'V')
+ {
+ int user_gN;
+ char **irow = NULL;
+ VERBOSE BIO_printf(bio_err,"Verifying password for user \"%s\"\n",user);
+ if ( (user_gN = get_index(db, row[DB_srpgN], DB_SRP_INDEX)) >= 0)
+ irow = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+
+ if (!srp_verify_user(user, row[DB_srpverifier], row[DB_srpsalt], irow ? irow[DB_srpsalt] : row[DB_srpgN], irow ? irow[DB_srpverifier] : NULL, passin, bio_err, verbose))
+ {
+ BIO_printf(bio_err, "Invalid password for user \"%s\", operation abandoned.\n", user);
+ errors++;
+ goto err;
+ }
+ }
+ VERBOSE BIO_printf(bio_err,"Password for user \"%s\" ok.\n",user);
+
+ if (!(gNid=srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:NULL, gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose)))
+ {
+ BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned.\n", user);
+ errors++;
+ goto err;
+ }
+
+ row[DB_srptype][0] = 'v';
+ row[DB_srpgN] = BUF_strdup(gNid);
+
+ if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] ||
+ (userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo)))))
+ goto err;
+
+ doupdatedb = 1;
+ }
+ }
+ }
+ else if (delete_user)
+ {
+ if (userindex < 0)
+ {
+ BIO_printf(bio_err, "user \"%s\" does not exist, operation ignored. t\n", user);
+ errors++;
+ }
+ else
+ {
+ char **xpp = sk_OPENSSL_PSTRING_value(db->db->data,userindex);
+ BIO_printf(bio_err, "user \"%s\" revoked. t\n", user);
+
+ xpp[DB_srptype][0] = 'R';
+
+ doupdatedb = 1;
+ }
+ }
+ if (--argc > 0)
+ user = *(argv++) ;
+ else
+ {
+ user = NULL;
+ list_user = 0;
+ }
+ }
+
+ VERBOSE BIO_printf(bio_err,"User procession done.\n");
+
+
+ if (doupdatedb)
+ {
+ /* Lets check some fields */
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data,i);
+
+ if (pp[DB_srptype][0] == 'v')
+ {
+ pp[DB_srptype][0] = 'V';
+ print_user(db, bio_err, i, verbose);
+ }
+ }
+
+ VERBOSE BIO_printf(bio_err, "Trying to update srpvfile.\n");
+ if (!save_index(dbfile, "new", db)) goto err;
+
+ VERBOSE BIO_printf(bio_err, "Temporary srpvfile created.\n");
+ if (!rotate_index(dbfile, "new", "old")) goto err;
+
+ VERBOSE BIO_printf(bio_err, "srpvfile updated.\n");
+ }
+
+ ret = (errors != 0);
+err:
+ if (errors != 0)
+ VERBOSE BIO_printf(bio_err,"User errors %d.\n",errors);
+
+ VERBOSE BIO_printf(bio_err,"SRP terminating with code %d.\n",ret);
+ if(tofree)
+ OPENSSL_free(tofree);
+ if (ret) ERR_print_errors(bio_err);
+ if (randfile) app_RAND_write_file(randfile, bio_err);
+ if (conf) NCONF_free(conf);
+ if (db) free_index(db);
+
+ OBJ_cleanup();
+ apps_shutdown();
+ OPENSSL_EXIT(ret);
+ }
+
+
+
+#endif
+
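
The new srp app added above maintains the SRP verifier file consumed by
the TLS-SRP cipher suites. A possible session, assuming an existing
(possibly empty) verifier file passwd.srpv; the file name and user are
examples only:

  $ openssl srp -srpvfile passwd.srpv -gn 1024 -add alice
  $ openssl srp -srpvfile passwd.srpv -list alice
  $ openssl srp -srpvfile passwd.srpv -delete alice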
diff --git a/apps/verify.c b/apps/verify.c
index 9163997..893670f 100644
--- a/apps/verify.c
+++ b/apps/verify.c
@@ -222,25 +222,37 @@ int MAIN(int argc, char **argv)
goto end;
}
- if (argc < 1) check(cert_ctx, NULL, untrusted, trusted, crls, e);
+ ret = 0;
+ if (argc < 1)
+ {
+ if (1 != check(cert_ctx, NULL, untrusted, trusted, crls, e))
+ ret = -1;
+ }
else
+ {
+ for (i=0; i<argc; i++)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
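
With the verify change above, the exit status now tracks the
verification result instead of always being 0, so scripts can branch on
it; the certificate name below is an example only:

  $ openssl verify untrusted-cert.pem || echo "verification failed"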
diff --git a/check-all-builds.sh b/check-all-builds.sh
new file mode 100755
index 0000000..98dc391
--- /dev/null
+++ b/check-all-builds.sh
@@ -0,0 +1,628 @@
+#!/bin/sh
+#
+
+set -e
+export LANG=C
+export LC_ALL=C
+
+PROGDIR=$(dirname "$0")
+PROGNAME=$(basename "$0")
+
+panic () {
+ echo "ERROR: $@"
+ exit 1
+}
+
+VERBOSE=1
+
+# Dump message if $VERBOSE >= $1.
+# $1+: message.
+dump_n () {
+ local LOG_LEVEL=$1
+ shift
+ if [ "$VERBOSE" -ge "$LOG_LEVEL" ]; then
+ printf "%s\n" "$@"
+ fi
+}
+
+# Dump a message unless --quiet is used.
+# $1+: message.
+dump () {
+ dump_n 1 "$@"
+}
+
+# Dump a message if --verbose is used only.
+# $1+: message.
+log () {
+ dump_n 2 "$@"
+}
+
+# Run a command silently, unless --verbose or '--verbose --verbose'
+# is used.
+# $1+: Command
+# Return: command status.
+run () {
+ log "COMMAND: $*"
+ case $VERBOSE in
+ 0)
+ "$@" >/dev/null 2>&1 || return $?
+ ;;
+ 1)
+ "$@" >/dev/null || return $?
+ ;;
+ *)
+ "$@" || return $?
+ ;;
+ esac
+}
+
+# $1: string
+# Out: input string, with capital letters replaced by small ones.
+tolower () {
+ echo "$1" | tr '[A-Z]' '[a-z]'
+}
+
+# Return value of a given variable.
+# $1: Variable name
+var_value () {
+ eval printf \"%s\" \"\$$1\"
+}
+
+# Remove some items from a list
+# $1: input space-separated list
+# $2: space-separated list of items to remove from 1
+# Out: items of $1 without items of $2
+filter_out () {
+ local TMP=$(mktemp)
+ local RESULT
+ printf "" > $TMP
+ echo "$2" | tr ' ' '\n' > $TMP
+ RESULT=$(echo "$1" | tr ' ' '\n' | fgrep -x -v -f $TMP | tr '\n' ' ')
+ rm -f $TMP
+ echo "$RESULT"
+}
+
+src_to_obj () {
+ case $1 in
+ *.c)
+ echo ${1%%.c}.o
+ ;;
+ *.S)
+ echo ${1%%.S}.o
+ ;;
+ *)
+ echo $1
+ ;;
+ esac
+}
+
+# Determine host operating system.
+HOST_OS=$(uname -s)
+case $HOST_OS in
+ Linux)
+ HOST_OS=linux
+ ;;
+ Darwin)
+ HOST_OS=darwin
+ ;;
+esac
+
+# Determine host architecture
+HOST_ARCH=$(uname -m)
+case $HOST_ARCH in
+ i?86)
+ HOST_ARCH=x86
+ ;;
+esac
+
+ANDROID_HOST_TAG=$HOST_OS-$HOST_ARCH
+
+case $ANDROID_HOST_TAG in
+ linux-x86_64|darwin-x86_64)
+ ANDROID_HOST_TAG=$HOST_OS-x86
+ ;;
+ *)
+ panic "Sorry, this script can only run on 64-bit Linux or Darwin"
+esac
+
+# Determine number of cores
+case $HOST_OS in
+ linux)
+ NUM_CORES=$(grep -c "processor" /proc/cpuinfo)
+ ;;
+ darwin)
+ NUM_CORES=$(sysctl -n hw.ncpu)
+ ;;
+ *)
+ NUM_CORES=1
+ ;;
+esac
+
+# The list of supported Android target architectures.
+ANDROID_ARCHS="arm x86 mips"
+
+BUILD_TYPES=
+for ARCH in $ANDROID_ARCHS; do
+ BUILD_TYPES="$BUILD_TYPES android-$ARCH"
+done
+ANDROID_BUILD_TYPES=$BUILD_TYPES
+
+# NOTE: The $HOST_OS-x86_64 build is currently broken because the single
+# generated configuration header is tailored for 32-bit builds.
+HOST_BUILD_TYPES="$HOST_OS-x86 $HOST_OS-generic32 $HOST_OS-generic64"
+
+BUILD_TYPES="$ANDROID_BUILD_TYPES $HOST_BUILD_TYPES"
+
+# Parse command-line
+DO_HELP=
+SRC_DIR=$(cd $PROGDIR && pwd)
+OUT_DIR=out
+BUILD_DIR=
+BUILD_TYPES=
+NUM_JOBS=$NUM_CORES
+ANDROID_BUILD_TOP=$(cd $PROGDIR/../.. && pwd)
+for OPT; do
+ case $OPT in
+ --help|-h|-?)
+ DO_HELP=true
+ ;;
+ --build-dir=*)
+ BUILD_DIR=${OPT##--build-dir=}
+ ;;
+ --verbose)
+ VERBOSE=$(( $VERBOSE + 1 ))
+ ;;
+ --jobs=*)
+ NUM_JOBS=${OPT##--jobs=}
+ ;;
+ --quiet)
+ VERBOSE=$(( $VERBOSE - 1 ))
+ ;;
+ -j*)
+ NUM_JOBS=${OPT##-j}
+ ;;
+ -*)
+ panic "Unknown option '$OPT', see --help for details."
+ ;;
+ *)
+ BUILD_TYPES="$BUILD_TYPES $OPT"
+ ;;
+ esac
+done
+
+# Print help when needed.
+if [ "$DO_HELP" ]; then
+ echo \
+"Usage: $PROGNAME [options] [ ...]
+
+This script is used to ensure that all OpenSSL build variants compile
+properly. It can be used after modifying external/openssl/openssl.config
+and re-running import_openssl.sh to check that any changes didn't break
+the build.
+
+A <build-type> is a description of a given build of the library and its
+program. Its format is:
+
+   <compiler>-<os>-<arch>
+
+Where: <compiler> is either 'gcc' or 'clang'.
+       <os>       is 'android', 'linux' or 'darwin'.
+       <arch>     is 'arm', 'x86' or 'mips'.
+
+By default, it rebuilds the sources for the following build types:
+"
+ for BUILD_TYPE in $BUILD_TYPES; do
+ echo " $BUILD_TYPE"
+ done
+
+ echo \
+"However, you can pass custom values on the command-line instead.
+
+This script generates a custom Makefile in a temporary directory, then
+launches 'make' in it to build all binaries in parallel. In case of
+problems, you can use the --build-dir=<path> option to specify a custom
+build directory, which will _not_ be removed when the script exits.
+
+For example, to better see why a build fails:
+
+ ./$PROGNAME --build-dir=/tmp/mydir
+ make -C /tmp/mydir V=1
+
+Valid options:
+
+ --help|-h|-? Print this message.
+  --build-dir=<path>  Specify build directory.
+  --jobs=<count>      Run <count> parallel build jobs [$NUM_JOBS].
+  -j<count>           Same as --jobs=<count>.
+ --verbose Increase verbosity.
+ --quiet Decrease verbosity.
+"
+ exit 0
+fi
+
+log "Host OS: $HOST_OS"
+log "Host arch: $HOST_ARCH"
+log "Host CPU count: $NUM_CORES"
+
+if [ -z "$BUILD_TYPES" ]; then
+ BUILD_TYPES="$ANDROID_BUILD_TYPES $HOST_BUILD_TYPES"
+fi
+log "Build types: $BUILD_TYPES"
+
+if [ -z "$BUILD_DIR" ]; then
+ # Create a temporary directory, ensure it gets destroyed properly
+ # when the script exits.
+ BUILD_DIR=$(mktemp -d)
+ clean_build_dir () {
+ log "Cleaning up temporary directory: $BUILD_DIR"
+ rm -rf "$BUILD_DIR"
+ exit $1
+ }
+ trap "clean_build_dir 0" EXIT
+ trap "clean_build_dir \$?" INT HUP QUIT TERM
+ log "Using temporary build directory: $BUILD_DIR"
+else
+ log "Using user build directory: $BUILD_DIR"
+fi
+
+mkdir -p "$BUILD_DIR" && rm -rf "$BUILD_DIR"/*
+
+MAKEFILE=$BUILD_DIR/GNUmakefile
+
+# Return source files for a given module and architecture.
+# $1: module prefix (e.g. CRYPTO)
+# $2: build arch.
+get_module_src_files_for_arch () {
+ local prefix=$1
+ local arch=$2
+ local src_files="$(var_value OPENSSL_${prefix}_SOURCES)"
+ src_files="$src_files $(var_value OPENSSL_${prefix}_SOURCES_${arch})"
+ local exclude_files="$(var_value OPENSSL_${prefix}_SOURCES_EXCLUDES_${arch})"
+ src_files=$(filter_out "$src_files" "$exclude_files")
+ echo "$src_files"
+}
+
+# Return the compiler defines for a given module and architecture
+# $1: module prefix (e.g. CRYPTO)
+# $2: build arch.
+get_module_defines_for_arch () {
+ local prefix=$1
+ local arch=$2
+ local defines="$(var_value OPENSSL_${prefix}_DEFINES)"
+ defines="$defines $(var_value OPENSSL_${prefix}_DEFINES_${arch})"
+ echo "$defines"
+}
+
+# $1: module prefix (e.g. CRYPTO)
+get_module_c_includes () {
+ var_value OPENSSL_$1_INCLUDES
+}
+
+# $1: build type (e.g. gcc-android-arm)
+# Out: build arch.
+get_build_arch () {
+ echo "$1" | cut -d- -f3
+}
+
+# $1: build arch
+# Out: GNU configuration target (e.g. arm-linux-androideabi)
+get_build_arch_target () {
+ case $1 in
+ arm)
+ echo "arm-linux-androideabi"
+ ;;
+ x86)
+ echo "i686-linux-android"
+ ;;
+ mips)
+ echo "mipsel-linux-android"
+ ;;
+ *)
+ echo "$1-linux-android"
+ ;;
+ esac
+}
+
+GCC_VERSION=4.7
+CLANG_VERSION=3.1
+
+get_prebuilt_gcc_dir_for_arch () {
+ local arch=$1
+ local target=$(get_build_arch_target $arch)
+ echo "$ANDROID_BUILD_TOP/prebuilts/gcc/$ANDROID_HOST_TAG/$arch/$target-$GCC_VERSION"
+}
+
+get_prebuilt_clang () {
+ echo "$ANDROID_BUILD_TOP/prebuilts/clang/$ANDROID_HOST_TAG/$CLANG_VERSION/clang"
+}
+
+get_prebuilt_ndk_sysroot_for_arch () {
+ echo "$ANDROID_BUILD_TOP/prebuilts/ndk/current/platforms/android-9/arch-$1"
+}
+
+get_c_runtime_file () {
+ local build_type=$1
+ local arch=$(get_build_arch $build_type)
+ local filename=$2
+ echo "$(get_prebuilt_ndk_sysroot_for_arch $arch)/usr/lib/$filename"
+}
+
+# $1: build type (e.g. gcc-android-arm)
+get_build_compiler () {
+ local arch=$(get_build_arch $1)
+ local target=$(get_build_arch_target $arch)
+ local gcc_dir=$(get_prebuilt_gcc_dir_for_arch $arch);
+ local result
+
+ # Get the toolchain binary.
+ case $1 in
+ gcc-android-*)
+ result="$gcc_dir/bin/$target-gcc"
+ ;;
+ clang-android-*)
+ result="$(get_prebuilt_clang) -target $target -B$gcc_dir/$target/bin -I$gcc_dir/lib/gcc/$target/$GCC_VERSION/include"
+ ;;
+ gcc-*)
+ result=gcc
+ ;;
+ clang-*) # Must have host clang compiler.
+ result=clang
+ ;;
+ esac
+
+ compiler_check=$(which $result 2>/dev/null || echo "")
+ if [ -z "$compiler_check" ]; then
+ panic "Could not find compiler: $result"
+ fi
+
+ # Get the Android sysroot if needed.
+ case $1 in
+ *-android-*)
+ result="$result --sysroot=$(get_prebuilt_ndk_sysroot_for_arch $arch)"
+ ;;
+ esac
+
+ # Force -m32 flag when needed for 32-bit builds.
+ case $1 in
+ *-linux-x86|*-darwin-x86|*-generic32)
+ result="$result -m32"
+ ;;
+ esac
+ echo "$result"
+}
+
+# $1: build type.
+# Out: common compiler flags for this build.
+get_build_c_flags () {
+ local result="-O2 -fPIC"
+ case $1 in
+ *-android-arm)
+ result="$result -march=armv7-a -mfpu=vfpv3-d16"
+ ;;
+ esac
+
+ case $1 in
+ *-generic32|*-generic64)
+ # Generic builds do not compile without this flag.
+ result="$result -DOPENSSL_NO_ASM"
+ ;;
+ esac
+ echo "$result"
+}
+
+# $1: build type.
+# Out: linker for this build.
+get_build_linker () {
+ get_build_compiler $1
+}
+
+clear_sources () {
+ g_all_objs=""
+}
+
+# Generate build instructions to compile source files.
+# Also update g_all_objs.
+# $1: module prefix (e.g. CRYPTO)
+# $2: build type
+build_sources () {
+ local prefix=$1
+ local build_type=$2
+ echo "## build_sources prefix='$prefix' build_type='$build_type'"
+ local arch=$(get_build_arch $build_type)
+ local src_files=$(get_module_src_files_for_arch $prefix $arch)
+ local c_defines=$(get_module_defines_for_arch $prefix $arch)
+ local c_includes=$(get_module_c_includes $prefix "$SRC_DIR")
+ local build_cc=$(get_build_compiler $build_type)
+ local build_cflags=$(get_build_c_flags $build_type)
+ local build_linker=$(get_build_linker $build_type)
+ local src obj def inc
+
+ printf "OUT_DIR := $OUT_DIR/$build_type\n\n"
+ printf "BUILD_CC := $build_cc\n\n"
+ printf "BUILD_LINKER := $build_linker\n\n"
+ printf "BUILD_CFLAGS := $build_cflags"
+ for inc in $c_includes; do
+ printf " -I\$(SRC_DIR)/$inc"
+ done
+ for def in $c_defines; do
+ printf " -D$def"
+ done
+ printf "\n\n"
+ printf "BUILD_OBJECTS :=\n\n"
+
+ case $build_type in
+ clang-android-*)
+ # The version of clang that comes with the platform build doesn't
+ # support simple linking of shared libraries and executables. One
+ # has to provide the C runtime files explicitly.
+ local crtbegin_so=$(get_c_runtime_file $build_type crtbegin_so.o)
+ local crtend_so=$(get_c_runtime_file $build_type crtend_so.o)
+ local crtbegin_exe=$(get_c_runtime_file $build_type crtbegin_dynamic.o)
+ local crtend_exe=$(get_c_runtime_file $build_type crtend_android.o)
+ printf "CRTBEGIN_SO := $crtbegin_so\n"
+ printf "CRTEND_SO := $crtend_so\n"
+ printf "CRTBEGIN_EXE := $crtbegin_exe\n"
+ printf "CRTEND_EXE := $crtend_exe\n"
+ printf "\n"
+ ;;
+ esac
+
+ for src in $src_files; do
+ obj=$(src_to_obj $src)
+ g_all_objs="$g_all_objs $obj"
+ printf "OBJ := \$(OUT_DIR)/$obj\n"
+ printf "BUILD_OBJECTS += \$(OBJ)\n"
+ printf "\$(OBJ): PRIVATE_CC := \$(BUILD_CC)\n"
+ printf "\$(OBJ): PRIVATE_CFLAGS := \$(BUILD_CFLAGS)\n"
+ printf "\$(OBJ): \$(SRC_DIR)/$src\n"
+ printf "\t@echo [$build_type] CC $src\n"
+ printf "\t@mkdir -p \$\$(dirname \$@)\n"
+ printf "\t\$(hide) \$(PRIVATE_CC) \$(PRIVATE_CFLAGS) -c -o \$@ \$<\n"
+ printf "\n"
+ done
+ printf "\n"
+}
+
+# $1: library name (e.g. crypto).
+# $2: module prefix (e.g. CRYPTO).
+# $3: build type.
+# $4: source directory.
+# $5: output directory.
+build_shared_library () {
+ local name=$1
+ local prefix=$2
+ local build_type=$3
+ local src_dir="$4"
+ local out_dir="$5"
+ local shlib="lib${name}.so"
+ local build_linker=$(get_build_linker $build_type)
+ clear_sources
+ build_sources $prefix $build_type
+
+ # TODO(digit): Make the clang build link properly.
+ printf "SHLIB=\$(OUT_DIR)/$shlib\n"
+ printf "\$(SHLIB): PRIVATE_LINKER := \$(BUILD_LINKER)\n"
+ case $build_type in
+ clang-android-*)
+ printf "\$(SHLIB): PRIVATE_CRTBEGIN := \$(CRTBEGIN_SO)\n"
+ printf "\$(SHLIB): PRIVATE_CRTEND := \$(CRTEND_SO)\n"
+ ;;
+ esac
+ printf "\$(SHLIB): \$(BUILD_OBJECTS)\n"
+ printf "\t@echo [$build_type] SHARED_LIBRARY $(basename $shlib)\n"
+ printf "\t@mkdir -p \$\$(dirname \$@)\n"
+ case $build_type in
+ clang-android-*)
+ printf "\t\$(hide) \$(PRIVATE_LINKER) -nostdlib -shared -o \$@ \$(PRIVATE_CRTBEGIN) \$^ \$(PRIVATE_CRTEND)\n"
+ ;;
+ *)
+ printf "\t\$(hide) \$(PRIVATE_LINKER) -shared -o \$@ \$^\n"
+ ;;
+ esac
+ printf "\n"
+}
+
+# $1: executable name.
+# $2: module prefix (e.g. APPS).
+# $3: build type.
+# $4: source directory.
+# $5: output directory.
+# $6: dependent shared libraries (e.g. 'crypto ssl')
+build_executable () {
+ local name=$1
+ local prefix=$2
+ local build_type=$3
+ local src_dir="$4"
+ local out_dir="$5"
+ local shlibs="$6"
+ local build_linker=$(get_build_linker $build_type)
+ clear_sources
+ build_sources $prefix $build_type
+
+ # TODO(digit): Make the clang build link properly.
+ exec=$name
+ all_shlibs=
+ printf "EXEC := \$(OUT_DIR)/$name\n"
+ printf "openssl_all: \$(EXEC)\n"
+ printf "\$(EXEC): PRIVATE_LINKER := \$(BUILD_LINKER)\n"
+ printf "\$(EXEC): \$(BUILD_OBJECTS)"
+ for lib in $shlibs; do
+ printf " \$(OUT_DIR)/lib${lib}.so"
+ done
+ printf "\n"
+ printf "\t@echo [$build_type] EXECUTABLE $name\n"
+ printf "\t@mkdir -p \$\$(dirname \$@)\n"
+ printf "\t\$(hide) \$(PRIVATE_LINKER) -o \$@ \$^\n"
+ printf "\n"
+}
+
+ALL_BUILDS=
+
+generate_openssl_build () {
+ local build_type=$1
+ local out="$OUT_DIR/$build_type"
+ ALL_BUILDS="$ALL_BUILDS $build_type"
+ echo "# Build type: $build_type"
+ build_shared_library crypto CRYPTO $build_type "$SRC_DIR" "$out"
+ build_shared_library ssl SSL $build_type "$SRC_DIR" "$out"
+ build_executable openssl APPS $build_type "$SRC_DIR" "$out" "crypto ssl"
+}
+
+generate_makefile () {
+ echo \
+"# Auto-generated by $PROGDIR - do not edit
+
+.PHONY: openssl_all
+
+all: openssl_all
+
+# Use 'make V=1' to print build commands.
+ifeq (1,\$(V))
+hide :=
+else
+hide := @
+endif
+
+SRC_DIR=$SRC_DIR
+OUT_DIR=$OUT_DIR
+"
+
+ for BUILD_TYPE in $BUILD_TYPES; do
+ generate_openssl_build gcc-$BUILD_TYPE
+ done
+
+# TODO(digit): Make the Clang build run.
+# for BUILD_TYPE in $ANDROID_BUILD_TYPES; do
+# generate_openssl_build clang-$BUILD_TYPE
+# done
+}
+
+. $SRC_DIR/openssl.config
+
+
+
+dump "Generating Makefile"
+log "Makefile path: $MAKEFILE"
+generate_makefile > $MAKEFILE
+
+dump "Building libraries with $NUM_JOBS jobs"
+dump "For the following builds:"
+for BUILD in $ALL_BUILDS; do
+ dump " $BUILD"
+done
+MAKE_FLAGS="-j$NUM_JOBS"
+if [ "$VERBOSE" -gt 2 ]; then
+ MAKE_FLAGS="$MAKE_FLAGS V=1"
+fi
+run make $MAKE_FLAGS -f "$MAKEFILE" -C "$BUILD_DIR"
+case $? in
+ 0)
+ dump "All OK, congratulations!"
+ ;;
+ *)
+ dump "Error, try doing the following to inspect the issues:"
+ dump " $PROGNAME --build-dir=/tmp/mybuild"
+ dump " make -C /tmp/mybuild V=1"
+ dump ""
+ ;;
+esac
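
Typical invocations of check-all-builds.sh, following its own help text:

  $ ./check-all-builds.sh --jobs=8 android-arm linux-generic32
  $ ./check-all-builds.sh --build-dir=/tmp/mydir
  $ make -C /tmp/mydir V=1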
diff --git a/crypto/Android.mk b/crypto/Android.mk
deleted file mode 100644
index a5bfe3c..0000000
--- a/crypto/Android.mk
+++ /dev/null
@@ -1,589 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-
-arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
-mips_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM
-
-arm_src_files := \
- aes/asm/aes-armv4.s \
- bn/asm/armv4-mont.s \
- bn/bn_asm.c \
- sha/asm/sha1-armv4-large.s \
- sha/asm/sha256-armv4.s \
- sha/asm/sha512-armv4.s
-
-mips_src_files := \
- aes/asm/aes-mips.s \
- bn/asm/bn-mips.s \
- bn/asm/mips-mont.s \
- sha/asm/sha1-mips.s \
- sha/asm/sha256-mips.s
-
-other_arch_src_files := \
- aes/aes_core.c \
- bn/bn_asm.c
-
-local_src_files := \
- cryptlib.c \
- mem.c \
- mem_clr.c \
- mem_dbg.c \
- cversion.c \
- ex_data.c \
- cpt_err.c \
- ebcdic.c \
- uid.c \
- o_time.c \
- o_str.c \
- o_dir.c \
- aes/aes_cbc.c \
- aes/aes_cfb.c \
- aes/aes_ctr.c \
- aes/aes_ecb.c \
- aes/aes_misc.c \
- aes/aes_ofb.c \
- aes/aes_wrap.c \
- asn1/a_bitstr.c \
- asn1/a_bool.c \
- asn1/a_bytes.c \
- asn1/a_d2i_fp.c \
- asn1/a_digest.c \
- asn1/a_dup.c \
- asn1/a_enum.c \
- asn1/a_gentm.c \
- asn1/a_i2d_fp.c \
- asn1/a_int.c \
- asn1/a_mbstr.c \
- asn1/a_object.c \
- asn1/a_octet.c \
- asn1/a_print.c \
- asn1/a_set.c \
- asn1/a_sign.c \
- asn1/a_strex.c \
- asn1/a_strnid.c \
- asn1/a_time.c \
- asn1/a_type.c \
- asn1/a_utctm.c \
- asn1/a_utf8.c \
- asn1/a_verify.c \
- asn1/ameth_lib.c \
- asn1/asn1_err.c \
- asn1/asn1_gen.c \
- asn1/asn1_lib.c \
- asn1/asn1_par.c \
- asn1/asn_mime.c \
- asn1/asn_moid.c \
- asn1/asn_pack.c \
- asn1/bio_asn1.c \
- asn1/bio_ndef.c \
- asn1/d2i_pr.c \
- asn1/d2i_pu.c \
- asn1/evp_asn1.c \
- asn1/f_enum.c \
- asn1/f_int.c \
- asn1/f_string.c \
- asn1/i2d_pr.c \
- asn1/i2d_pu.c \
- asn1/n_pkey.c \
- asn1/nsseq.c \
- asn1/p5_pbe.c \
- asn1/p5_pbev2.c \
- asn1/p8_pkey.c \
- asn1/t_bitst.c \
- asn1/t_crl.c \
- asn1/t_pkey.c \
- asn1/t_req.c \
- asn1/t_spki.c \
- asn1/t_x509.c \
- asn1/t_x509a.c \
- asn1/tasn_dec.c \
- asn1/tasn_enc.c \
- asn1/tasn_fre.c \
- asn1/tasn_new.c \
- asn1/tasn_prn.c \
- asn1/tasn_typ.c \
- asn1/tasn_utl.c \
- asn1/x_algor.c \
- asn1/x_attrib.c \
- asn1/x_bignum.c \
- asn1/x_crl.c \
- asn1/x_exten.c \
- asn1/x_info.c \
- asn1/x_long.c \
- asn1/x_name.c \
- asn1/x_nx509.c \
- asn1/x_pkey.c \
- asn1/x_pubkey.c \
- asn1/x_req.c \
- asn1/x_sig.c \
- asn1/x_spki.c \
- asn1/x_val.c \
- asn1/x_x509.c \
- asn1/x_x509a.c \
- bf/bf_cfb64.c \
- bf/bf_ecb.c \
- bf/bf_enc.c \
- bf/bf_ofb64.c \
- bf/bf_skey.c \
- bio/b_dump.c \
- bio/b_print.c \
- bio/b_sock.c \
- bio/bf_buff.c \
- bio/bf_nbio.c \
- bio/bf_null.c \
- bio/bio_cb.c \
- bio/bio_err.c \
- bio/bio_lib.c \
- bio/bss_acpt.c \
- bio/bss_bio.c \
- bio/bss_conn.c \
- bio/bss_dgram.c \
- bio/bss_fd.c \
- bio/bss_file.c \
- bio/bss_log.c \
- bio/bss_mem.c \
- bio/bss_null.c \
- bio/bss_sock.c \
- bn/bn_add.c \
- bn/bn_blind.c \
- bn/bn_const.c \
- bn/bn_ctx.c \
- bn/bn_div.c \
- bn/bn_err.c \
- bn/bn_exp.c \
- bn/bn_exp2.c \
- bn/bn_gcd.c \
- bn/bn_gf2m.c \
- bn/bn_kron.c \
- bn/bn_lib.c \
- bn/bn_mod.c \
- bn/bn_mont.c \
- bn/bn_mpi.c \
- bn/bn_mul.c \
- bn/bn_nist.c \
- bn/bn_prime.c \
- bn/bn_print.c \
- bn/bn_rand.c \
- bn/bn_recp.c \
- bn/bn_shift.c \
- bn/bn_sqr.c \
- bn/bn_sqrt.c \
- bn/bn_word.c \
- buffer/buf_err.c \
- buffer/buffer.c \
- comp/c_rle.c \
- comp/c_zlib.c \
- comp/comp_err.c \
- comp/comp_lib.c \
- conf/conf_api.c \
- conf/conf_def.c \
- conf/conf_err.c \
- conf/conf_lib.c \
- conf/conf_mall.c \
- conf/conf_mod.c \
- conf/conf_sap.c \
- des/cbc_cksm.c \
- des/cbc_enc.c \
- des/cfb64ede.c \
- des/cfb64enc.c \
- des/cfb_enc.c \
- des/des_enc.c \
- des/des_old.c \
- des/des_old2.c \
- des/ecb3_enc.c \
- des/ecb_enc.c \
- des/ede_cbcm_enc.c \
- des/enc_read.c \
- des/enc_writ.c \
- des/fcrypt.c \
- des/fcrypt_b.c \
- des/ofb64ede.c \
- des/ofb64enc.c \
- des/ofb_enc.c \
- des/pcbc_enc.c \
- des/qud_cksm.c \
- des/rand_key.c \
- des/read2pwd.c \
- des/rpc_enc.c \
- des/set_key.c \
- des/str2key.c \
- des/xcbc_enc.c \
- dh/dh_ameth.c \
- dh/dh_asn1.c \
- dh/dh_check.c \
- dh/dh_depr.c \
- dh/dh_err.c \
- dh/dh_gen.c \
- dh/dh_key.c \
- dh/dh_lib.c \
- dh/dh_pmeth.c \
- dsa/dsa_ameth.c \
- dsa/dsa_asn1.c \
- dsa/dsa_depr.c \
- dsa/dsa_err.c \
- dsa/dsa_gen.c \
- dsa/dsa_key.c \
- dsa/dsa_lib.c \
- dsa/dsa_ossl.c \
- dsa/dsa_pmeth.c \
- dsa/dsa_prn.c \
- dsa/dsa_sign.c \
- dsa/dsa_vrf.c \
- dso/dso_dl.c \
- dso/dso_dlfcn.c \
- dso/dso_err.c \
- dso/dso_lib.c \
- dso/dso_null.c \
- dso/dso_openssl.c \
- ec/ec2_mult.c \
- ec/ec2_smpl.c \
- ec/ec_ameth.c \
- ec/ec_asn1.c \
- ec/ec_check.c \
- ec/ec_curve.c \
- ec/ec_cvt.c \
- ec/ec_err.c \
- ec/ec_key.c \
- ec/ec_lib.c \
- ec/ec_mult.c \
- ec/ec_pmeth.c \
- ec/ec_print.c \
- ec/eck_prn.c \
- ec/ecp_mont.c \
- ec/ecp_nist.c \
- ec/ecp_smpl.c \
- ecdh/ech_err.c \
- ecdh/ech_key.c \
- ecdh/ech_lib.c \
- ecdh/ech_ossl.c \
- ecdsa/ecs_asn1.c \
- ecdsa/ecs_err.c \
- ecdsa/ecs_lib.c \
- ecdsa/ecs_ossl.c \
- ecdsa/ecs_sign.c \
- ecdsa/ecs_vrf.c \
- err/err.c \
- err/err_all.c \
- err/err_prn.c \
- evp/bio_b64.c \
- evp/bio_enc.c \
- evp/bio_md.c \
- evp/bio_ok.c \
- evp/c_all.c \
- evp/c_allc.c \
- evp/c_alld.c \
- evp/digest.c \
- evp/e_aes.c \
- evp/e_bf.c \
- evp/e_des.c \
- evp/e_des3.c \
- evp/e_null.c \
- evp/e_old.c \
- evp/e_rc2.c \
- evp/e_rc4.c \
- evp/e_rc5.c \
- evp/e_xcbc_d.c \
- evp/encode.c \
- evp/evp_acnf.c \
- evp/evp_enc.c \
- evp/evp_err.c \
- evp/evp_key.c \
- evp/evp_lib.c \
- evp/evp_pbe.c \
- evp/evp_pkey.c \
- evp/m_dss.c \
- evp/m_dss1.c \
- evp/m_ecdsa.c \
- evp/m_md4.c \
- evp/m_md5.c \
- evp/m_mdc2.c \
- evp/m_null.c \
- evp/m_ripemd.c \
- evp/m_sha1.c \
- evp/m_sigver.c \
- evp/m_wp.c \
- evp/names.c \
- evp/p5_crpt.c \
- evp/p5_crpt2.c \
- evp/p_dec.c \
- evp/p_enc.c \
- evp/p_lib.c \
- evp/p_open.c \
- evp/p_seal.c \
- evp/p_sign.c \
- evp/p_verify.c \
- evp/pmeth_fn.c \
- evp/pmeth_gn.c \
- evp/pmeth_lib.c \
- hmac/hm_ameth.c \
- hmac/hm_pmeth.c \
- hmac/hmac.c \
- krb5/krb5_asn.c \
- lhash/lh_stats.c \
- lhash/lhash.c \
- md4/md4_dgst.c \
- md4/md4_one.c \
- md5/md5_dgst.c \
- md5/md5_one.c \
- modes/cbc128.c \
- modes/cfb128.c \
- modes/ctr128.c \
- modes/ofb128.c \
- objects/o_names.c \
- objects/obj_dat.c \
- objects/obj_err.c \
- objects/obj_lib.c \
- objects/obj_xref.c \
- ocsp/ocsp_asn.c \
- ocsp/ocsp_cl.c \
- ocsp/ocsp_err.c \
- ocsp/ocsp_ext.c \
- ocsp/ocsp_ht.c \
- ocsp/ocsp_lib.c \
- ocsp/ocsp_prn.c \
- ocsp/ocsp_srv.c \
- ocsp/ocsp_vfy.c \
- pem/pem_all.c \
- pem/pem_err.c \
- pem/pem_info.c \
- pem/pem_lib.c \
- pem/pem_oth.c \
- pem/pem_pk8.c \
- pem/pem_pkey.c \
- pem/pem_seal.c \
- pem/pem_sign.c \
- pem/pem_x509.c \
- pem/pem_xaux.c \
- pem/pvkfmt.c \
- pkcs12/p12_add.c \
- pkcs12/p12_asn.c \
- pkcs12/p12_attr.c \
- pkcs12/p12_crpt.c \
- pkcs12/p12_crt.c \
- pkcs12/p12_decr.c \
- pkcs12/p12_init.c \
- pkcs12/p12_key.c \
- pkcs12/p12_kiss.c \
- pkcs12/p12_mutl.c \
- pkcs12/p12_npas.c \
- pkcs12/p12_p8d.c \
- pkcs12/p12_p8e.c \
- pkcs12/p12_utl.c \
- pkcs12/pk12err.c \
- pkcs7/pk7_asn1.c \
- pkcs7/pk7_attr.c \
- pkcs7/pk7_doit.c \
- pkcs7/pk7_lib.c \
- pkcs7/pk7_mime.c \
- pkcs7/pk7_smime.c \
- pkcs7/pkcs7err.c \
- rand/md_rand.c \
- rand/rand_egd.c \
- rand/rand_err.c \
- rand/rand_lib.c \
- rand/rand_unix.c \
- rand/randfile.c \
- rc2/rc2_cbc.c \
- rc2/rc2_ecb.c \
- rc2/rc2_skey.c \
- rc2/rc2cfb64.c \
- rc2/rc2ofb64.c \
- rc4/rc4_enc.c \
- rc4/rc4_skey.c \
- ripemd/rmd_dgst.c \
- ripemd/rmd_one.c \
- rsa/rsa_ameth.c \
- rsa/rsa_asn1.c \
- rsa/rsa_chk.c \
- rsa/rsa_eay.c \
- rsa/rsa_err.c \
- rsa/rsa_gen.c \
- rsa/rsa_lib.c \
- rsa/rsa_none.c \
- rsa/rsa_null.c \
- rsa/rsa_oaep.c \
- rsa/rsa_pk1.c \
- rsa/rsa_pmeth.c \
- rsa/rsa_prn.c \
- rsa/rsa_pss.c \
- rsa/rsa_saos.c \
- rsa/rsa_sign.c \
- rsa/rsa_ssl.c \
- rsa/rsa_x931.c \
- sha/sha1_one.c \
- sha/sha1dgst.c \
- sha/sha256.c \
- sha/sha512.c \
- sha/sha_dgst.c \
- stack/stack.c \
- ts/ts_err.c \
- txt_db/txt_db.c \
- ui/ui_compat.c \
- ui/ui_err.c \
- ui/ui_lib.c \
- ui/ui_openssl.c \
- ui/ui_util.c \
- x509/by_dir.c \
- x509/by_file.c \
- x509/x509_att.c \
- x509/x509_cmp.c \
- x509/x509_d2.c \
- x509/x509_def.c \
- x509/x509_err.c \
- x509/x509_ext.c \
- x509/x509_lu.c \
- x509/x509_obj.c \
- x509/x509_r2x.c \
- x509/x509_req.c \
- x509/x509_set.c \
- x509/x509_trs.c \
- x509/x509_txt.c \
- x509/x509_v3.c \
- x509/x509_vfy.c \
- x509/x509_vpm.c \
- x509/x509cset.c \
- x509/x509name.c \
- x509/x509rset.c \
- x509/x509spki.c \
- x509/x509type.c \
- x509/x_all.c \
- x509v3/pcy_cache.c \
- x509v3/pcy_data.c \
- x509v3/pcy_lib.c \
- x509v3/pcy_map.c \
- x509v3/pcy_node.c \
- x509v3/pcy_tree.c \
- x509v3/v3_akey.c \
- x509v3/v3_akeya.c \
- x509v3/v3_alt.c \
- x509v3/v3_bcons.c \
- x509v3/v3_bitst.c \
- x509v3/v3_conf.c \
- x509v3/v3_cpols.c \
- x509v3/v3_crld.c \
- x509v3/v3_enum.c \
- x509v3/v3_extku.c \
- x509v3/v3_genn.c \
- x509v3/v3_ia5.c \
- x509v3/v3_info.c \
- x509v3/v3_int.c \
- x509v3/v3_lib.c \
- x509v3/v3_ncons.c \
- x509v3/v3_ocsp.c \
- x509v3/v3_pci.c \
- x509v3/v3_pcia.c \
- x509v3/v3_pcons.c \
- x509v3/v3_pku.c \
- x509v3/v3_pmaps.c \
- x509v3/v3_prn.c \
- x509v3/v3_purp.c \
- x509v3/v3_skey.c \
- x509v3/v3_sxnet.c \
- x509v3/v3_utl.c \
- x509v3/v3err.c
-
-local_c_includes := \
- external/openssl \
- external/openssl/crypto/asn1 \
- external/openssl/crypto/evp \
- external/openssl/include \
- external/openssl/include/openssl \
- external/zlib
-
-local_c_flags := -DNO_WINDOWS_BRAINDEATH
-
-#######################################
-# target static library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-ifeq ($(TARGET_ARCH),arm)
-LOCAL_NDK_VERSION := 5
-LOCAL_SDK_VERSION := 9
-endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-endif
-ifeq ($(TARGET_ARCH),mips)
- ifneq (($TARGET_HAS_BIGENDIAN),true)
- LOCAL_SRC_FILES += $(mips_src_files)
- LOCAL_CFLAGS += $(mips_cflags)
- else
- LOCAL_SRC_FILES += $(other_arch_src_files)
- endif
-endif
-ifeq ($(TARGET_ARCH),x86)
- LOCAL_SRC_FILES += $(other_arch_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto_static
-include $(BUILD_STATIC_LIBRARY)
-
-#######################################
-# target shared library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-ifeq ($(TARGET_ARCH),arm)
-LOCAL_NDK_VERSION := 5
-LOCAL_SDK_VERSION := 9
-# Use the NDK prebuilt libz and libdl.
-LOCAL_LDFLAGS += -lz -ldl
-else
-LOCAL_SHARED_LIBRARIES += libz libdl
-endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-endif
-ifeq ($(TARGET_ARCH),mips)
- ifneq (($TARGET_HAS_BIGENDIAN),true)
- LOCAL_SRC_FILES += $(mips_src_files)
- LOCAL_CFLAGS += $(mips_cflags)
- else
- LOCAL_SRC_FILES += $(other_arch_src_files)
- endif
-endif
-ifeq ($(TARGET_ARCH),x86)
- LOCAL_SRC_FILES += $(other_arch_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto
-include $(BUILD_SHARED_LIBRARY)
-
-#######################################
-# host shared library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-LOCAL_C_INCLUDES += $(local_c_includes)
-LOCAL_SRC_FILES += $(other_arch_src_files)
-LOCAL_STATIC_LIBRARIES += libz
-LOCAL_LDLIBS += -ldl
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto
-include $(BUILD_HOST_SHARED_LIBRARY)
-
-########################################
-# host static library, which is used by some SDK tools.
-
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-LOCAL_C_INCLUDES += $(local_c_includes)
-LOCAL_SRC_FILES += $(other_arch_src_files)
-LOCAL_STATIC_LIBRARIES += libz
-LOCAL_LDLIBS += -ldl
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto_static
-include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/crypto/LPdir_win32.c b/crypto/LPdir_win32.c
new file mode 100644
index 0000000..e39872d
--- /dev/null
+++ b/crypto/LPdir_win32.c
@@ -0,0 +1,30 @@
+/* $LP: LPlib/source/LPdir_win32.c,v 1.3 2004/08/26 13:36:05 _cvs_levitte Exp $ */
+/*
+ * Copyright (c) 2004, Richard Levitte
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define LP_SYS_WIN32
+#define LP_MULTIBYTE_AVAILABLE
+#include "LPdir_win.c"
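The new crypto/LPdir_win32.c is a configuration shim: aside from the license block it only defines two feature macros and then textually #includes the shared LPdir_win.c body, so one directory-listing implementation serves several platform front-ends. A sketch of the same shim pattern, with every name hypothetical and the included body assumed to exist elsewhere (as LPdir_win.c does in this tree):

/* dir_win32_shim.c -- hypothetical analogue of LPdir_win32.c.
 * The shim is pure configuration; all code lives in the shared body
 * that is pasted into this translation unit by the #include. */
#define DIR_SYS_WIN32            /* select the Win32 branches in the body */
#define DIR_MULTIBYTE_AVAILABLE  /* allow multibyte-string API use        */
#include "dir_body.c"            /* hypothetical shared implementation    */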
diff --git a/crypto/aes/aes.h b/crypto/aes/aes.h
index d2c9973..031abf0 100644
--- a/crypto/aes/aes.h
+++ b/crypto/aes/aes.h
@@ -90,6 +90,11 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+
void AES_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key);
void AES_decrypt(const unsigned char *in, unsigned char *out,
diff --git a/crypto/aes/aes_core.c b/crypto/aes/aes_core.c
index a7ec54f..8f5210a 100644
--- a/crypto/aes/aes_core.c
+++ b/crypto/aes/aes_core.c
@@ -625,7 +625,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -726,7 +726,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -734,7 +734,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
@@ -1201,7 +1201,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
int i = 0;
@@ -1301,7 +1301,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -1309,7 +1309,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
diff --git a/crypto/aes/aes_misc.c b/crypto/aes/aes_misc.c
index 4fead1b..f083488 100644
--- a/crypto/aes/aes_misc.c
+++ b/crypto/aes/aes_misc.c
@@ -50,6 +50,7 @@
*/
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
#include <openssl/aes.h>
#include "aes_locl.h"
@@ -62,3 +63,23 @@ const char *AES_options(void) {
return "aes(partial)";
#endif
}
+
+/* FIPS wrapper functions to block low level AES calls in FIPS mode */
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_encrypt_key(userKey, bits, key);
+ }
+
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_decrypt_key(userKey, bits, key);
+ }
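The aes.h, aes_core.c, and aes_misc.c hunks above are one coordinated change: the real key expanders are renamed with a private_ prefix, and the public names become wrappers that call fips_cipher_abort() first, so FIPS builds can block direct low-level AES use. From the caller's side nothing changes, which a minimal sketch can show (assumptions: a non-FIPS build of the patched libcrypto, linked with -lcrypto):

/* Hedged sketch, not part of the patch: the public entry points keep
 * their names and signatures, so existing callers compile and run
 * unchanged against the patched library. */
#include <stdio.h>
#include <string.h>
#include <openssl/aes.h>

int main(void)
{
    unsigned char key[16], pt[16], ct[16], rt[16];
    AES_KEY enc, dec;

    memset(key, 0x2b, sizeof(key));
    memset(pt, 0x55, sizeof(pt));

    /* These now resolve to the aes_misc.c wrappers, which forward to
     * private_AES_set_{en,de}crypt_key after the FIPS-mode check. */
    if (AES_set_encrypt_key(key, 128, &enc) != 0 ||
        AES_set_decrypt_key(key, 128, &dec) != 0)
        return 1;

    AES_encrypt(pt, ct, &enc);
    AES_decrypt(ct, rt, &dec);

    printf("round-trip %s\n", memcmp(pt, rt, sizeof(pt)) == 0 ? "ok" : "FAILED");
    return 0;
}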
diff --git a/crypto/aes/asm/aes-586.S b/crypto/aes/asm/aes-586.S
new file mode 100644
index 0000000..20c0238
--- /dev/null
+++ b/crypto/aes/asm/aes-586.S
@@ -0,0 +1,3239 @@
+.file "aes-586.s"
+.text
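+# _x86_AES_encrypt_compact: rounds that use only the 256-byte S-box
+# (addressed from -128(%ebp)) and compute MixColumns arithmetically,
+# trading speed for a far smaller table footprint and thus less
+# cache-timing exposure. The eight plain loads after the frame setup
+# touch the selected S-box copy to pull it into cache.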
+.type _x86_AES_encrypt_compact,@function
+.align 16
+_x86_AES_encrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L000loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ecx
+ roll $24,%ecx
+ xorl %esi,%ecx
+ rorl $16,%ebp
+ xorl %ebp,%ecx
+ rorl $8,%ebp
+ xorl %ebp,%ecx
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %edx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%edx
+ roll $24,%edx
+ xorl %esi,%edx
+ rorl $16,%ebp
+ xorl %ebp,%edx
+ rorl $8,%ebp
+ xorl %ebp,%edx
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %eax,%ebp
+ xorl %edi,%esi
+ xorl %esi,%eax
+ roll $24,%eax
+ xorl %esi,%eax
+ rorl $16,%ebp
+ xorl %ebp,%eax
+ rorl $8,%ebp
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ebx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ebx
+ roll $24,%ebx
+ xorl %esi,%ebx
+ rorl $16,%ebp
+ xorl %ebp,%ebx
+ rorl $8,%ebp
+ xorl %ebp,%ebx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L000loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact
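+# _sse_AES_encrypt_compact: the same compact-S-box algorithm with the
+# block state held in %mm0/%mm4 and MixColumns done with packed MMX
+# byte operations; entered only when the CPU reports SSE (see the
+# capability test in AES_encrypt below).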
+.type _sse_AES_encrypt_compact,@function
+.align 16
+_sse_AES_encrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L001loop:
+ pshufw $8,%mm0,%mm1
+ pshufw $13,%mm4,%mm5
+ movd %mm1,%eax
+ movd %mm5,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ pshufw $13,%mm0,%mm2
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ shrl $16,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ pshufw $8,%mm4,%mm6
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm2,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ movd %mm6,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm1
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ shrl $16,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ punpckldq %mm1,%mm0
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ andl $255,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $16,%eax
+ orl %eax,%edx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm4
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movd %edx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L002out
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pcmpgtb %mm0,%mm3
+ pcmpgtb %mm4,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ pshufw $177,%mm0,%mm2
+ pshufw $177,%mm4,%mm6
+ paddb %mm0,%mm0
+ paddb %mm4,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pshufw $177,%mm2,%mm3
+ pshufw $177,%mm6,%mm7
+ pxor %mm0,%mm1
+ pxor %mm4,%mm5
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm3,%mm2
+ movq %mm7,%mm6
+ pslld $8,%mm3
+ pslld $8,%mm7
+ psrld $24,%mm2
+ psrld $24,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ psrld $8,%mm1
+ psrld $8,%mm5
+ movl -128(%ebp),%eax
+ pslld $24,%mm3
+ pslld $24,%mm7
+ movl -64(%ebp),%ebx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl (%ebp),%ecx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L001loop
+.align 16
+.L002out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact
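+# _x86_AES_encrypt: full T-table rounds. Every .LAES_Te word below is
+# stored twice, giving an 8-byte stride, so the byte-rotated views of
+# an entry can be fetched directly with the 1/2/3(%ebp,%reg,8)
+# addressing used in the round loop.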
+.type _x86_AES_encrypt,@function
+.align 16
+_x86_AES_encrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L003loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl (%ebp,%esi,8),%esi
+ movzbl %ch,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movzbl %bh,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ movl (%ebp,%edx,8),%edx
+ movzbl %ah,%eax
+ xorl 3(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ xorl 1(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L003loop
+ movl %eax,%esi
+ andl $255,%esi
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %bh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %ch,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %dh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl 2(%ebp,%edx,8),%edx
+ andl $255,%edx
+ movzbl %ah,%eax
+ movl (%ebp,%eax,8),%eax
+ andl $65280,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movl (%ebp,%ebx,8),%ebx
+ andl $16711680,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movl 2(%ebp,%ecx,8),%ecx
+ andl $4278190080,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
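+# .LAES_Te: 256 T-table words, each duplicated for the 8-byte stride,
+# followed by four 256-byte copies of the forward S-box (one copy is
+# picked per call to vary cache-set usage) and the rcon constants.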
+.LAES_Te:
+.long 2774754246,2774754246
+.long 2222750968,2222750968
+.long 2574743534,2574743534
+.long 2373680118,2373680118
+.long 234025727,234025727
+.long 3177933782,3177933782
+.long 2976870366,2976870366
+.long 1422247313,1422247313
+.long 1345335392,1345335392
+.long 50397442,50397442
+.long 2842126286,2842126286
+.long 2099981142,2099981142
+.long 436141799,436141799
+.long 1658312629,1658312629
+.long 3870010189,3870010189
+.long 2591454956,2591454956
+.long 1170918031,1170918031
+.long 2642575903,2642575903
+.long 1086966153,1086966153
+.long 2273148410,2273148410
+.long 368769775,368769775
+.long 3948501426,3948501426
+.long 3376891790,3376891790
+.long 200339707,200339707
+.long 3970805057,3970805057
+.long 1742001331,1742001331
+.long 4255294047,4255294047
+.long 3937382213,3937382213
+.long 3214711843,3214711843
+.long 4154762323,4154762323
+.long 2524082916,2524082916
+.long 1539358875,1539358875
+.long 3266819957,3266819957
+.long 486407649,486407649
+.long 2928907069,2928907069
+.long 1780885068,1780885068
+.long 1513502316,1513502316
+.long 1094664062,1094664062
+.long 49805301,49805301
+.long 1338821763,1338821763
+.long 1546925160,1546925160
+.long 4104496465,4104496465
+.long 887481809,887481809
+.long 150073849,150073849
+.long 2473685474,2473685474
+.long 1943591083,1943591083
+.long 1395732834,1395732834
+.long 1058346282,1058346282
+.long 201589768,201589768
+.long 1388824469,1388824469
+.long 1696801606,1696801606
+.long 1589887901,1589887901
+.long 672667696,672667696
+.long 2711000631,2711000631
+.long 251987210,251987210
+.long 3046808111,3046808111
+.long 151455502,151455502
+.long 907153956,907153956
+.long 2608889883,2608889883
+.long 1038279391,1038279391
+.long 652995533,652995533
+.long 1764173646,1764173646
+.long 3451040383,3451040383
+.long 2675275242,2675275242
+.long 453576978,453576978
+.long 2659418909,2659418909
+.long 1949051992,1949051992
+.long 773462580,773462580
+.long 756751158,756751158
+.long 2993581788,2993581788
+.long 3998898868,3998898868
+.long 4221608027,4221608027
+.long 4132590244,4132590244
+.long 1295727478,1295727478
+.long 1641469623,1641469623
+.long 3467883389,3467883389
+.long 2066295122,2066295122
+.long 1055122397,1055122397
+.long 1898917726,1898917726
+.long 2542044179,2542044179
+.long 4115878822,4115878822
+.long 1758581177,1758581177
+.long 0,0
+.long 753790401,753790401
+.long 1612718144,1612718144
+.long 536673507,536673507
+.long 3367088505,3367088505
+.long 3982187446,3982187446
+.long 3194645204,3194645204
+.long 1187761037,1187761037
+.long 3653156455,3653156455
+.long 1262041458,1262041458
+.long 3729410708,3729410708
+.long 3561770136,3561770136
+.long 3898103984,3898103984
+.long 1255133061,1255133061
+.long 1808847035,1808847035
+.long 720367557,720367557
+.long 3853167183,3853167183
+.long 385612781,385612781
+.long 3309519750,3309519750
+.long 3612167578,3612167578
+.long 1429418854,1429418854
+.long 2491778321,2491778321
+.long 3477423498,3477423498
+.long 284817897,284817897
+.long 100794884,100794884
+.long 2172616702,2172616702
+.long 4031795360,4031795360
+.long 1144798328,1144798328
+.long 3131023141,3131023141
+.long 3819481163,3819481163
+.long 4082192802,4082192802
+.long 4272137053,4272137053
+.long 3225436288,3225436288
+.long 2324664069,2324664069
+.long 2912064063,2912064063
+.long 3164445985,3164445985
+.long 1211644016,1211644016
+.long 83228145,83228145
+.long 3753688163,3753688163
+.long 3249976951,3249976951
+.long 1977277103,1977277103
+.long 1663115586,1663115586
+.long 806359072,806359072
+.long 452984805,452984805
+.long 250868733,250868733
+.long 1842533055,1842533055
+.long 1288555905,1288555905
+.long 336333848,336333848
+.long 890442534,890442534
+.long 804056259,804056259
+.long 3781124030,3781124030
+.long 2727843637,2727843637
+.long 3427026056,3427026056
+.long 957814574,957814574
+.long 1472513171,1472513171
+.long 4071073621,4071073621
+.long 2189328124,2189328124
+.long 1195195770,1195195770
+.long 2892260552,2892260552
+.long 3881655738,3881655738
+.long 723065138,723065138
+.long 2507371494,2507371494
+.long 2690670784,2690670784
+.long 2558624025,2558624025
+.long 3511635870,3511635870
+.long 2145180835,2145180835
+.long 1713513028,1713513028
+.long 2116692564,2116692564
+.long 2878378043,2878378043
+.long 2206763019,2206763019
+.long 3393603212,3393603212
+.long 703524551,703524551
+.long 3552098411,3552098411
+.long 1007948840,1007948840
+.long 2044649127,2044649127
+.long 3797835452,3797835452
+.long 487262998,487262998
+.long 1994120109,1994120109
+.long 1004593371,1004593371
+.long 1446130276,1446130276
+.long 1312438900,1312438900
+.long 503974420,503974420
+.long 3679013266,3679013266
+.long 168166924,168166924
+.long 1814307912,1814307912
+.long 3831258296,3831258296
+.long 1573044895,1573044895
+.long 1859376061,1859376061
+.long 4021070915,4021070915
+.long 2791465668,2791465668
+.long 2828112185,2828112185
+.long 2761266481,2761266481
+.long 937747667,937747667
+.long 2339994098,2339994098
+.long 854058965,854058965
+.long 1137232011,1137232011
+.long 1496790894,1496790894
+.long 3077402074,3077402074
+.long 2358086913,2358086913
+.long 1691735473,1691735473
+.long 3528347292,3528347292
+.long 3769215305,3769215305
+.long 3027004632,3027004632
+.long 4199962284,4199962284
+.long 133494003,133494003
+.long 636152527,636152527
+.long 2942657994,2942657994
+.long 2390391540,2390391540
+.long 3920539207,3920539207
+.long 403179536,403179536
+.long 3585784431,3585784431
+.long 2289596656,2289596656
+.long 1864705354,1864705354
+.long 1915629148,1915629148
+.long 605822008,605822008
+.long 4054230615,4054230615
+.long 3350508659,3350508659
+.long 1371981463,1371981463
+.long 602466507,602466507
+.long 2094914977,2094914977
+.long 2624877800,2624877800
+.long 555687742,555687742
+.long 3712699286,3712699286
+.long 3703422305,3703422305
+.long 2257292045,2257292045
+.long 2240449039,2240449039
+.long 2423288032,2423288032
+.long 1111375484,1111375484
+.long 3300242801,3300242801
+.long 2858837708,2858837708
+.long 3628615824,3628615824
+.long 84083462,84083462
+.long 32962295,32962295
+.long 302911004,302911004
+.long 2741068226,2741068226
+.long 1597322602,1597322602
+.long 4183250862,4183250862
+.long 3501832553,3501832553
+.long 2441512471,2441512471
+.long 1489093017,1489093017
+.long 656219450,656219450
+.long 3114180135,3114180135
+.long 954327513,954327513
+.long 335083755,335083755
+.long 3013122091,3013122091
+.long 856756514,856756514
+.long 3144247762,3144247762
+.long 1893325225,1893325225
+.long 2307821063,2307821063
+.long 2811532339,2811532339
+.long 3063651117,3063651117
+.long 572399164,572399164
+.long 2458355477,2458355477
+.long 552200649,552200649
+.long 1238290055,1238290055
+.long 4283782570,4283782570
+.long 2015897680,2015897680
+.long 2061492133,2061492133
+.long 2408352771,2408352771
+.long 4171342169,4171342169
+.long 2156497161,2156497161
+.long 386731290,386731290
+.long 3669999461,3669999461
+.long 837215959,837215959
+.long 3326231172,3326231172
+.long 3093850320,3093850320
+.long 3275833730,3275833730
+.long 2962856233,2962856233
+.long 1999449434,1999449434
+.long 286199582,286199582
+.long 3417354363,3417354363
+.long 4233385128,4233385128
+.long 3602627437,3602627437
+.long 974525996,974525996
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.long 1,2,4,8
+.long 16,32,64,128
+.long 27,54,0,0
+.long 0,0,0,0
+.size _x86_AES_encrypt,.-_x86_AES_encrypt
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+AES_encrypt:
+.L_AES_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
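+# The arithmetic above 64-byte-aligns the frame and biases it against
+# the key-schedule address in %edi, apparently to limit cache-line
+# aliasing between stack and tables. The call/pop below then leaves
+# the current EIP in %ebp, from which the GOT (for OPENSSL_ia32cap_P)
+# and .LAES_Te are addressed position-independently.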
+ call .L004pic_point
+.L004pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L004pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Te-.L004pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
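+# Bit 25 of OPENSSL_ia32cap_P mirrors CPUID.1:EDX bit 25 (SSE): if
+# set, run the MMX/SSE compact path, otherwise the integer one.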
+ btl $25,(%eax)
+ jnc .L005x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L005x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_encrypt,.-.L_AES_encrypt_begin
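+# Decryption mirrors the encryption half: an integer compact routine,
+# an MMX/SSE compact routine, a full T-table routine over .LAES_Td,
+# and the public AES_decrypt dispatcher.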
+.type _x86_AES_decrypt_compact,@function
+.align 16
+_x86_AES_decrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L006loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%eax
+ subl %edi,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ecx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ecx,%ebx
+ roll $8,%ecx
+ xorl %esi,%ebp
+ xorl %eax,%ecx
+ xorl %ebp,%eax
+ roll $24,%eax
+ xorl %ebx,%ecx
+ xorl %ebp,%ebx
+ roll $16,%ebx
+ xorl %ebp,%ecx
+ roll $8,%ebp
+ xorl %eax,%ecx
+ xorl %ebx,%ecx
+ movl 4(%esp),%eax
+ xorl %ebp,%ecx
+ movl %ecx,12(%esp)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %edx,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %edx,%ecx
+ roll $8,%edx
+ xorl %esi,%ebp
+ xorl %ebx,%edx
+ xorl %ebp,%ebx
+ roll $24,%ebx
+ xorl %ecx,%edx
+ xorl %ebp,%ecx
+ roll $16,%ecx
+ xorl %ebp,%edx
+ roll $8,%ebp
+ xorl %ebx,%edx
+ xorl %ecx,%edx
+ movl 8(%esp),%ebx
+ xorl %ebp,%edx
+ movl %edx,16(%esp)
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %eax,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %eax,%edx
+ roll $8,%eax
+ xorl %esi,%ebp
+ xorl %ecx,%eax
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%eax
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%eax
+ roll $8,%ebp
+ xorl %ecx,%eax
+ xorl %edx,%eax
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ebx,%edx
+ roll $8,%ebx
+ xorl %esi,%ebp
+ xorl %ecx,%ebx
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%ebx
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%ebx
+ roll $8,%ebp
+ xorl %ecx,%ebx
+ xorl %edx,%ebx
+ movl 12(%esp),%ecx
+ xorl %ebp,%ebx
+ movl 16(%esp),%edx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L006loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact
+.type _sse_AES_decrypt_compact,@function
+.align 16
+_sse_AES_decrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L007loop:
+ pshufw $12,%mm0,%mm1
+ movd %mm1,%eax
+ pshufw $9,%mm4,%mm5
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm5,%ebx
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ pshufw $6,%mm0,%mm2
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ pshufw $3,%mm4,%mm6
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movd %mm2,%eax
+ movzbl -128(%ebp,%esi,1),%ecx
+ shll $16,%ecx
+ movzbl %bl,%esi
+ movd %mm6,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%ecx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%edx
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm1
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%edx
+ shll $8,%edx
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ punpckldq %mm1,%mm0
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm4
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ orl %eax,%ecx
+ movd %ecx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L008out
+ movq %mm0,%mm3
+ movq %mm4,%mm7
+ pshufw $228,%mm0,%mm2
+ pshufw $228,%mm4,%mm6
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pshufw $177,%mm0,%mm0
+ pshufw $177,%mm4,%mm4
+ pslld $8,%mm2
+ pslld $8,%mm6
+ psrld $8,%mm3
+ psrld $8,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pslld $16,%mm2
+ pslld $16,%mm6
+ psrld $16,%mm3
+ psrld $16,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movq 8(%esp),%mm3
+ pxor %mm2,%mm2
+ pxor %mm6,%mm6
+ pcmpgtb %mm1,%mm2
+ pcmpgtb %mm5,%mm6
+ pand %mm3,%mm2
+ pand %mm3,%mm6
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm2,%mm1
+ pxor %mm6,%mm5
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq %mm1,%mm2
+ movq %mm5,%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pslld $24,%mm3
+ pslld $24,%mm7
+ psrld $8,%mm2
+ psrld $8,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pshufw $177,%mm1,%mm3
+ pshufw $177,%mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ pshufw $177,%mm1,%mm2
+ pshufw $177,%mm5,%mm6
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pslld $8,%mm1
+ pslld $8,%mm5
+ psrld $8,%mm3
+ psrld $8,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl -128(%ebp),%eax
+ pslld $16,%mm1
+ pslld $16,%mm5
+ movl -64(%ebp),%ebx
+ psrld $16,%mm3
+ psrld $16,%mm7
+ movl (%ebp),%ecx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L007loop
+.align 16
+.L008out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact
+.type _x86_AES_decrypt,@function
+.align 16
+_x86_AES_decrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L009loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %ah,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl (%ebp,%edx,8),%edx
+ movzbl %ch,%ecx
+ xorl 3(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ xorl 1(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L009loop
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+ leal -128(%ebp),%ebp
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl (%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl (%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl (%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl (%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ leal -2048(%ebp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
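+# .LAES_Td: duplicated inverse T-table words, followed by four
+# 256-byte copies of the inverse S-box (no rcon needed here).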
+.LAES_Td:
+.long 1353184337,1353184337
+.long 1399144830,1399144830
+.long 3282310938,3282310938
+.long 2522752826,2522752826
+.long 3412831035,3412831035
+.long 4047871263,4047871263
+.long 2874735276,2874735276
+.long 2466505547,2466505547
+.long 1442459680,1442459680
+.long 4134368941,4134368941
+.long 2440481928,2440481928
+.long 625738485,625738485
+.long 4242007375,4242007375
+.long 3620416197,3620416197
+.long 2151953702,2151953702
+.long 2409849525,2409849525
+.long 1230680542,1230680542
+.long 1729870373,1729870373
+.long 2551114309,2551114309
+.long 3787521629,3787521629
+.long 41234371,41234371
+.long 317738113,317738113
+.long 2744600205,2744600205
+.long 3338261355,3338261355
+.long 3881799427,3881799427
+.long 2510066197,2510066197
+.long 3950669247,3950669247
+.long 3663286933,3663286933
+.long 763608788,763608788
+.long 3542185048,3542185048
+.long 694804553,694804553
+.long 1154009486,1154009486
+.long 1787413109,1787413109
+.long 2021232372,2021232372
+.long 1799248025,1799248025
+.long 3715217703,3715217703
+.long 3058688446,3058688446
+.long 397248752,397248752
+.long 1722556617,1722556617
+.long 3023752829,3023752829
+.long 407560035,407560035
+.long 2184256229,2184256229
+.long 1613975959,1613975959
+.long 1165972322,1165972322
+.long 3765920945,3765920945
+.long 2226023355,2226023355
+.long 480281086,480281086
+.long 2485848313,2485848313
+.long 1483229296,1483229296
+.long 436028815,436028815
+.long 2272059028,2272059028
+.long 3086515026,3086515026
+.long 601060267,601060267
+.long 3791801202,3791801202
+.long 1468997603,1468997603
+.long 715871590,715871590
+.long 120122290,120122290
+.long 63092015,63092015
+.long 2591802758,2591802758
+.long 2768779219,2768779219
+.long 4068943920,4068943920
+.long 2997206819,2997206819
+.long 3127509762,3127509762
+.long 1552029421,1552029421
+.long 723308426,723308426
+.long 2461301159,2461301159
+.long 4042393587,4042393587
+.long 2715969870,2715969870
+.long 3455375973,3455375973
+.long 3586000134,3586000134
+.long 526529745,526529745
+.long 2331944644,2331944644
+.long 2639474228,2639474228
+.long 2689987490,2689987490
+.long 853641733,853641733
+.long 1978398372,1978398372
+.long 971801355,971801355
+.long 2867814464,2867814464
+.long 111112542,111112542
+.long 1360031421,1360031421
+.long 4186579262,4186579262
+.long 1023860118,1023860118
+.long 2919579357,2919579357
+.long 1186850381,1186850381
+.long 3045938321,3045938321
+.long 90031217,90031217
+.long 1876166148,1876166148
+.long 4279586912,4279586912
+.long 620468249,620468249
+.long 2548678102,2548678102
+.long 3426959497,3426959497
+.long 2006899047,2006899047
+.long 3175278768,3175278768
+.long 2290845959,2290845959
+.long 945494503,945494503
+.long 3689859193,3689859193
+.long 1191869601,1191869601
+.long 3910091388,3910091388
+.long 3374220536,3374220536
+.long 0,0
+.long 2206629897,2206629897
+.long 1223502642,1223502642
+.long 2893025566,2893025566
+.long 1316117100,1316117100
+.long 4227796733,4227796733
+.long 1446544655,1446544655
+.long 517320253,517320253
+.long 658058550,658058550
+.long 1691946762,1691946762
+.long 564550760,564550760
+.long 3511966619,3511966619
+.long 976107044,976107044
+.long 2976320012,2976320012
+.long 266819475,266819475
+.long 3533106868,3533106868
+.long 2660342555,2660342555
+.long 1338359936,1338359936
+.long 2720062561,2720062561
+.long 1766553434,1766553434
+.long 370807324,370807324
+.long 179999714,179999714
+.long 3844776128,3844776128
+.long 1138762300,1138762300
+.long 488053522,488053522
+.long 185403662,185403662
+.long 2915535858,2915535858
+.long 3114841645,3114841645
+.long 3366526484,3366526484
+.long 2233069911,2233069911
+.long 1275557295,1275557295
+.long 3151862254,3151862254
+.long 4250959779,4250959779
+.long 2670068215,2670068215
+.long 3170202204,3170202204
+.long 3309004356,3309004356
+.long 880737115,880737115
+.long 1982415755,1982415755
+.long 3703972811,3703972811
+.long 1761406390,1761406390
+.long 1676797112,1676797112
+.long 3403428311,3403428311
+.long 277177154,277177154
+.long 1076008723,1076008723
+.long 538035844,538035844
+.long 2099530373,2099530373
+.long 4164795346,4164795346
+.long 288553390,288553390
+.long 1839278535,1839278535
+.long 1261411869,1261411869
+.long 4080055004,4080055004
+.long 3964831245,3964831245
+.long 3504587127,3504587127
+.long 1813426987,1813426987
+.long 2579067049,2579067049
+.long 4199060497,4199060497
+.long 577038663,577038663
+.long 3297574056,3297574056
+.long 440397984,440397984
+.long 3626794326,3626794326
+.long 4019204898,4019204898
+.long 3343796615,3343796615
+.long 3251714265,3251714265
+.long 4272081548,4272081548
+.long 906744984,906744984
+.long 3481400742,3481400742
+.long 685669029,685669029
+.long 646887386,646887386
+.long 2764025151,2764025151
+.long 3835509292,3835509292
+.long 227702864,227702864
+.long 2613862250,2613862250
+.long 1648787028,1648787028
+.long 3256061430,3256061430
+.long 3904428176,3904428176
+.long 1593260334,1593260334
+.long 4121936770,4121936770
+.long 3196083615,3196083615
+.long 2090061929,2090061929
+.long 2838353263,2838353263
+.long 3004310991,3004310991
+.long 999926984,999926984
+.long 2809993232,2809993232
+.long 1852021992,1852021992
+.long 2075868123,2075868123
+.long 158869197,158869197
+.long 4095236462,4095236462
+.long 28809964,28809964
+.long 2828685187,2828685187
+.long 1701746150,1701746150
+.long 2129067946,2129067946
+.long 147831841,147831841
+.long 3873969647,3873969647
+.long 3650873274,3650873274
+.long 3459673930,3459673930
+.long 3557400554,3557400554
+.long 3598495785,3598495785
+.long 2947720241,2947720241
+.long 824393514,824393514
+.long 815048134,815048134
+.long 3227951669,3227951669
+.long 935087732,935087732
+.long 2798289660,2798289660
+.long 2966458592,2966458592
+.long 366520115,366520115
+.long 1251476721,1251476721
+.long 4158319681,4158319681
+.long 240176511,240176511
+.long 804688151,804688151
+.long 2379631990,2379631990
+.long 1303441219,1303441219
+.long 1414376140,1414376140
+.long 3741619940,3741619940
+.long 3820343710,3820343710
+.long 461924940,461924940
+.long 3089050817,3089050817
+.long 2136040774,2136040774
+.long 82468509,82468509
+.long 1563790337,1563790337
+.long 1937016826,1937016826
+.long 776014843,776014843
+.long 1511876531,1511876531
+.long 1389550482,1389550482
+.long 861278441,861278441
+.long 323475053,323475053
+.long 2355222426,2355222426
+.long 2047648055,2047648055
+.long 2383738969,2383738969
+.long 2302415851,2302415851
+.long 3995576782,3995576782
+.long 902390199,902390199
+.long 3991215329,3991215329
+.long 1018251130,1018251130
+.long 1507840668,1507840668
+.long 1064563285,1064563285
+.long 2043548696,2043548696
+.long 3208103795,3208103795
+.long 3939366739,3939366739
+.long 1537932639,1537932639
+.long 342834655,342834655
+.long 2262516856,2262516856
+.long 2180231114,2180231114
+.long 1053059257,1053059257
+.long 741614648,741614648
+.long 1598071746,1598071746
+.long 1925389590,1925389590
+.long 203809468,203809468
+.long 2336832552,2336832552
+.long 1100287487,1100287487
+.long 1895934009,1895934009
+.long 3736275976,3736275976
+.long 2632234200,2632234200
+.long 2428589668,2428589668
+.long 1636092795,1636092795
+.long 1890988757,1890988757
+.long 1952214088,1952214088
+.long 1113045200,1113045200
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.size _x86_AES_decrypt,.-_x86_AES_decrypt
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+AES_decrypt:
+.L_AES_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
+ call .L010pic_point
+.L010pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L010pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Td-.L010pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ btl $25,(%eax)
+ jnc .L011x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L011x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_decrypt,.-.L_AES_decrypt_begin
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+AES_cbc_encrypt:
+.L_AES_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ecx
+ cmpl $0,%ecx
+ je .L012drop_out
+ call .L013pic_point
+.L013pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L013pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ cmpl $0,40(%esp)
+ leal .LAES_Te-.L013pic_point(%ebp),%ebp
+ jne .L014picked_te
+ leal .LAES_Td-.LAES_Te(%ebp),%ebp
+.L014picked_te:
+ pushfl
+ cld
+ cmpl $512,%ecx
+ jb .L015slow_way
+ testl $15,%ecx
+ jnz .L015slow_way
+ btl $28,(%eax)
+ jc .L015slow_way
+ leal -324(%esp),%esi
+ andl $-64,%esi
+ movl %ebp,%eax
+ leal 2304(%ebp),%ebx
+ movl %esi,%edx
+ andl $4095,%eax
+ andl $4095,%ebx
+ andl $4095,%edx
+ cmpl %ebx,%edx
+ jb .L016tbl_break_out
+ subl %ebx,%edx
+ subl %edx,%esi
+ jmp .L017tbl_ok
+.align 4
+.L016tbl_break_out:
+ subl %eax,%edx
+ andl $4095,%edx
+ addl $384,%edx
+ subl %edx,%esi
+.align 4
+.L017tbl_ok:
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 12(%edx),%edi
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl $0,316(%esp)
+ movl %edi,%ebx
+ movl $61,%ecx
+ subl %ebp,%ebx
+ movl %edi,%esi
+ andl $4095,%ebx
+ leal 76(%esp),%edi
+ cmpl $2304,%ebx
+ jb .L018do_copy
+ cmpl $3852,%ebx
+ jb .L019skip_copy
+.align 4
+.L018do_copy:
+ movl %edi,44(%esp)
+.long 2784229001
+.L019skip_copy:
+ movl $16,%edi
+.align 4
+.L020prefetch_tbl:
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%esi
+ leal 128(%ebp),%ebp
+ subl $1,%edi
+ jnz .L020prefetch_tbl
+ subl $2048,%ebp
+ movl 32(%esp),%esi
+ movl 48(%esp),%edi
+ cmpl $0,%edx
+ je .L021fast_decrypt
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 16
+.L022fast_enc_loop:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ leal 16(%esi),%esi
+ movl 40(%esp),%ecx
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L022fast_enc_loop
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L023skip_ezero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L023skip_ezero:
+ movl 28(%esp),%esp
+ popfl
+.L012drop_out:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L021fast_decrypt:
+ cmpl 36(%esp),%esi
+ je .L024fast_dec_in_place
+ movl %edi,52(%esp)
+.align 4
+.align 16
+.L025fast_dec_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 52(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 36(%esp),%edi
+ movl 32(%esp),%esi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl %esi,52(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edi
+ movl %edi,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L025fast_dec_loop
+ movl 52(%esp),%edi
+ movl 48(%esp),%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ jmp .L026fast_dec_out
+.align 16
+.L024fast_dec_in_place:
+.L027fast_dec_in_place_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 48(%esp),%edi
+ movl 36(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L027fast_dec_in_place_loop
+.align 4
+.L026fast_dec_out:
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L028skip_dzero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L028skip_dzero:
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L015slow_way:
+ movl (%eax),%eax
+ movl 36(%esp),%edi
+ leal -80(%esp),%esi
+ andl $-64,%esi
+ leal -143(%edi),%ebx
+ subl %esi,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esi
+ leal 768(%esi),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl %eax,52(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl %esi,%edi
+ movl %eax,%esi
+ cmpl $0,%edx
+ je .L029slow_decrypt
+ cmpl $16,%ecx
+ movl %ebx,%edx
+ jb .L030slow_enc_tail
+ btl $25,52(%esp)
+ jnc .L031slow_enc_x86
+ movq (%edi),%mm0
+ movq 8(%edi),%mm4
+.align 16
+.L032slow_enc_loop_sse:
+ pxor (%esi),%mm0
+ pxor 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl 40(%esp),%ecx
+ movq %mm0,(%edi)
+ movq %mm4,8(%edi)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L032slow_enc_loop_sse
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L031slow_enc_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 4
+.L033slow_enc_loop_x86:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L033slow_enc_loop_x86
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L030slow_enc_tail:
+ emms
+ movl %edx,%edi
+ movl $16,%ebx
+ subl %ecx,%ebx
+ cmpl %esi,%edi
+ je .L034enc_in_place
+.align 4
+.long 2767451785
+ jmp .L035enc_skip_in_place
+.L034enc_in_place:
+ leal (%edi,%ecx,1),%edi
+.L035enc_skip_in_place:
+ movl %ebx,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2868115081
+ movl 48(%esp),%edi
+ movl %edx,%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl $16,40(%esp)
+ jmp .L033slow_enc_loop_x86
+.align 16
+.L029slow_decrypt:
+ btl $25,52(%esp)
+ jnc .L036slow_dec_loop_x86
+.align 4
+.L037slow_dec_loop_sse:
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_decrypt_compact
+ movl 32(%esp),%esi
+ leal 60(%esp),%eax
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl 48(%esp),%edi
+ movq (%esi),%mm1
+ movq 8(%esi),%mm5
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movq %mm1,(%edi)
+ movq %mm5,8(%edi)
+ subl $16,%ecx
+ jc .L038slow_dec_partial_sse
+ movq %mm0,(%ebx)
+ movq %mm4,8(%ebx)
+ leal 16(%ebx),%ebx
+ movl %ebx,36(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ movl %ecx,40(%esp)
+ jnz .L037slow_dec_loop_sse
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L038slow_dec_partial_sse:
+ movq %mm0,(%eax)
+ movq %mm4,8(%eax)
+ emms
+ addl $16,%ecx
+ movl %ebx,%edi
+ movl %eax,%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L036slow_dec_loop_x86:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt_compact
+ movl 48(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ subl $16,%esi
+ jc .L039slow_dec_partial_x86
+ movl %esi,40(%esp)
+ movl 36(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ jnz .L036slow_dec_loop_x86
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L039slow_dec_partial_x86:
+ leal 60(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 32(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl 36(%esp),%edi
+ leal 60(%esp),%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin
+.type _x86_AES_set_encrypt_key,@function
+.align 16
+_x86_AES_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%esi
+ movl 32(%esp),%edi
+ testl $-1,%esi
+ jz .L040badpointer
+ testl $-1,%edi
+ jz .L040badpointer
+ call .L041pic_point
+.L041pic_point:
+ popl %ebp
+ leal .LAES_Te-.L041pic_point(%ebp),%ebp
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+ movl 28(%esp),%ecx
+ cmpl $128,%ecx
+ je .L04210rounds
+ cmpl $192,%ecx
+ je .L04312rounds
+ cmpl $256,%ecx
+ je .L04414rounds
+ movl $-2,%eax
+ jmp .L045exit
+.L04210rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ xorl %ecx,%ecx
+ jmp .L04610shortcut
+.align 4
+.L04710loop:
+ movl (%edi),%eax
+ movl 12(%edi),%edx
+.L04610shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,16(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,20(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,24(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,28(%edi)
+ incl %ecx
+ addl $16,%edi
+ cmpl $10,%ecx
+ jl .L04710loop
+ movl $10,80(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04312rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%ecx
+ movl 20(%esi),%edx
+ movl %ecx,16(%edi)
+ movl %edx,20(%edi)
+ xorl %ecx,%ecx
+ jmp .L04812shortcut
+.align 4
+.L04912loop:
+ movl (%edi),%eax
+ movl 20(%edi),%edx
+.L04812shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,24(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,28(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,32(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,36(%edi)
+ cmpl $7,%ecx
+ je .L05012break
+ incl %ecx
+ xorl 16(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,44(%edi)
+ addl $24,%edi
+ jmp .L04912loop
+.L05012break:
+ movl $12,72(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04414rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,16(%edi)
+ movl %ebx,20(%edi)
+ movl %ecx,24(%edi)
+ movl %edx,28(%edi)
+ xorl %ecx,%ecx
+ jmp .L05114shortcut
+.align 4
+.L05214loop:
+ movl 28(%edi),%edx
+.L05114shortcut:
+ movl (%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,32(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,36(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,44(%edi)
+ cmpl $6,%ecx
+ je .L05314break
+ incl %ecx
+ movl %eax,%edx
+ movl 16(%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movl %eax,48(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,52(%edi)
+ xorl 24(%edi),%eax
+ movl %eax,56(%edi)
+ xorl 28(%edi),%eax
+ movl %eax,60(%edi)
+ addl $32,%edi
+ jmp .L05214loop
+.L05314break:
+ movl $14,48(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L040badpointer:
+ movl $-1,%eax
+.L045exit:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+.L_private_AES_set_encrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ ret
+.size private_AES_set_encrypt_key,.-.L_private_AES_set_encrypt_key_begin
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+.L_private_AES_set_decrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ cmpl $0,%eax
+ je .L054proceed
+ ret
+.L054proceed:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%esi
+ movl 240(%esi),%ecx
+ leal (,%ecx,4),%ecx
+ leal (%esi,%ecx,4),%edi
+.align 4
+.L055invert:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl (%edi),%ecx
+ movl 4(%edi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,(%esi)
+ movl %edx,4(%esi)
+ movl 8(%esi),%eax
+ movl 12(%esi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,8(%edi)
+ movl %ebx,12(%edi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ addl $16,%esi
+ subl $16,%edi
+ cmpl %edi,%esi
+ jne .L055invert
+ movl 28(%esp),%edi
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,28(%esp)
+ movl 16(%edi),%eax
+.align 4
+.L056permute:
+ addl $16,%edi
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %eax,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ xorl %eax,%ecx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ roll $8,%eax
+ xorl %esi,%edx
+ movl 4(%edi),%ebp
+ xorl %ebx,%eax
+ xorl %edx,%ebx
+ xorl %ecx,%eax
+ roll $24,%ebx
+ xorl %edx,%ecx
+ xorl %edx,%eax
+ roll $16,%ecx
+ xorl %ebx,%eax
+ roll $8,%edx
+ xorl %ecx,%eax
+ movl %ebp,%ebx
+ xorl %edx,%eax
+ movl %eax,(%edi)
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ xorl %ebx,%edx
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ roll $8,%ebx
+ xorl %esi,%eax
+ movl 8(%edi),%ebp
+ xorl %ecx,%ebx
+ xorl %eax,%ecx
+ xorl %edx,%ebx
+ roll $24,%ecx
+ xorl %eax,%edx
+ xorl %eax,%ebx
+ roll $16,%edx
+ xorl %ecx,%ebx
+ roll $8,%eax
+ xorl %edx,%ebx
+ movl %ebp,%ecx
+ xorl %eax,%ebx
+ movl %ebx,4(%edi)
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %ecx,%edx
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ xorl %ecx,%eax
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ roll $8,%ecx
+ xorl %esi,%ebx
+ movl 12(%edi),%ebp
+ xorl %edx,%ecx
+ xorl %ebx,%edx
+ xorl %eax,%ecx
+ roll $24,%edx
+ xorl %ebx,%eax
+ xorl %ebx,%ecx
+ roll $16,%eax
+ xorl %edx,%ecx
+ roll $8,%ebx
+ xorl %eax,%ecx
+ movl %ebp,%edx
+ xorl %ebx,%ecx
+ movl %ecx,8(%edi)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %edx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ xorl %edx,%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ roll $8,%edx
+ xorl %esi,%ecx
+ movl 16(%edi),%ebp
+ xorl %eax,%edx
+ xorl %ecx,%eax
+ xorl %ebx,%edx
+ roll $24,%eax
+ xorl %ecx,%ebx
+ xorl %ecx,%edx
+ roll $16,%ebx
+ xorl %eax,%edx
+ roll $8,%ecx
+ xorl %ebx,%edx
+ movl %ebp,%eax
+ xorl %ecx,%edx
+ movl %edx,12(%edi)
+ cmpl 28(%esp),%edi
+ jb .L056permute
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size private_AES_set_decrypt_key,.-.L_private_AES_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89
+.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114
+.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,8,4
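
The .L056permute loop above turns an encryption key schedule into a decryption one by applying InvMixColumns to each round key, four bytes at a time. Its magic constants 2155905152, 4278124286 and 454761243 are 0x80808080, 0xfefefefe and 0x1b1b1b1b: together they implement a GF(2^8) doubling (xtime) on four packed bytes per 32-bit register. A minimal C sketch of that trick, for orientation only (the helper name is ours, not OpenSSL's):

    #include <stdint.h>

    /* Double four GF(2^8) field elements packed into one 32-bit word.
     * Per byte: xtime(x) = (x << 1) ^ (0x1b if x's high bit was set).
     * This mirrors the and/shr/lea/sub/and/xor sequence in .L056permute. */
    static uint32_t xtime4(uint32_t x)
    {
        uint32_t hi = x & 0x80808080u;       /* high bit of each byte */
        uint32_t lo = (x + x) & 0xfefefefeu; /* shift each byte left by one */
        /* hi - (hi >> 7) turns 0x80 into 0x7f in every byte whose high bit
         * was set; masking with 0x1b1b1b1b leaves the AES reduction
         * polynomial 0x1b exactly in those bytes. */
        return lo ^ ((hi - (hi >> 7)) & 0x1b1b1b1bu);
    }

Three chained applications give multiplication by 2, 4 and 8; the loop then assembles InvMixColumns' 9, 11, 13 and 14 multipliers from those with XORs and the roll-by-8/16/24 rotations visible above.
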
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
old mode 100755
new mode 100644
index aab40e6..51b500d
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -39,13 +39,13 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
@@ -223,7 +223,7 @@
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
- # recent µ-archs], but ~5 times smaller!
+ # recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
@@ -562,7 +562,7 @@ ()
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
@@ -2854,12 +2854,12 @@ ()
&set_label("exit");
&function_end("_x86_AES_set_encrypt_key");
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_encrypt_key");
+&function_begin_B("private_AES_set_encrypt_key");
&call ("_x86_AES_set_encrypt_key");
&ret ();
-&function_end_B("AES_set_encrypt_key");
+&function_end_B("private_AES_set_encrypt_key");
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2916,9 +2916,9 @@ ()
&mov (&DWP(4*$i,$key),$tp1);
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_decrypt_key");
+&function_begin_B("private_AES_set_decrypt_key");
&call ("_x86_AES_set_encrypt_key");
&cmp ("eax",0);
&je (&label("proceed"));
@@ -2974,7 +2974,7 @@ ()
&jb (&label("permute"));
&xor ("eax","eax"); # return success
-&function_end("AES_set_decrypt_key");
+&function_end("private_AES_set_decrypt_key");
&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
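
The hunks above rename the key-schedule entry points to private_AES_set_{en,de}crypt_key. The public names are not lost: in OpenSSL 1.0.1 they survive as thin C wrappers in the generic AES glue, so existing callers are unaffected. A hedged sketch of the wrapper shape (the real tree adds FIPS-mode checks; this is illustrative, not the verbatim source):

    #include <openssl/aes.h>

    /* Declarations of the renamed entry points introduced by this patch
     * (normally provided by the OpenSSL headers in FIPS-capable builds). */
    int private_AES_set_encrypt_key(const unsigned char *userKey,
                                    const int bits, AES_KEY *key);
    int private_AES_set_decrypt_key(const unsigned char *userKey,
                                    const int bits, AES_KEY *key);

    /* Illustrative only: the public entry points forward to the private_
     * ones so FIPS-validated builds can interpose their own versions. */
    int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
                            AES_KEY *key)
    {
        return private_AES_set_encrypt_key(userKey, bits, key);
    }

    int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
                            AES_KEY *key)
    {
        return private_AES_set_decrypt_key(userKey, bits, key);
    }
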
diff --git a/crypto/aes/asm/aes-armv4.s b/crypto/aes/asm/aes-armv4.S
similarity index 93%
rename from crypto/aes/asm/aes-armv4.s
rename to crypto/aes/asm/aes-armv4.S
index 27c681c..2697d4c 100644
--- a/crypto/aes/asm/aes-armv4.s
+++ b/crypto/aes/asm/aes-armv4.S
@@ -1,3 +1,4 @@
+#include "arm_arch.h"
.text
.code 32
@@ -118,7 +119,7 @@ AES_encrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -147,10 +148,33 @@ AES_encrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -179,11 +203,15 @@ AES_encrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -223,11 +251,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r6,r9,ror#8
and r9,lr,r2
- eor r1,r1,r4,ror#24
ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8]
+ eor r1,r1,r4,ror#24
+ ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24]
@@ -236,16 +264,16 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r6,r9,ror#16
and r9,lr,r3,lsr#16 @ i2
- eor r2,r2,r5,ror#16
ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0]
+ eor r2,r2,r5,ror#16
+ ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16]
eor r0,r0,r7,ror#24
- ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
- eor r1,r1,r8,ror#16
ldr r7,[r11],#16
+ eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
eor r2,r2,r9,ror#8
ldr r4,[r11,#-12]
eor r3,r3,r6,ror#8
@@ -285,11 +313,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r2
- eor r1,r4,r1,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8]
+ eor r1,r4,r1,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
mov r2,r2,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0]
eor r0,r7,r0,lsl#8
ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24]
@@ -298,15 +326,15 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r3,lsr#16 @ i2
- eor r2,r5,r2,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0]
+ eor r2,r5,r2,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
mov r3,r3,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16]
eor r0,r7,r0,lsl#8
- ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
ldr r7,[r11,#0]
+ ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
eor r1,r1,r8,lsl#8
ldr r4,[r11,#4]
eor r2,r2,r9,lsl#16
@@ -323,10 +351,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -344,12 +373,13 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub r10,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov r12,r0 @ inp
mov lr,r1 @ bits
mov r11,r2 @ key
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -382,6 +412,22 @@ AES_set_encrypt_key:
orr r3,r3,r6,lsl#24
str r2,[r11,#-8]
str r3,[r11,#-4]
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r11],#16
+ str r1,[r11,#-12]
+ str r2,[r11,#-8]
+ str r3,[r11,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -418,6 +464,7 @@ AES_set_encrypt_key:
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#19]
ldrb r4,[r12,#18]
ldrb r5,[r12,#17]
@@ -434,6 +481,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#16]
+ ldr r9,[r12,#20]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -478,6 +535,7 @@ AES_set_encrypt_key:
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#27]
ldrb r4,[r12,#26]
ldrb r5,[r12,#25]
@@ -494,6 +552,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#24]
+ ldr r9,[r12,#28]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
mov r12,#14
str r12,[r11,#240-32]
@@ -558,14 +626,14 @@ AES_set_encrypt_key:
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -639,11 +707,15 @@ AES_set_decrypt_key:
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -758,7 +830,7 @@ AES_decrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -787,10 +859,33 @@ AES_decrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -819,11 +914,15 @@ AES_decrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -863,11 +962,11 @@ _armv4_AES_decrypt:
and r8,lr,r2 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r2,lsr#16
- eor r1,r1,r4,ror#8
ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8]
+ eor r1,r1,r4,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24]
@@ -876,22 +975,22 @@ _armv4_AES_decrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r3 @ i2
- eor r2,r2,r5,ror#8
ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16]
+ eor r2,r2,r5,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0]
eor r0,r0,r7,ror#8
- ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
+ ldr r7,[r11],#16
eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
eor r2,r2,r9,ror#24
- ldr r7,[r11],#16
- eor r3,r3,r6,ror#8
ldr r4,[r11,#-12]
- ldr r5,[r11,#-8]
eor r0,r0,r7
+ ldr r5,[r11,#-8]
+ eor r3,r3,r6,ror#8
ldr r6,[r11,#-4]
and r7,lr,r0,lsr#16
eor r1,r1,r4
@@ -932,11 +1031,11 @@ _armv4_AES_decrypt:
and r7,lr,r2,lsr#8 @ i0
eor r5,r5,r8,lsl#8
and r8,lr,r2 @ i1
- eor r6,r6,r9,lsl#8
ldrb r7,[r10,r7] @ Td4[s2>>8]
+ eor r6,r6,r9,lsl#8
+ ldrb r8,[r10,r8] @ Td4[s2>>0]
and r9,lr,r2,lsr#16
- ldrb r8,[r10,r8] @ Td4[s2>>0]
ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24]
eor r0,r0,r7,lsl#8
ldrb r9,[r10,r9] @ Td4[s2>>16]
@@ -944,11 +1043,11 @@ _armv4_AES_decrypt:
and r7,lr,r3,lsr#16 @ i0
eor r2,r5,r2,lsl#16
and r8,lr,r3,lsr#8 @ i1
- eor r6,r6,r9,lsl#16
ldrb r7,[r10,r7] @ Td4[s3>>16]
+ eor r6,r6,r9,lsl#16
+ ldrb r8,[r10,r8] @ Td4[s3>>8]
and r9,lr,r3 @ i2
- ldrb r8,[r10,r8] @ Td4[s3>>8]
ldrb r9,[r10,r9] @ Td4[s3>>0]
ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24]
eor r0,r0,r7,lsl#16
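
The recurring #if __ARM_ARCH__<7 hunks in this file keep the old byte-at-a-time, endian-neutral loads and stores for ARMv4/v5, and on ARMv7 switch to whole-word ldr/str plus a rev byte swap under __ARMEL__, since v7-class cores handle the word accesses and have the rev instruction. In C the two strategies look roughly like this (a sketch; __builtin_bswap32 stands in for rev):

    #include <stdint.h>
    #include <string.h>

    /* Pre-ARMv7 path: build a big-endian word from individual bytes.
     * Works for any alignment and either endianness, like the
     * ldrb/orr sequences this patch keeps under __ARM_ARCH__<7. */
    static uint32_t load_be32_bytes(const unsigned char *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
    }

    /* ARMv7-class path: one word load, then a byte swap on little-endian
     * targets -- the ldr + rev pairing the patch adds. */
    static uint32_t load_be32_word(const unsigned char *p)
    {
        uint32_t w;
        memcpy(&w, p, 4);               /* compiles to a single word load */
    #ifdef __ARMEL__
        w = __builtin_bswap32(w);       /* compiles to rev */
    #endif
        return w;
    }
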
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index c51ee1f..86b86c4 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -27,6 +27,11 @@
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~21.5 cycles per byte.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -46,6 +51,7 @@
$rounds="r12";
$code=<<___;
+#include "arm_arch.h"
.text
.code 32
@@ -166,7 +172,7 @@
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -195,10 +201,33 @@
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -227,11 +256,15 @@
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
- eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
+ eor $s1,$s1,$t1,ror#24
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@@ -284,16 +317,16 @@
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
+ eor $s2,$s2,$t2,ror#16
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24
- ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
- eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
+ eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
- eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
+ eor $s1,$t1,$s1,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@@ -346,15 +379,15 @@
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
+ eor $s2,$t2,$s2,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8
- ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
+ ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
@@ -371,10 +404,11 @@
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -392,12 +426,13 @@
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov $rounds,r0 @ inp
mov lr,r1 @ bits
mov $key,r2 @ key
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -430,6 +465,22 @@
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$key],#16
+ str $s1,[$key,#-12]
+ str $s2,[$key,#-8]
+ str $s3,[$key,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -466,6 +517,7 @@
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17]
@@ -482,6 +534,16 @@
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#16]
+ ldr $i3,[$rounds,#20]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -526,6 +588,7 @@
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25]
@@ -542,6 +605,16 @@
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#24]
+ ldr $i3,[$rounds,#28]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
mov $rounds,#14
str $rounds,[$key,#240-32]
@@ -606,14 +679,14 @@
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -692,11 +765,15 @@
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -811,7 +888,7 @@
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -840,10 +917,33 @@
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -872,11 +972,15 @@
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -916,11 +1020,11 @@
and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
- eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
+ eor $s1,$s1,$t1,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@@ -929,22 +1033,22 @@
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
- eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
+ eor $s2,$s2,$t2,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8
- ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
+ ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
- ldr $i1,[$key],#16
- eor $s3,$s3,$t3,ror#8
ldr $t1,[$key,#-12]
- ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
+ ldr $t2,[$key,#-8]
+ eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1
@@ -985,11 +1089,11 @@
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
- eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
+ eor $t3,$t3,$i3,lsl#8
+ ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@@ -997,11 +1101,11 @@
and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
- eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
+ eor $t3,$t3,$i3,lsl#16
+ ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
- ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
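
For all the per-architecture churn in this patch, the public one-block API is untouched; key setup still goes through the public names, which now land in the private_ routines underneath. A minimal round-trip caller (the key and plaintext here are arbitrary test values):

    #include <string.h>
    #include <openssl/aes.h>

    int main(void)
    {
        static const unsigned char key[16] = "0123456789abcdef"; /* test key */
        unsigned char in[16] = "one block of in";                /* 16 bytes */
        unsigned char out[16], back[16];
        AES_KEY enc, dec;

        /* Public entry points; the patch reroutes them internally to
         * private_AES_set_{en,de}crypt_key. Both return 0 on success. */
        if (AES_set_encrypt_key(key, 128, &enc) != 0 ||
            AES_set_decrypt_key(key, 128, &dec) != 0)
            return 1;

        AES_encrypt(in, out, &enc);   /* exactly one 16-byte block */
        AES_decrypt(out, back, &dec);

        return memcmp(in, back, 16) != 0;  /* 0 on a clean round trip */
    }
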
diff --git a/crypto/aes/asm/aes-mips.s b/crypto/aes/asm/aes-mips.S
similarity index 98%
rename from crypto/aes/asm/aes-mips.s
rename to crypto/aes/asm/aes-mips.S
index 0c1d85a..f5750bf 100644
--- a/crypto/aes/asm/aes-mips.s
+++ b/crypto/aes/asm/aes-mips.S
@@ -259,6 +259,7 @@ AES_encrypt:
.frame $29,64,$31
.mask 3237937152,-4
.set noreorder
+ .cpload $25
sub $29,64
sw $31,64-1*4($29)
sw $30,64-2*4($29)
@@ -270,8 +271,6 @@ AES_encrypt:
sw $18,64-8*4($29)
sw $17,64-9*4($29)
sw $16,64-10*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_encrypt
.set reorder
la $7,AES_Te # PIC-ified 'load address'
@@ -567,6 +566,7 @@ AES_decrypt:
.frame $29,64,$31
.mask 3237937152,-4
.set noreorder
+ .cpload $25
sub $29,64
sw $31,64-1*4($29)
sw $30,64-2*4($29)
@@ -578,8 +578,6 @@ AES_decrypt:
sw $18,64-8*4($29)
sw $17,64-9*4($29)
sw $16,64-10*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_decrypt
.set reorder
la $7,AES_Td # PIC-ified 'load address'
@@ -870,17 +868,16 @@ _mips_AES_set_encrypt_key:
nop
.end _mips_AES_set_encrypt_key
-.globl AES_set_encrypt_key
-.ent AES_set_encrypt_key
-AES_set_encrypt_key:
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
.frame $29,32,$31
.mask 3221225472,-4
.set noreorder
+ .cpload $25
sub $29,32
sw $31,32-1*4($29)
sw $30,32-2*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_set_encrypt_key
.set reorder
la $7,AES_Te # PIC-ified 'load address'
@@ -892,19 +889,18 @@ AES_set_encrypt_key:
lw $30,32-2*4($29)
jr $31
add $29,32
-.end AES_set_encrypt_key
+.end private_AES_set_encrypt_key
.align 5
-.globl AES_set_decrypt_key
-.ent AES_set_decrypt_key
-AES_set_decrypt_key:
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
.frame $29,32,$31
.mask 3221225472,-4
.set noreorder
+ .cpload $25
sub $29,32
sw $31,32-1*4($29)
sw $30,32-2*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_set_decrypt_key
.set reorder
la $7,AES_Te # PIC-ified 'load address'
@@ -1004,7 +1000,7 @@ AES_set_decrypt_key:
lw $30,32-2*4($29)
jr $31
add $29,32
-.end AES_set_decrypt_key
+.end private_AES_set_decrypt_key
.rdata
.align 6
AES_Te:
diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl
index 2ce6def..e523954 100644
--- a/crypto/aes/asm/aes-mips.pl
+++ b/crypto/aes/asm/aes-mips.pl
@@ -1036,9 +1036,9 @@
nop
.end _mips_AES_set_encrypt_key
-.globl AES_set_encrypt_key
-.ent AES_set_encrypt_key
-AES_set_encrypt_key:
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
@@ -1060,7 +1060,7 @@
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Tbl
- .cpsetup $pf,$zero,AES_set_encrypt_key
+ .cpsetup $pf,$zero,private_AES_set_encrypt_key
___
$code.=<<___;
.set reorder
@@ -1083,7 +1083,7 @@
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
-.end AES_set_encrypt_key
+.end private_AES_set_encrypt_key
___
my ($head,$tail)=($inp,$bits);
@@ -1091,9 +1091,9 @@
my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
$code.=<<___;
.align 5
-.globl AES_set_decrypt_key
-.ent AES_set_decrypt_key
-AES_set_decrypt_key:
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
@@ -1115,7 +1115,7 @@
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Tbl
- .cpsetup $pf,$zero,AES_set_decrypt_key
+ .cpsetup $pf,$zero,private_AES_set_decrypt_key
___
$code.=<<___;
.set reorder
@@ -1226,7 +1226,7 @@
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
-.end AES_set_decrypt_key
+.end private_AES_set_decrypt_key
___
}}}
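
The x86 module earlier in this patch also carries AES_cbc_encrypt, with its runtime choice between the fast table path and the compact slow path; again nothing changes for callers. One detail worth showing: AES_cbc_encrypt updates the IV buffer in place, so encrypt and decrypt each need a fresh copy (a sketch with arbitrary test data):

    #include <string.h>
    #include <openssl/aes.h>

    int main(void)
    {
        static const unsigned char key[16] = "0123456789abcdef";
        unsigned char iv0[16] = {0};                 /* fixed IV for the demo */
        unsigned char iv[16];
        unsigned char pt[32] = "two blocks of plaintext here..."; /* 32 bytes */
        unsigned char ct[32], out[32];
        AES_KEY enc, dec;

        AES_set_encrypt_key(key, 128, &enc);
        AES_set_decrypt_key(key, 128, &dec);

        memcpy(iv, iv0, 16);                         /* IV is clobbered... */
        AES_cbc_encrypt(pt, ct, 32, &enc, iv, AES_ENCRYPT);

        memcpy(iv, iv0, 16);                         /* ...so reset it */
        AES_cbc_encrypt(ct, out, 32, &dec, iv, AES_DECRYPT);

        return memcmp(pt, out, 32) != 0;             /* 0 on success */
    }
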
diff --git a/crypto/aes/asm/aes-parisc.pl b/crypto/aes/asm/aes-parisc.pl
new file mode 100644
index 0000000..c36b6a2
--- /dev/null
+++ b/crypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1021 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for PA-RISC.
+#
+# June 2009.
+#
+# The module is mechanical transliteration of aes-sparcv9.pl, but with
+# a twist: S-boxes are compressed even further down to 1K+256B. On
+# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
+# is about 33 cycles per byte processed with 128-bit key. Newer CPUs
+# perform at 16 cycles per byte. It's not faster than code generated
+# by vendor compiler, but recall that it has compressed S-boxes, which
+# requires extra processing.
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+ # [+ argument transfer]
+$inp="%r26"; # arg0
+$out="%r25"; # arg1
+$key="%r24"; # arg2
+
+($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
+($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
+
+($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
+ $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
+("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
+"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
+
+$tbl="%r28";
+$rounds="%r29";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 64
+AES_encrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$enc_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$enc_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$enc_inp_aligned
+ bl _parisc_AES_encrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$enc_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$enc_done
+ stb $s3,15($out)
+
+L\$enc_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$enc_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_encrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 12($key),$t3
+ _srm $s0,24,$acc0
+ xor $t1,$s1,$s1
+ ldw 16($key),$t0
+ _srm $s1,16,$acc1
+ xor $t2,$s2,$s2
+ ldw 20($key),$t1
+ xor $t3,$s3,$s3
+ ldw 24($key),$t2
+ ldw 28($key),$t3
+L\$enc_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$enc_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch te4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch te4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch te4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch te4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch te4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch te4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch te4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch te4
+ _srm $s1,16,$acc1
+ xor $acc14,$s3,$s3
+ b L\$enc_loop
+ xor $acc15,$s3,$s3
+
+ .ALIGN 16
+L\$enc_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t2,16,$acc5
+ _srm $t3,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t3,16,$acc9
+ _srm $t0,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t0,16,$acc13
+ _srm $t1,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t2,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Te
+ .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+ .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+ .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+ .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+ .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+ .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+ .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+ .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+ .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+ .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+ .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+ .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+ .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+ .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+ .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+ .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+ .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+ .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+ .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+ .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+ .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+ .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+ .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+ .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+ .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+ .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+ .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+ .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+ .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+ .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+ .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+ .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+ .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+ .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+ .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+ .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+ .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+ .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+ .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+ .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+ .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+ .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+ .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+ .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+ .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+ .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+ .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+ .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+ .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+ .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+ .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+ .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+ .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+ .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+ .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+ .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+ .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+ .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+ .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+ .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+ .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+ .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+ .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+ .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+ .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+
+$code.=<<___;
+ .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 16
+AES_decrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$dec_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$dec_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$dec_inp_aligned
+ bl _parisc_AES_decrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$dec_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$dec_done
+ stb $s3,15($out)
+
+L\$dec_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$dec_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_decrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ ldw 12($key),$t3
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 16($key),$t0
+ xor $t1,$s1,$s1
+ ldw 20($key),$t1
+ _srm $s0,24,$acc0
+ xor $t2,$s2,$s2
+ ldw 24($key),$t2
+ xor $t3,$s3,$s3
+ ldw 28($key),$t3
+ _srm $s3,16,$acc1
+L\$dec_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$dec_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch td4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch td4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch td4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch td4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch td4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch td4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch td4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch td4
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+ b L\$dec_loop
+ _srm $s3,16,$acc1
+
+ .ALIGN 16
+L\$dec_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t0,16,$acc5
+ _srm $t1,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t1,16,$acc9
+ _srm $t2,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t2,16,$acc13
+ _srm $t3,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t0,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Td
+ .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+ .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+ .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+ .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+ .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+ .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+ .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+ .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+ .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+ .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+ .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+ .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+ .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+ .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+ .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+ .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+ .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+ .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+ .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+ .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+ .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+ .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+ .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+ .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+ .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+ .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+ .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+ .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+ .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+ .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+ .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+ .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+ .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+ .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+ .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+ .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+ .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+ .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+ .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+ .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+ .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+ .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+ .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+ .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+ .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+ .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+ .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+ .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+ .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+ .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+ .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+ .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+ .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+ .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+ .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+ .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+ .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+ .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+ .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+ .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+ .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+ .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+ .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+ .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+ .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+	.STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+	# translate made-up instructions: _ror, _srm
+ s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
+
+ s/_srm(\s+%r[0-9]+),([0-9]+),/
+ $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
+ : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
+
+ s/,\*/,/ if ($SIZE_T==4);
+ print $_,"\n";
+}
+close STDOUT;
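
The foreach pass above is worth a closer look: _ror and _srm are not real PA-RISC mnemonics but shorthands the generator expands at print time, _srm becoming extru (32-bit) or extrd,u (64-bit) with the bit position counted from the opposite end of the word. A minimal standalone sketch of that expansion, using the same substitution; the sample input line is an illustration, not part of the patch:

    #!/usr/bin/perl
    # Expand the made-up _srm pseudo-instruction the same way the
    # translation loop above does; $SIZE_T and the input line are
    # assumptions for this demo.
    my $SIZE_T = 4;                               # pretend 32-bit build
    my $line = "\t_srm\t%r26,24,%r1\n";
    $line =~ s/_srm(\s+%r[0-9]+),([0-9]+),/
        $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
                   : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
    print $line;                                  # -> extru  %r26,7,8,%r1
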
diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl
index f82c5e1..7c52cbe 100644
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl
@@ -7,7 +7,7 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
#
# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
# February 2010
#
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
# performance improvement on the platform in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process a byte in 18 cycles, managing only 23, because it fails to issue
@@ -33,11 +33,13 @@
if ($flavour =~ /64/) {
$SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
+ $LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
@@ -116,15 +118,19 @@ ()
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
- .space `32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
- addi $Tbl0,$Tbl0,`128-8-32+2048+256`
+ addi $Tbl0,$Tbl0,`128-64-8+2048+256`
mtlr r0
blr
- .space `128-32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `128-64-9*4`
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ ()
.globl .AES_encrypt
.align 7
.AES_encrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@ ()
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Lenc_unaligned
+Lenc_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -363,8 +375,80 @@ ()
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Lenc_done
+
+Lenc_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Lenc_xpage
+ andi. $t1,$t1,4096-16
+ bne Lenc_unaligned_ok
+
+Lenc_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Te
+ bl Lppc_AES_encrypt_compact
+
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+Lenc_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,18 +472,21 @@ ()
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_encrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -413,44 +500,44 @@ ()
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s1,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s2,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s3,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s0,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -466,60 +553,60 @@ ()
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s1,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s2,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc06,$s3,`32-16`,24,31
+ lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc07,$s0,`32-16`,24,31
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc10,$s0,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc00,$Tbl2,$acc00
- lbzx $acc01,$Tbl2,$acc01
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc01,$Tbl2,$acc01
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $s1,$acc01,24,0,7
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $s3,$acc03,24,0,7
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc09,$Tbl2,$acc09
rlwimi $s1,$acc05,16,8,15
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc11,$Tbl2,$acc11
rlwimi $s3,$acc07,16,8,15
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc08,8,16,23
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc09,8,16,23
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc10,8,16,23
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
or $s1,$s1,$acc13
@@ -530,29 +617,31 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_encrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
mtctr $acc00
.align 4
Lenc_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@ ()
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -612,12 +701,12 @@ ()
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -633,32 +722,32 @@ ()
and $acc03,$acc03,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc01,$acc01,$acc09
+ rotlwi $acc12,$s0,16 # ROTATE(r0,16)
xor $acc02,$acc02,$acc10
+ rotlwi $acc13,$s1,16
xor $acc03,$acc03,$acc11
+ rotlwi $acc14,$s2,16
- rotlwi $acc12,$s0,16 # ROTATE(r0,16)
- rotlwi $acc13,$s1,16
- rotlwi $acc14,$s2,16
- rotlwi $acc15,$s3,16
xor $s0,$s0,$acc00 # r0^r2
+ rotlwi $acc15,$s3,16
xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
+ xor $s2,$s2,$acc02
rotrwi $s1,$s1,24
+ xor $s3,$s3,$acc03
rotrwi $s2,$s2,24
- rotrwi $s3,$s3,24
xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
+ rotrwi $s3,$s3,24
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
- rotlwi $acc09,$acc13,8
- rotlwi $acc10,$acc14,8
- rotlwi $acc11,$acc15,8
xor $s0,$s0,$acc12 #
+ rotlwi $acc09,$acc13,8
xor $s1,$s1,$acc13
+ rotlwi $acc10,$acc14,8
xor $s2,$s2,$acc14
+ rotlwi $acc11,$acc15,8
xor $s3,$s3,$acc15
xor $s0,$s0,$acc08 #
xor $s1,$s1,$acc09
@@ -673,14 +762,15 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .AES_decrypt
.align 7
.AES_decrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@ ()
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Ldec_unaligned
+
+Ldec_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -712,8 +809,80 @@ ()
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Ldec_done
+
+Ldec_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Ldec_xpage
+ andi. $t1,$t1,4096-16
+ bne Ldec_unaligned_ok
+
+Ldec_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Td
+ bl Lppc_AES_decrypt_compact
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
+
+Ldec_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@ ()
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_decrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -762,44 +934,44 @@ ()
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s3,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s0,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s1,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s2,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -815,56 +987,56 @@ ()
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s3,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s0,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
lbzx $acc00,$Tbl2,$acc00
+ lwz $acc11,`2048+96`($Tbl0)
lbzx $acc01,$Tbl2,$acc01
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc06,$s1,`32-16`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc07,$s2,`32-16`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl2,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl2,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_decrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
___
$code.=<<___ if ($SIZE_T==8);
insrdi $mask80,$mask80,32,0
@@ -904,10 +1078,10 @@ ()
Ldec_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@ ()
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -1030,12 +1204,12 @@ ()
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@ ()
and $acc06,$acc02,$mask80
and $acc07,$acc03,$mask80
srwi $acc08,$acc04,7 # r1>>7
- srwi $acc09,$acc05,7
- srwi $acc10,$acc06,7
- srwi $acc11,$acc07,7
andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
+ srwi $acc09,$acc05,7
andc $acc13,$acc01,$mask80
+ srwi $acc10,$acc06,7
andc $acc14,$acc02,$mask80
+ srwi $acc11,$acc07,7
andc $acc15,$acc03,$mask80
sub $acc04,$acc04,$acc08 # r1-(r1>>7)
sub $acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@ ()
and $acc08,$acc04,$mask80 # r1=r4&0x80808080
and $acc09,$acc05,$mask80
- and $acc10,$acc06,$mask80
- and $acc11,$acc07,$mask80
srwi $acc12,$acc08,7 # r1>>7
+ and $acc10,$acc06,$mask80
srwi $acc13,$acc09,7
+ and $acc11,$acc07,$mask80
srwi $acc14,$acc10,7
- srwi $acc15,$acc11,7
sub $acc08,$acc08,$acc12 # r1-(r1>>7)
+ srwi $acc15,$acc11,7
sub $acc09,$acc09,$acc13
sub $acc10,$acc10,$acc14
sub $acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@ ()
$code.=<<___;
rotrwi $s0,$s0,8 # = ROTATE(r0,8)
rotrwi $s1,$s1,8
- rotrwi $s2,$s2,8
- rotrwi $s3,$s3,8
xor $s0,$s0,$acc00 # ^= r2^r0
+ rotrwi $s2,$s2,8
xor $s1,$s1,$acc01
+ rotrwi $s3,$s3,8
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
xor $acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@ ()
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
xor $s0,$s0,$acc04 # ^= r4^r0
- xor $s1,$s1,$acc05
- xor $s2,$s2,$acc06
- xor $s3,$s3,$acc07
rotrwi $acc00,$acc00,24
+ xor $s1,$s1,$acc05
rotrwi $acc01,$acc01,24
+ xor $s2,$s2,$acc06
rotrwi $acc02,$acc02,24
+ xor $s3,$s3,$acc07
rotrwi $acc03,$acc03,24
xor $acc04,$acc04,$acc08
xor $acc05,$acc05,$acc09
xor $acc06,$acc06,$acc10
xor $acc07,$acc07,$acc11
xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
- xor $s1,$s1,$acc09
- xor $s2,$s2,$acc10
- xor $s3,$s3,$acc11
rotrwi $acc04,$acc04,16
+ xor $s1,$s1,$acc09
rotrwi $acc05,$acc05,16
+ xor $s2,$s2,$acc10
rotrwi $acc06,$acc06,16
+ xor $s3,$s3,$acc11
rotrwi $acc07,$acc07,16
xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
- xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $acc08,$acc08,8
+ xor $s1,$s1,$acc01
rotrwi $acc09,$acc09,8
+ xor $s2,$s2,$acc02
rotrwi $acc10,$acc10,8
+ xor $s3,$s3,$acc03
rotrwi $acc11,$acc11,8
xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
xor $s1,$s1,$acc05
@@ -1179,7 +1353,9 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
-.long 0
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
___
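
The bulk of this PPC change is the new unaligned-I/O path: a misaligned 16-byte block is still read and written word-wise, presumably safe as long as the accesses stay within one page, and falls back to byte loads/stores only when the block would straddle a 4KB page boundary. The subfic/andi. pairs at Lenc_unaligned and Ldec_unaligned implement that test; a hypothetical Perl restatement of the same predicate (helper name and sample addresses are made up for illustration):

    # Restate the cross-page test: fewer than 16 bytes left in the
    # current 4KB page means a 16-byte block spills onto the next page.
    sub crosses_page {
        my ($addr) = @_;
        my $to_page_end = 4096 - ($addr % 4096);   # bytes left in this page
        return $to_page_end < 16;                  # block would spill over
    }
    printf "0x%x: %s\n", $_, crosses_page($_) ? "byte path" : "word path"
        for (0x10001, 0x10ff9);                    # sample addresses
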
diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl
index 7e01889..e75dcd0 100644
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -44,12 +44,57 @@
# Unlike previous version hardware support detection takes place only
# at the moment of key schedule setup, which is denoted in key->rounds.
# This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives incredible performance improvement,
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
# because software implementation was optimized.
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over the "generic" counter mode routine relying
+# on the single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that the exact throughput depends on the current stack
+# frame alignment within a 4KB page. In the worst case you get ~75% of
+# the maximum, but *on average* as much as ~98%, meaning the worst case
+# is unlikely; it's like hitting a ravine on a plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in a 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement over vanilla km-based code
+# at 8KB block size, and ~37% at a more typical 512-byte block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$softonly=0; # allow hardware support
$t0="%r0"; $mask="%r0";
@@ -69,6 +114,8 @@
$ra="%r14";
$sp="%r15";
+$stdframe=16*$SIZE_T+4*8;
+
sub _data_word()
{ my $i;
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@ ()
.Lesoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -220,20 +267,20 @@ ()
larl $tbl,AES_Te
bras $ra,_s390x_AES_encrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_encrypt,.-AES_encrypt
.type _s390x_AES_encrypt,\@function
.align 16
_s390x_AES_encrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -397,7 +444,7 @@ ()
or $s2,$i3
or $s3,$t3
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
xr $s0,$t0
xr $s1,$t2
x $s2,24($key)
@@ -536,7 +583,7 @@ ()
.Ldsoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -546,20 +593,20 @@ ()
larl $tbl,AES_Td
bras $ra,_s390x_AES_decrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_decrypt,.-AES_decrypt
.type _s390x_AES_decrypt,\@function
.align 16
_s390x_AES_decrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -703,7 +750,7 @@ ()
nr $i1,$mask
nr $i2,$mask
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
or $s1,$t1
l $t0,16($key)
l $t1,20($key)
@@ -732,14 +779,15 @@ ()
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
lghi $t0,0
- clgr $inp,$t0
+ cl${g}r $inp,$t0
je .Lminus1
- clgr $key,$t0
+ cl${g}r $key,$t0
je .Lminus1
lghi $t0,128
@@ -789,7 +837,8 @@ ()
je 1f
lg %r1,24($inp)
stg %r1,24($key)
-1: st $bits,236($key) # save bits
+1: st $bits,236($key) # save bits [for debugging purposes]
+ lgr $t0,%r5
st %r5,240($key) # save km code
lghi %r2,0
br %r14
@@ -797,7 +846,7 @@ ()
$code.=<<___;
.align 16
.Lekey_internal:
- stmg %r6,%r13,48($sp) # all non-volatile regs
+ stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
larl $tbl,AES_Te+2048
@@ -857,8 +906,9 @@ ()
la $key,16($key) # key+=4
la $t3,4($t3) # i++
brct $rounds,.L128_loop
+ lghi $t0,10
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -905,8 +955,9 @@ ()
st $s2,32($key)
st $s3,36($key)
brct $rounds,.L192_continue
+ lghi $t0,12
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -967,8 +1018,9 @@ ()
st $s2,40($key)
st $s3,44($key)
brct $rounds,.L256_continue
+ lghi $t0,14
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -1011,42 +1063,34 @@ ()
.Lminus1:
lghi %r2,-1
br $ra
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
# void AES_set_decrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function
.align 16
-AES_set_decrypt_key:
- stg $key,32($sp) # I rely on AES_set_encrypt_key to
- stg $ra,112($sp) # save non-volatile registers!
- bras $ra,AES_set_encrypt_key
- lg $key,32($sp)
- lg $ra,112($sp)
+private_AES_set_decrypt_key:
+ #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
+ st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
+ bras $ra,_s390x_AES_set_encrypt_key
+ #l${g} $key,4*$SIZE_T($sp)
+ l${g} $ra,14*$SIZE_T($sp)
ltgr %r2,%r2
bnzr $ra
___
$code.=<<___ if (!$softonly);
- l $t0,240($key)
+ #l $t0,240($key)
lhi $t1,16
cr $t0,$t1
jl .Lgo
oill $t0,0x80 # set "decrypt" bit
st $t0,240($key)
br $ra
-
-.align 16
-.Ldkey_internal:
- stg $key,32($sp)
- stg $ra,40($sp)
- bras $ra,.Lekey_internal
- lg $key,32($sp)
- lg $ra,40($sp)
___
$code.=<<___;
-
-.Lgo: llgf $rounds,240($key)
+.align 16
+.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
la $i1,0($key)
sllg $i2,$rounds,4
la $i2,0($i2,$key)
@@ -1123,13 +1167,14 @@ ()
la $key,4($key)
brct $rounds,.Lmix
- lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+ lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
lghi %r2,0
br $ra
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
-#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+########################################################################
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivec, const int enc)
{
@@ -1163,7 +1208,7 @@ ()
l %r0,240($key) # load kmc code
lghi $key,15 # res=len%16, len-=res;
ngr $key,$len
- slgr $len,$key
+ sl${g}r $len,$key
la %r1,16($sp) # parameter block - ivec || key
jz .Lkmc_truncated
.long 0xb92f0042 # kmc %r4,%r2
@@ -1181,34 +1226,34 @@ ()
tmll %r0,0x80
jnz .Lkmc_truncated_dec
lghi %r1,0
- stg %r1,128($sp)
- stg %r1,136($sp)
+ stg %r1,16*$SIZE_T($sp)
+ stg %r1,16*$SIZE_T+8($sp)
bras %r1,1f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
1: ex $key,0(%r1)
la %r1,16($sp) # restore parameter block
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
j .Lkmc_done
.align 16
.Lkmc_truncated_dec:
- stg $out,64($sp)
- la $out,128($sp)
+ st${g} $out,4*$SIZE_T($sp)
+ la $out,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
- lg $out,64($sp)
+ l${g} $out,4*$SIZE_T($sp)
bras %r1,2f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
2: ex $key,0(%r1)
j .Lkmc_done
.align 16
.Lcbc_software:
___
$code.=<<___;
- stmg $key,$ra,40($sp)
+ stm${g} $key,$ra,5*$SIZE_T($sp)
lhi %r0,0
- cl %r0,164($sp)
+ cl %r0,`$stdframe+$SIZE_T-4`($sp)
je .Lcbc_decrypt
larl $tbl,AES_Te
@@ -1219,10 +1264,10 @@ ()
llgf $s3,12($ivp)
lghi $t0,16
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
.Lcbc_enc_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
x $s0,0($inp)
x $s1,4($inp)
x $s2,8($inp)
@@ -1231,7 +1276,7 @@ ()
bras $ra,_s390x_AES_encrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
@@ -1240,33 +1285,33 @@ ()
la $inp,16($inp)
la $out,16($out)
lghi $t0,16
- ltgr $len,$len
+ lt${g}r $len,$len
jz .Lcbc_enc_done
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
j .Lcbc_enc_loop
.align 16
.Lcbc_enc_done:
- lg $ivp,48($sp)
+ l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
- lmg %r7,$ra,56($sp)
+ lm${g} %r7,$ra,7*$SIZE_T($sp)
br $ra
.align 16
.Lcbc_enc_tail:
aghi $len,15
lghi $t0,0
- stg $t0,128($sp)
- stg $t0,136($sp)
+ stg $t0,16*$SIZE_T($sp)
+ stg $t0,16*$SIZE_T+8($sp)
bras $t1,3f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
3: ex $len,0($t1)
lghi $len,0
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
j .Lcbc_enc_loop
.align 16
@@ -1275,10 +1320,10 @@ ()
lg $t0,0($ivp)
lg $t1,8($ivp)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
.Lcbc_dec_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
llgf $s2,8($inp)
@@ -1287,7 +1332,7 @@ ()
bras $ra,_s390x_AES_decrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
sllg $s0,$s0,32
sllg $s2,$s2,32
lr $s0,$s1
@@ -1295,15 +1340,15 @@ ()
lg $t0,0($inp)
lg $t1,8($inp)
- xg $s0,128($sp)
- xg $s2,136($sp)
+ xg $s0,16*$SIZE_T($sp)
+ xg $s2,16*$SIZE_T+8($sp)
lghi $s1,16
- slgr $len,$s1
+ sl${g}r $len,$s1
brc 4,.Lcbc_dec_tail # if borrow
brc 2,.Lcbc_dec_done # if zero
stg $s0,0($out)
stg $s2,8($out)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
la $inp,16($inp)
la $out,16($out)
@@ -1313,7 +1358,7 @@ ()
stg $s0,0($out)
stg $s2,8($out)
.Lcbc_dec_exit:
- lmg $ivp,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
stmg $t0,$t1,0($ivp)
br $ra
@@ -1321,19 +1366,872 @@ ()
.align 16
.Lcbc_dec_tail:
aghi $len,15
- stg $s0,128($sp)
- stg $s2,136($sp)
+ stg $s0,16*$SIZE_T($sp)
+ stg $s2,16*$SIZE_T+8($sp)
bras $s1,4f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
4: ex $len,0($s1)
j .Lcbc_dec_exit
.size AES_cbc_encrypt,.-AES_cbc_encrypt
-.comm OPENSSL_s390xcap_P,8,8
+___
+}
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+# size_t blocks, const AES_KEY *key,
+# const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4"; # blocks and out are swapped
+my $len="%r3";
+my $key="%r5"; my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl AES_ctr32_encrypt
+.type AES_ctr32_encrypt,\@function
+.align 16
+AES_ctr32_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+ llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lctr32_software
+
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+ la %r1,0($key) # %r1 is permanent copy of $key
+ lg $iv0,0($ivp) # load ivec
+ lg $ivp,8($ivp)
+
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
+ st${g} $s2,0($sp) # back-chain
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lctr32_hw_switch # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lctr32_hw_switch:
+___
+$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
+ larl $s0,OPENSSL_s390xcap_P
+ lg $s0,8($s0)
+ tmhh $s0,0x0004 # check for message_security-assist-4
+ jz .Lctr32_km_loop
+
+ llgfr $s0,%r0
+ lgr $s1,%r1
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb92d2042 # kmctr %r4,%r2,%r2
+
+ llihh %r0,0x8000 # check if kmctr supports the function code
+ srlg %r0,%r0,0($s0)
+ ng %r0,16($sp)
+ lgr %r0,$s0
+ lgr %r1,$s1
+ jz .Lctr32_km_loop
+
+####### kmctr code
+ algr $out,$inp # restore $out
+ lgr $s1,$len # $s1 undertakes $len
+ j .Lctr32_kmctr_loop
+.align 16
+.Lctr32_kmctr_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_kmctr_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_kmctr_prepare
+
+ #la $inp,0($inp) # inp
+ sllg $len,$fp,4 # len
+ #la $out,0($out) # out
+ la $s2,16($sp) # iv
+ .long 0xb92da042 # kmctr $out,$s2,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+
+ slgr $s1,$fp
+ brc 1,.Lctr32_kmctr_loop # not zero, no borrow
+ algr $fp,$s1
+ lghi $s1,0
+ brc 4+1,.Lctr32_kmctr_loop # not zero
+
+ l${g} $sp,0($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+___
+$code.=<<___;
+.Lctr32_km_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_km_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_km_prepare
+
+ la $s0,16($sp) # inp
+ sllg $s1,$fp,4 # len
+ la $s2,16($sp) # out
+ .long 0xb92e00a8 # km %r10,%r8
+ brc 1,.-4 # pay attention to "partial completion"
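+ # condition code 3 from km means "partial completion" (the operation
+ # was interrupted part-way through); brc 1 branches on CC=3 and
+ # re-issues the instruction until it reports done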
+
+ la $s2,16($sp)
+ lgr $s3,$fp
+ slgr $s2,$inp
+.Lctr32_km_xor:
+ lg $s0,0($inp)
+ lg $s1,8($inp)
+ xg $s0,0($s2,$inp)
+ xg $s1,8($s2,$inp)
+ stg $s0,0($out,$inp)
+ stg $s1,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lctr32_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lctr32_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lctr32_km_loop # not zero
+
+ l${g} $s0,0($sp)
+ l${g} $s1,$SIZE_T($sp)
+ la $s2,16($sp)
+.Lctr32_km_zap:
+ stg $s0,0($s2)
+ stg $s0,8($s2)
+ la $s2,16($s2)
+ brct $s1,.Lctr32_km_zap
+
+ la $sp,0($s0)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lctr32_software:
+___
+$code.=<<___;
+ stm${g} $key,$ra,5*$SIZE_T($sp)
+ sl${g}r $inp,$out
+ larl $tbl,AES_Te
+ llgf $t1,12($ivp)
+
+.Lctr32_loop:
+ stm${g} $inp,$out,2*$SIZE_T($sp)
+ llgf $s0,0($ivp)
+ llgf $s1,4($ivp)
+ llgf $s2,8($ivp)
+ lgr $s3,$t1
+ st $t1,16*$SIZE_T($sp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_encrypt
+
+ lm${g} $inp,$ivp,2*$SIZE_T($sp)
+ llgf $t1,16*$SIZE_T($sp)
+ x $s0,0($inp,$out)
+ x $s1,4($inp,$out)
+ x $s2,8($inp,$out)
+ x $s3,12($inp,$out)
+ stm $s0,$s3,0($out)
+
+ la $out,16($out)
+ ahi $t1,1 # 32-bit increment
+ brct $len,.Lctr32_loop
+
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4"; # len and out are swapped
+my $len="%r3";
+my $key1="%r5"; # $i1
+my $key2="%r6"; # $i2
+my $fp="%r7"; # $i3
+my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type _s390x_xts_km,\@function
+.align 16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+ llgfr $s0,%r0 # put aside the function code
+ lghi $s1,0x7f
+ nr $s1,%r0
+ lghi %r0,0 # query capability vector
+ la %r1,$tweak-16($sp)
+ .long 0xb92e0042 # km %r4,%r2
+ llihh %r1,0x8000
+ srlg %r1,%r1,32($s1) # check for 32+function code
+ ng %r1,$tweak-16($sp)
+ lgr %r0,$s0 # restore the function code
+ la %r1,0($key1) # restore $key1
+ jz .Lxts_km_vanilla
+
+ lmg $i2,$i3,$tweak($sp) # put aside the tweak value
+ algr $out,$inp
+
+ oill %r0,32 # switch to xts function code
+ aghi $s1,-18 #
+ sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
+ la %r1,$tweak-16($sp)
+ slgr %r1,$s1 # parameter block position
+ lmg $s0,$s3,0($key1) # load 256 bits of key material,
+ stmg $s0,$s3,0(%r1) # and copy it to parameter block.
+ # yes, it contains junk and overlaps
+ # with the tweak in 128-bit case.
+ # it's done to avoid conditional
+ # branch.
+ stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
+
+ .long 0xb92e0042 # km %r4,%r2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ lrvg $s0,$tweak+0($sp) # load the last tweak
+ lrvg $s1,$tweak+8($sp)
+ stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
+
+ nill %r0,0xffdf # switch back to original function code
+ la %r1,0($key1) # restore pointer to $key1
+ slgr $out,$inp
+
+ llgc $len,2*$SIZE_T-1($sp)
+ nill $len,0x0f # $len%=16
+ br $ra
+
+.align 16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16 # guarantee at least 256-byte buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+ # buffer size is at least 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ nill $fp,0xfff0 # round to 16*n
+ st${g} $s2,0($sp) # back-chain
+ nill $len,0xfff0 # redundant
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_go # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+ lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
+ lrvg $s1,$tweak+8($s2)
+
+ la $s2,16($sp) # vector of ascending tweak values
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+ j .Lxts_km_start
+
+.Lxts_km_loop:
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_prepare:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
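+ # the five instructions above advance the XTS tweak: the 128-bit
+ # value (byte-reversed into $s1:$s0, $s1 high) is shifted left one
+ # bit via add-with-carry and, if the bit shifted out was set, reduced
+ # by xoring in 0x87, the GF(2^128) feedback polynomial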
+.Lxts_km_start:
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ stg $i1,0($s2,$inp)
+ stg $i2,8($s2,$inp)
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_prepare
+
+ slgr $inp,$fp # rewind $inp
+ la $s2,0($out,$inp)
+ lgr $s3,$fp
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_xor:
+ lg $i1,0($out,$inp)
+ lg $i2,8($out,$inp)
+ xg $i1,0($s2,$inp)
+ xg $i2,8($s2,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lxts_km_loop # not zero
+
+ l${g} $i1,0($sp) # back-chain
+ llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
+ la $i2,16($sp)
+ srlg $fp,$fp,4
+.Lxts_km_zap:
+ stg $i1,0($i2)
+ stg $i1,8($i2)
+ la $i2,16($i2)
+ brct $fp,.Lxts_km_zap
+
+ la $sp,0($i1)
+ llgc $len,2*$SIZE_T-1($i1)
+ nill $len,0x0f # $len%=16
+ bzr $ra
+
+ # generate one more tweak...
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+
+ ltr $len,$len # clear zero flag
+ br $ra
+.size _s390x_xts_km,.-_s390x_xts_km
+
+.globl AES_xts_encrypt
+.type AES_xts_encrypt,\@function
+.align 16
+AES_xts_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ srag $len,$len,4 # formally wrong, because it expands
+ # sign byte, but who can afford asking
+ # to process more than 2^63-1 bytes?
+ # I use it, because it sets condition
+ # code...
+ bcr 8,$ra # abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_enc_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ sllg $len,$len,4 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed anymore
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+ bras $ra,_s390x_xts_km
+ jz .Lxts_enc_km_done
+
+ aghi $inp,-16 # take one step back
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_enc_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_km_steal
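+ # classic XTS ciphertext stealing: the $len%16 tail bytes of input
+ # displace the head of the last full ciphertext block, the displaced
+ # bytes become the short final output block, and the rebuilt block
+ # is encrypted once more below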
+
+ la $s2,0($i3)
+ lghi $s3,16
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($s2)
+ xg $i2,8($s2)
+ stg $i1,0($s2)
+ stg $i2,8($s2)
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($i3)
+ xg $i2,8($i3)
+ stg $i1,0($i3)
+ stg $i2,8($i3)
+
+.Lxts_enc_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_enc_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ j .Lxts_enc_enter
+
+.align 16
+.Lxts_enc_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+ la $inp,16($inp) # $inp+=16
+.Lxts_enc_enter:
+ x $s0,0($inp) # ^=*($inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ brct${g} $len,.Lxts_enc_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_enc_done
+
+ la $i3,0($inp,$out) # put aside real $out
+.Lxts_enc_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_steal
+ la $out,0($i3) # restore real $out
+
+ # generate last tweak...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($out) # ^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,`$tweak+0`($sp) # ^=tweak
+ x $s1,`$tweak+4`($sp)
+ x $s2,`$tweak+8`($sp)
+ x $s3,`$tweak+12`($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+.Lxts_enc_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+$code.=<<___;
+.globl AES_xts_decrypt
+.type AES_xts_decrypt,\@function
+.align 16
+AES_xts_decrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ aghi $len,-16
+ bcr 4,$ra # abort if less than zero. formally
+ # wrong, because $len is unsigned,
+ # but who can afford asking to
+ # process more than 2^63-1 bytes?
+ tmll $len,0x0f
+ jnz .Lxts_dec_proceed
+ aghi $len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_dec_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ nill $len,0xfff0 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed past this point
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+
+ ltgr $len,$len
+ jz .Lxts_dec_km_short
+ bras $ra,_s390x_xts_km
+ jz .Lxts_dec_km_done
+
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+ j .Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%=16
+ lrvg $s0,$tweak+0($sp) # load the tweak
+ lrvg $s1,$tweak+8($sp)
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+
+.Lxts_dec_km_2ndtweak:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $i2,0($out,$inp)
+ lghi $i3,16
+ .long 0xb92e0066 # km $i2,$i2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0
+ lrvgr $i2,$s1
+ xg $i1,0($out,$inp)
+ xg $i2,8($out,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_km_steal
+
+ lgr $s0,$s2
+ lgr $s1,$s3
+ xg $s0,0($i3)
+ xg $s1,8($i3)
+ stg $s0,0($i3)
+ stg $s1,8($i3)
+ la $s0,0($i3)
+ lghi $s1,16
+ .long 0xb92e0088 # km $s0,$s0
+ brc 1,.-4 # can this happen?
+ xg $s2,0($i3)
+ xg $s3,8($i3)
+ stg $s2,0($i3)
+ stg $s3,8($i3)
+.Lxts_dec_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_dec_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ srlg $len,$len,4
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ larl $tbl,AES_Td
+ lt${g}r $len,$len
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ jz .Lxts_dec_short
+ j .Lxts_dec_enter
+
+.align 16
+.Lxts_dec_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+.Lxts_dec_enter:
+ x $s0,0($inp) # tweak^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ la $inp,16($inp)
+ brct${g} $len,.Lxts_dec_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_dec_done
+
+ # generate pair of tweaks...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $i2,$s1 # flip byte order
+ lrvgr $i3,$s3
+ stmg $i2,$i3,$tweak($sp) # save the 1st tweak
+ j .Lxts_dec_2ndtweak
+
+.align 16
+.Lxts_dec_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak-16+0($sp) # save the 2nd tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak-16+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($inp) # tweak_the_2nd^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
+ x $s1,$tweak-16+4($sp)
+ x $s2,$tweak-16+8($sp)
+ x $s3,$tweak-16+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_steal
+ la $out,0($i3) # restore real $out
+
+ lm $s0,$s3,$tweak($sp) # load the 1st tweak
+ x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+ stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
+ stg $sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_decrypt,.-AES_xts_decrypt
___
}
$code.=<<___;
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
+close STDOUT; # force flush
diff --git a/crypto/aes/asm/aes-sparcv9.pl b/crypto/aes/asm/aes-sparcv9.pl
index c57b3a2..403c4d1 100755
--- a/crypto/aes/asm/aes-sparcv9.pl
+++ b/crypto/aes/asm/aes-sparcv9.pl
@@ -1176,6 +1176,7 @@ ()
# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
# undesired effect, so just omit them and sacrifice some portion of
# percent in performance...
-$code =~ s/fmovs.*$//gem;
+$code =~ s/fmovs.*$//gm;
print $code;
+close STDOUT; # ensure flush
diff --git a/crypto/aes/asm/aes-x86_64.S b/crypto/aes/asm/aes-x86_64.S
new file mode 100644
index 0000000..e385566
--- /dev/null
+++ b/crypto/aes/asm/aes-x86_64.S
@@ -0,0 +1,2541 @@
+.text
+.type _x86_64_AES_encrypt,@function
+.align 16
+_x86_64_AES_encrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Lenc_loop
+.align 16
+.Lenc_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ shrl $16,%ecx
+ movzbl %ah,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movl 12(%r15),%edx
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rdi,8),%r12d
+ xorl 1(%r14,%rbp,8),%r8d
+
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Lenc_loop
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl 2(%r14,%rsi,8),%r10d
+ movzbl 2(%r14,%rdi,8),%r11d
+ movzbl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl 2(%r14,%rsi,8),%r8d
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $65280,%edi
+ andl $65280,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%ecx
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ shrl $16,%edx
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+
+ andl $65280,%esi
+ andl $65280,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $16711680,%edi
+ andl $16711680,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 2(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $4278190080,%edi
+ andl $4278190080,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movl 16+12(%r15),%edx
+ movl 2(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 16+0(%r15),%eax
+
+ andl $4278190080,%esi
+ andl $4278190080,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
+.type _x86_64_AES_encrypt_compact,@function
+.align 16
+_x86_64_AES_encrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %dh,%ebp
+ movzbl %ah,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %dh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ah,%edi
+ shrl $8,%ecx
+ shrl $8,%ebx
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rcx,1),%edx
+ movzbl (%r14,%rbx,1),%ecx
+ shll $16,%r9d
+ shll $16,%r13d
+ shll $16,%ebp
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%edi
+ shll $24,%edx
+ xorl %esi,%r10d
+ shll $24,%ecx
+ xorl %edi,%r11d
+ movl %r10d,%eax
+ movl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Lenc_compact_done
+ movl %eax,%esi
+ movl %ebx,%edi
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ movl %esi,%r10d
+ movl %edi,%r11d
+ shrl $7,%r10d
+ leal (%rax,%rax,1),%r8d
+ shrl $7,%r11d
+ leal (%rbx,%rbx,1),%r9d
+ subl %r10d,%esi
+ subl %r11d,%edi
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %eax,%r10d
+ movl %ebx,%r11d
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %ecx,%esi
+ movl %edx,%edi
+ roll $24,%eax
+ roll $24,%ebx
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %esi,%r12d
+ movl %edi,%ebp
+ rorl $16,%r10d
+ rorl $16,%r11d
+ shrl $7,%r12d
+ leal (%rcx,%rcx,1),%r8d
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ shrl $7,%ebp
+ leal (%rdx,%rdx,1),%r9d
+ rorl $8,%r10d
+ rorl $8,%r11d
+ subl %r12d,%esi
+ subl %ebp,%edi
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %ecx,%r12d
+ movl %edx,%ebp
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ roll $24,%ecx
+ roll $24,%edx
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ movl 0(%r14),%esi
+ rorl $16,%r12d
+ rorl $16,%ebp
+ movl 64(%r14),%edi
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ movl 128(%r14),%r8d
+ rorl $8,%r12d
+ rorl $8,%ebp
+ movl 192(%r14),%r9d
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
+AES_encrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
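+ /* the sequence above aligns %rsp to 64 bytes and then biases it by a
+    key-address-dependent multiple of 64 (0..960), apparently so the
+    stack frame and the key schedule map onto different cache sets;
+    %r10 keeps the caller's %rsp for the epilogue */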
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Lenc_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq .LAES_Te+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
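+ /* %r14 now points at one of the four 256-byte Te4 (S-box) copies
+    replicated after the main table; the offset is derived from the
+    stack address so that the copy in use cannot overlap the frame in
+    the same cache sets */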
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lenc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_encrypt,.-AES_encrypt
+.type _x86_64_AES_decrypt,@function
+.align 16
+_x86_64_AES_decrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Ldec_loop
+.align 16
+.Ldec_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl %ch,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ movl 12(%r15),%edx
+ movzbl %ah,%ebp
+ xorl 1(%r14,%rsi,8),%r12d
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rbp,8),%r8d
+
+ xorl %r10d,%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r12d,%ecx
+ xorl %r11d,%ebx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Ldec_loop
+ leaq 2048(%r14),%r14
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl (%r14,%rsi,1),%r10d
+ movzbl (%r14,%rdi,1),%r11d
+ movzbl (%r14,%rbp,1),%r12d
+
+ movzbl %dl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movzbl (%r14,%rsi,1),%r8d
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $8,%edi
+ shll $8,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%edx
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ shrl $16,%eax
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+
+ shll $8,%esi
+ shll $8,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $16,%edi
+ shll $16,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $24,%edi
+ shll $24,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movl 16+12(%r15),%edx
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movl 16+0(%r15),%eax
+
+ shll $24,%esi
+ shll $24,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ leaq -2048(%r14),%r14
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
+.type _x86_64_AES_decrypt_compact,@function
+.align 16
+_x86_64_AES_decrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Ldec_loop_compact
+
+.align 16
+.Ldec_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %bh,%ebp
+ movzbl %ch,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %bh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ch,%edi
+ shll $16,%r9d
+ shll $16,%r13d
+ movzbl (%r14,%rdi,1),%ebx
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+
+ movzbl %dh,%edi
+ shrl $8,%eax
+ shll $16,%ebp
+ movzbl (%r14,%rdi,1),%ecx
+ movzbl (%r14,%rax,1),%edx
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%ebx
+ shll $24,%ecx
+ xorl %esi,%r10d
+ shll $24,%edx
+ xorl %r11d,%ebx
+ movl %r10d,%eax
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Ldec_compact_done
+
+ movq 256+0(%r14),%rsi
+ shlq $32,%rbx
+ shlq $32,%rdx
+ movq 256+8(%r14),%rdi
+ orq %rbx,%rax
+ orq %rdx,%rcx
+ movq 256+16(%r14),%rbp
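+ /* %rsi/%rdi/%rbp are loaded with what appear to be packed
+    0x8080../0xfefe../0x1b1b.. constants stored after the inverse
+    S-box; the shift/mask chains below use them to compute xtime
+    (multiplication by 2 in GF(2^8)) on eight bytes at a time, i.e.
+    InvMixColumns without table lookups */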
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq 0(%r14),%rsi
+ shrq $32,%r8
+ shrq $32,%r11
+ movq 64(%r14),%rdi
+ roll $16,%r9d
+ roll $16,%r12d
+ movq 128(%r14),%rbp
+ roll $16,%r8d
+ roll $16,%r11d
+ movq 192(%r14),%r10
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+ movq 256(%r14),%r13
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ jmp .Ldec_loop_compact
+.align 16
+.Ldec_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
+AES_decrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Ldec_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq .LAES_Td+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+ shrq $3,%rbp
+ addq %rbp,%r14
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Ldec_epilogue:
+ .byte 0xf3,0xc3
+.size AES_decrypt,.-AES_decrypt
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $8,%rsp
+.Lenc_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Lenc_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+
+.type _x86_64_AES_set_encrypt_key,@function
+.align 16
+_x86_64_AES_set_encrypt_key:
+ movl %esi,%ecx
+ movq %rdi,%rsi
+ movq %rdx,%rdi
+
+ testq $-1,%rsi
+ jz .Lbadpointer
+ testq $-1,%rdi
+ jz .Lbadpointer
+
+ leaq .LAES_Te(%rip),%rbp
+ leaq 2048+128(%rbp),%rbp
+
+
+ movl 0-128(%rbp),%eax
+ movl 32-128(%rbp),%ebx
+ movl 64-128(%rbp),%r8d
+ movl 96-128(%rbp),%edx
+ movl 128-128(%rbp),%eax
+ movl 160-128(%rbp),%ebx
+ movl 192-128(%rbp),%r8d
+ movl 224-128(%rbp),%edx
+
+ cmpl $128,%ecx
+ je .L10rounds
+ cmpl $192,%ecx
+ je .L12rounds
+ cmpl $256,%ecx
+ je .L14rounds
+ movq $-2,%rax
+ jmp .Lexit
+
+.L10rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rdx,8(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L10shortcut
+.align 4
+.L10loop:
+ movl 0(%rdi),%eax
+ movl 12(%rdi),%edx
+.L10shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
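+ /* the four byte lookups above apply RotWord+SubWord to the previous
+    round-key word (%rbp points 128 bytes into the S-box so disp8
+    addressing reaches all 256 entries); the xorl just above folds in
+    the round constant, stored past the replicated S-box copies */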
+ movl %eax,16(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,20(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,24(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,28(%rdi)
+ addl $1,%ecx
+ leaq 16(%rdi),%rdi
+ cmpl $10,%ecx
+ jl .L10loop
+
+ movl $10,80(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L12rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rdx,16(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L12shortcut
+.align 4
+.L12loop:
+ movl 0(%rdi),%eax
+ movl 20(%rdi),%edx
+.L12shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,24(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,28(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,32(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,36(%rdi)
+
+ cmpl $7,%ecx
+ je .L12break
+ addl $1,%ecx
+
+ xorl 16(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ leaq 24(%rdi),%rdi
+ jmp .L12loop
+.L12break:
+ movl $12,72(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L14rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rcx
+ movq 24(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L14shortcut
+.align 4
+.L14loop:
+ movl 0(%rdi),%eax
+ movl 28(%rdi),%edx
+.L14shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,32(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,36(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ cmpl $6,%ecx
+ je .L14break
+ addl $1,%ecx
+
+ movl %eax,%edx
+ movl 16(%rdi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movl %eax,48(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,52(%rdi)
+ xorl 24(%rdi),%eax
+ movl %eax,56(%rdi)
+ xorl 28(%rdi),%eax
+ movl %eax,60(%rdi)
+
+ leaq 32(%rdi),%rdi
+ jmp .L14loop
+.L14break:
+ movl $14,48(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.Lbadpointer:
+ movq $-1,%rax
+.Lexit:
+.byte 0xf3,0xc3
+.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdx
+.Ldec_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+ movq (%rsp),%r8
+ cmpl $0,%eax
+ jne .Labort
+
+ movl 240(%r8),%r14d
+ xorq %rdi,%rdi
+ leaq (%rdi,%r14,4),%rcx
+ movq %r8,%rsi
+ leaq (%r8,%rcx,4),%rdi
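+ /* %r14d holds the round count, so %rdi ends up at schedule+rounds*16,
+    the last round key; .Linvert below swaps the round keys
+    end-for-end, the first step of turning the encryption schedule
+    into a decryption one */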
+.align 4
+.Linvert:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 0(%rdi),%rcx
+ movq 8(%rdi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,0(%rsi)
+ movq %rdx,8(%rsi)
+ leaq 16(%rsi),%rsi
+ leaq -16(%rdi),%rdi
+ cmpq %rsi,%rdi
+ jne .Linvert
+
+ leaq .LAES_Te+2048+1024(%rip),%rax
+
+ movq 40(%rax),%rsi
+ movq 48(%rax),%rdi
+ movq 56(%rax),%rbp
+
+ movq %r8,%r15
+ subl $1,%r14d
+.align 4
+.Lpermute:
+ leaq 16(%r15),%r15
+ movq 0(%r15),%rax
+ movq 8(%r15),%rcx
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+
+ shrq $32,%r8
+ shrq $32,%r11
+
+ roll $16,%r9d
+ roll $16,%r12d
+
+ roll $16,%r8d
+ roll $16,%r11d
+
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ subl $1,%r14d
+ jnz .Lpermute
+
+ xorq %rax,%rax
+.Labort:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Ldec_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
+AES_cbc_encrypt:
+ cmpq $0,%rdx
+ je .Lcbc_epilogue
+ pushfq
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.Lcbc_prologue:
+
+ cld
+ movl %r9d,%r9d
+
+ leaq .LAES_Te(%rip),%r14
+ cmpq $0,%r9
+ jne .Lcbc_picked_te
+ leaq .LAES_Td(%rip),%r14
+.Lcbc_picked_te:
+
+ movl OPENSSL_ia32cap_P(%rip),%r10d
+ cmpq $512,%rdx
+ jb .Lcbc_slow_prologue
+ testq $15,%rdx
+ jnz .Lcbc_slow_prologue
+ btl $28,%r10d
+ jc .Lcbc_slow_prologue
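+ /* the fast path (full 2KB tables) is taken only for inputs of at
+    least 512 bytes that are a multiple of 16, and only if ia32cap
+    bit 28 (the CPUID HTT flag) is clear -- with hyper-threading a
+    sibling thread could in principle mount a cache-timing attack on
+    the large tables */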
+
+
+ leaq -88-248(%rsp),%r15
+ andq $-64,%r15
+
+
+ movq %r14,%r10
+ leaq 2304(%r14),%r11
+ movq %r15,%r12
+ andq $4095,%r10
+ andq $4095,%r11
+ andq $4095,%r12
+
+ cmpq %r11,%r12
+ jb .Lcbc_te_break_out
+ subq %r11,%r12
+ subq %r12,%r15
+ jmp .Lcbc_te_ok
+.Lcbc_te_break_out:
+ subq %r10,%r12
+ andq $4095,%r12
+ addq $320,%r12
+ subq %r12,%r15
+.align 4
+.Lcbc_te_ok:
+
+ xchgq %rsp,%r15
+
+ movq %r15,16(%rsp)
+.Lcbc_fast_body:
+ movq %rdi,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rdx,40(%rsp)
+ movq %rcx,48(%rsp)
+ movq %r8,56(%rsp)
+ movl $0,80+240(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+
+ movl 240(%r15),%eax
+
+ movq %r15,%r10
+ subq %r14,%r10
+ andq $4095,%r10
+ cmpq $2304,%r10
+ jb .Lcbc_do_ecopy
+ cmpq $4096-248,%r10
+ jb .Lcbc_skip_ecopy
+.align 4
+.Lcbc_do_ecopy:
+ movq %r15,%rsi
+ leaq 80(%rsp),%rdi
+ leaq 80(%rsp),%r15
+ movl $30,%ecx
+.long 0x90A548F3
+ movl %eax,(%rdi)
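+ /* the .long above encodes f3 48 a5 90, i.e. "rep movsq" followed by
+    a nop: with %ecx=30 it copies the 240-byte key schedule onto the
+    stack, after which the movl stores the round count at offset 240 */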
+.Lcbc_skip_ecopy:
+ movq %r15,0(%rsp)
+
+ movl $18,%ecx
+.align 4
+.Lcbc_prefetch_te:
+ movq 0(%r14),%r10
+ movq 32(%r14),%r11
+ movq 64(%r14),%r12
+ movq 96(%r14),%r13
+ leaq 128(%r14),%r14
+ subl $1,%ecx
+ jnz .Lcbc_prefetch_te
+ leaq -2304(%r14),%r14
+
+ cmpq $0,%rbx
+ je .LFAST_DECRYPT
+
+
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+.align 4
+.Lcbc_fast_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_encrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ movq %r10,40(%rsp)
+ jnz .Lcbc_fast_enc_loop
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_fast_cleanup
+
+
+.align 16
+.LFAST_DECRYPT:
+ cmpq %r8,%r9
+ je .Lcbc_fast_dec_in_place
+
+ movq %rbp,64(%rsp)
+.align 4
+.Lcbc_fast_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 64(%rsp),%rbp
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0(%rbp),%eax
+ xorl 4(%rbp),%ebx
+ xorl 8(%rbp),%ecx
+ xorl 12(%rbp),%edx
+ movq %r8,%rbp
+
+ subq $16,%r10
+ movq %r10,40(%rsp)
+ movq %rbp,64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jnz .Lcbc_fast_dec_loop
+ movq 56(%rsp),%r12
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0(%r12)
+ movq %r11,8(%r12)
+ jmp .Lcbc_fast_cleanup
+
+.align 16
+.Lcbc_fast_dec_in_place:
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0+64(%rsp)
+ movq %r11,8+64(%rsp)
+.align 4
+.Lcbc_fast_dec_in_place_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jz .Lcbc_fast_dec_in_place_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ movq %r10,40(%rsp)
+ jmp .Lcbc_fast_dec_in_place_loop
+.Lcbc_fast_dec_in_place_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+.align 4
+.Lcbc_fast_cleanup:
+ cmpl $0,80+240(%rsp)
+ leaq 80(%rsp),%rdi
+ je .Lcbc_exit
+ movl $30,%ecx
+ xorq %rax,%rax
+.long 0x90AB48F3
+
+ jmp .Lcbc_exit
+
+
+.align 16
+.Lcbc_slow_prologue:
+
+ leaq -88(%rsp),%rbp
+ andq $-64,%rbp
+
+ leaq -88-63(%rcx),%r10
+ subq %rbp,%r10
+ negq %r10
+ andq $960,%r10
+ subq %r10,%rbp
+
+ xchgq %rsp,%rbp
+
+ movq %rbp,16(%rsp)
+.Lcbc_slow_body:
+
+
+
+
+ movq %r8,56(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+ movq %rdx,%r10
+
+ movl 240(%r15),%eax
+ movq %r15,0(%rsp)
+ shll $4,%eax
+ leaq (%r15,%rax,1),%rax
+ movq %rax,8(%rsp)
+
+
+ leaq 2048(%r14),%r14
+ leaq 768-8(%rsp),%rax
+ subq %r14,%rax
+ andq $768,%rax
+ leaq (%r14,%rax,1),%r14
+
+ cmpq $0,%rbx
+ je .LSLOW_DECRYPT
+
+
+ testq $-16,%r10
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+ jz .Lcbc_slow_enc_tail
+
+.align 4
+.Lcbc_slow_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ jnz .Lcbc_slow_enc_loop
+ testq $15,%r10
+ jnz .Lcbc_slow_enc_tail
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_enc_tail:
+ movq %rax,%r11
+ movq %rcx,%r12
+ movq %r10,%rcx
+ movq %r8,%rsi
+ movq %r9,%rdi
+.long 0x9066A4F3
+ movq $16,%rcx
+ subq %r10,%rcx
+ xorq %rax,%rax
+.long 0x9066AAF3
+ movq %r9,%r8
+ movq $16,%r10
+ movq %r11,%rax
+ movq %r12,%rcx
+ jmp .Lcbc_slow_enc_loop
+
+.align 16
+.LSLOW_DECRYPT:
+ shrq $3,%rax
+ addq %rax,%r14
+
+ movq 0(%rbp),%r11
+ movq 8(%rbp),%r12
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+.align 4
+.Lcbc_slow_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jc .Lcbc_slow_dec_partial
+ jz .Lcbc_slow_dec_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jmp .Lcbc_slow_dec_loop
+.Lcbc_slow_dec_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_dec_partial:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0+64(%rsp)
+ movl %ebx,4+64(%rsp)
+ movl %ecx,8+64(%rsp)
+ movl %edx,12+64(%rsp)
+
+ movq %r9,%rdi
+ leaq 64(%rsp),%rsi
+ leaq 16(%r10),%rcx
+.long 0x9066A4F3
+ jmp .Lcbc_exit
+
+.align 16
+.Lcbc_exit:
+ movq 16(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lcbc_popfq:
+ popfq
+.Lcbc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+.align 64
+.LAES_Te:
+.long 0xa56363c6,0xa56363c6
+.long 0x847c7cf8,0x847c7cf8
+.long 0x997777ee,0x997777ee
+.long 0x8d7b7bf6,0x8d7b7bf6
+.long 0x0df2f2ff,0x0df2f2ff
+.long 0xbd6b6bd6,0xbd6b6bd6
+.long 0xb16f6fde,0xb16f6fde
+.long 0x54c5c591,0x54c5c591
+.long 0x50303060,0x50303060
+.long 0x03010102,0x03010102
+.long 0xa96767ce,0xa96767ce
+.long 0x7d2b2b56,0x7d2b2b56
+.long 0x19fefee7,0x19fefee7
+.long 0x62d7d7b5,0x62d7d7b5
+.long 0xe6abab4d,0xe6abab4d
+.long 0x9a7676ec,0x9a7676ec
+.long 0x45caca8f,0x45caca8f
+.long 0x9d82821f,0x9d82821f
+.long 0x40c9c989,0x40c9c989
+.long 0x877d7dfa,0x877d7dfa
+.long 0x15fafaef,0x15fafaef
+.long 0xeb5959b2,0xeb5959b2
+.long 0xc947478e,0xc947478e
+.long 0x0bf0f0fb,0x0bf0f0fb
+.long 0xecadad41,0xecadad41
+.long 0x67d4d4b3,0x67d4d4b3
+.long 0xfda2a25f,0xfda2a25f
+.long 0xeaafaf45,0xeaafaf45
+.long 0xbf9c9c23,0xbf9c9c23
+.long 0xf7a4a453,0xf7a4a453
+.long 0x967272e4,0x967272e4
+.long 0x5bc0c09b,0x5bc0c09b
+.long 0xc2b7b775,0xc2b7b775
+.long 0x1cfdfde1,0x1cfdfde1
+.long 0xae93933d,0xae93933d
+.long 0x6a26264c,0x6a26264c
+.long 0x5a36366c,0x5a36366c
+.long 0x413f3f7e,0x413f3f7e
+.long 0x02f7f7f5,0x02f7f7f5
+.long 0x4fcccc83,0x4fcccc83
+.long 0x5c343468,0x5c343468
+.long 0xf4a5a551,0xf4a5a551
+.long 0x34e5e5d1,0x34e5e5d1
+.long 0x08f1f1f9,0x08f1f1f9
+.long 0x937171e2,0x937171e2
+.long 0x73d8d8ab,0x73d8d8ab
+.long 0x53313162,0x53313162
+.long 0x3f15152a,0x3f15152a
+.long 0x0c040408,0x0c040408
+.long 0x52c7c795,0x52c7c795
+.long 0x65232346,0x65232346
+.long 0x5ec3c39d,0x5ec3c39d
+.long 0x28181830,0x28181830
+.long 0xa1969637,0xa1969637
+.long 0x0f05050a,0x0f05050a
+.long 0xb59a9a2f,0xb59a9a2f
+.long 0x0907070e,0x0907070e
+.long 0x36121224,0x36121224
+.long 0x9b80801b,0x9b80801b
+.long 0x3de2e2df,0x3de2e2df
+.long 0x26ebebcd,0x26ebebcd
+.long 0x6927274e,0x6927274e
+.long 0xcdb2b27f,0xcdb2b27f
+.long 0x9f7575ea,0x9f7575ea
+.long 0x1b090912,0x1b090912
+.long 0x9e83831d,0x9e83831d
+.long 0x742c2c58,0x742c2c58
+.long 0x2e1a1a34,0x2e1a1a34
+.long 0x2d1b1b36,0x2d1b1b36
+.long 0xb26e6edc,0xb26e6edc
+.long 0xee5a5ab4,0xee5a5ab4
+.long 0xfba0a05b,0xfba0a05b
+.long 0xf65252a4,0xf65252a4
+.long 0x4d3b3b76,0x4d3b3b76
+.long 0x61d6d6b7,0x61d6d6b7
+.long 0xceb3b37d,0xceb3b37d
+.long 0x7b292952,0x7b292952
+.long 0x3ee3e3dd,0x3ee3e3dd
+.long 0x712f2f5e,0x712f2f5e
+.long 0x97848413,0x97848413
+.long 0xf55353a6,0xf55353a6
+.long 0x68d1d1b9,0x68d1d1b9
+.long 0x00000000,0x00000000
+.long 0x2cededc1,0x2cededc1
+.long 0x60202040,0x60202040
+.long 0x1ffcfce3,0x1ffcfce3
+.long 0xc8b1b179,0xc8b1b179
+.long 0xed5b5bb6,0xed5b5bb6
+.long 0xbe6a6ad4,0xbe6a6ad4
+.long 0x46cbcb8d,0x46cbcb8d
+.long 0xd9bebe67,0xd9bebe67
+.long 0x4b393972,0x4b393972
+.long 0xde4a4a94,0xde4a4a94
+.long 0xd44c4c98,0xd44c4c98
+.long 0xe85858b0,0xe85858b0
+.long 0x4acfcf85,0x4acfcf85
+.long 0x6bd0d0bb,0x6bd0d0bb
+.long 0x2aefefc5,0x2aefefc5
+.long 0xe5aaaa4f,0xe5aaaa4f
+.long 0x16fbfbed,0x16fbfbed
+.long 0xc5434386,0xc5434386
+.long 0xd74d4d9a,0xd74d4d9a
+.long 0x55333366,0x55333366
+.long 0x94858511,0x94858511
+.long 0xcf45458a,0xcf45458a
+.long 0x10f9f9e9,0x10f9f9e9
+.long 0x06020204,0x06020204
+.long 0x817f7ffe,0x817f7ffe
+.long 0xf05050a0,0xf05050a0
+.long 0x443c3c78,0x443c3c78
+.long 0xba9f9f25,0xba9f9f25
+.long 0xe3a8a84b,0xe3a8a84b
+.long 0xf35151a2,0xf35151a2
+.long 0xfea3a35d,0xfea3a35d
+.long 0xc0404080,0xc0404080
+.long 0x8a8f8f05,0x8a8f8f05
+.long 0xad92923f,0xad92923f
+.long 0xbc9d9d21,0xbc9d9d21
+.long 0x48383870,0x48383870
+.long 0x04f5f5f1,0x04f5f5f1
+.long 0xdfbcbc63,0xdfbcbc63
+.long 0xc1b6b677,0xc1b6b677
+.long 0x75dadaaf,0x75dadaaf
+.long 0x63212142,0x63212142
+.long 0x30101020,0x30101020
+.long 0x1affffe5,0x1affffe5
+.long 0x0ef3f3fd,0x0ef3f3fd
+.long 0x6dd2d2bf,0x6dd2d2bf
+.long 0x4ccdcd81,0x4ccdcd81
+.long 0x140c0c18,0x140c0c18
+.long 0x35131326,0x35131326
+.long 0x2fececc3,0x2fececc3
+.long 0xe15f5fbe,0xe15f5fbe
+.long 0xa2979735,0xa2979735
+.long 0xcc444488,0xcc444488
+.long 0x3917172e,0x3917172e
+.long 0x57c4c493,0x57c4c493
+.long 0xf2a7a755,0xf2a7a755
+.long 0x827e7efc,0x827e7efc
+.long 0x473d3d7a,0x473d3d7a
+.long 0xac6464c8,0xac6464c8
+.long 0xe75d5dba,0xe75d5dba
+.long 0x2b191932,0x2b191932
+.long 0x957373e6,0x957373e6
+.long 0xa06060c0,0xa06060c0
+.long 0x98818119,0x98818119
+.long 0xd14f4f9e,0xd14f4f9e
+.long 0x7fdcdca3,0x7fdcdca3
+.long 0x66222244,0x66222244
+.long 0x7e2a2a54,0x7e2a2a54
+.long 0xab90903b,0xab90903b
+.long 0x8388880b,0x8388880b
+.long 0xca46468c,0xca46468c
+.long 0x29eeeec7,0x29eeeec7
+.long 0xd3b8b86b,0xd3b8b86b
+.long 0x3c141428,0x3c141428
+.long 0x79dedea7,0x79dedea7
+.long 0xe25e5ebc,0xe25e5ebc
+.long 0x1d0b0b16,0x1d0b0b16
+.long 0x76dbdbad,0x76dbdbad
+.long 0x3be0e0db,0x3be0e0db
+.long 0x56323264,0x56323264
+.long 0x4e3a3a74,0x4e3a3a74
+.long 0x1e0a0a14,0x1e0a0a14
+.long 0xdb494992,0xdb494992
+.long 0x0a06060c,0x0a06060c
+.long 0x6c242448,0x6c242448
+.long 0xe45c5cb8,0xe45c5cb8
+.long 0x5dc2c29f,0x5dc2c29f
+.long 0x6ed3d3bd,0x6ed3d3bd
+.long 0xefacac43,0xefacac43
+.long 0xa66262c4,0xa66262c4
+.long 0xa8919139,0xa8919139
+.long 0xa4959531,0xa4959531
+.long 0x37e4e4d3,0x37e4e4d3
+.long 0x8b7979f2,0x8b7979f2
+.long 0x32e7e7d5,0x32e7e7d5
+.long 0x43c8c88b,0x43c8c88b
+.long 0x5937376e,0x5937376e
+.long 0xb76d6dda,0xb76d6dda
+.long 0x8c8d8d01,0x8c8d8d01
+.long 0x64d5d5b1,0x64d5d5b1
+.long 0xd24e4e9c,0xd24e4e9c
+.long 0xe0a9a949,0xe0a9a949
+.long 0xb46c6cd8,0xb46c6cd8
+.long 0xfa5656ac,0xfa5656ac
+.long 0x07f4f4f3,0x07f4f4f3
+.long 0x25eaeacf,0x25eaeacf
+.long 0xaf6565ca,0xaf6565ca
+.long 0x8e7a7af4,0x8e7a7af4
+.long 0xe9aeae47,0xe9aeae47
+.long 0x18080810,0x18080810
+.long 0xd5baba6f,0xd5baba6f
+.long 0x887878f0,0x887878f0
+.long 0x6f25254a,0x6f25254a
+.long 0x722e2e5c,0x722e2e5c
+.long 0x241c1c38,0x241c1c38
+.long 0xf1a6a657,0xf1a6a657
+.long 0xc7b4b473,0xc7b4b473
+.long 0x51c6c697,0x51c6c697
+.long 0x23e8e8cb,0x23e8e8cb
+.long 0x7cdddda1,0x7cdddda1
+.long 0x9c7474e8,0x9c7474e8
+.long 0x211f1f3e,0x211f1f3e
+.long 0xdd4b4b96,0xdd4b4b96
+.long 0xdcbdbd61,0xdcbdbd61
+.long 0x868b8b0d,0x868b8b0d
+.long 0x858a8a0f,0x858a8a0f
+.long 0x907070e0,0x907070e0
+.long 0x423e3e7c,0x423e3e7c
+.long 0xc4b5b571,0xc4b5b571
+.long 0xaa6666cc,0xaa6666cc
+.long 0xd8484890,0xd8484890
+.long 0x05030306,0x05030306
+.long 0x01f6f6f7,0x01f6f6f7
+.long 0x120e0e1c,0x120e0e1c
+.long 0xa36161c2,0xa36161c2
+.long 0x5f35356a,0x5f35356a
+.long 0xf95757ae,0xf95757ae
+.long 0xd0b9b969,0xd0b9b969
+.long 0x91868617,0x91868617
+.long 0x58c1c199,0x58c1c199
+.long 0x271d1d3a,0x271d1d3a
+.long 0xb99e9e27,0xb99e9e27
+.long 0x38e1e1d9,0x38e1e1d9
+.long 0x13f8f8eb,0x13f8f8eb
+.long 0xb398982b,0xb398982b
+.long 0x33111122,0x33111122
+.long 0xbb6969d2,0xbb6969d2
+.long 0x70d9d9a9,0x70d9d9a9
+.long 0x898e8e07,0x898e8e07
+.long 0xa7949433,0xa7949433
+.long 0xb69b9b2d,0xb69b9b2d
+.long 0x221e1e3c,0x221e1e3c
+.long 0x92878715,0x92878715
+.long 0x20e9e9c9,0x20e9e9c9
+.long 0x49cece87,0x49cece87
+.long 0xff5555aa,0xff5555aa
+.long 0x78282850,0x78282850
+.long 0x7adfdfa5,0x7adfdfa5
+.long 0x8f8c8c03,0x8f8c8c03
+.long 0xf8a1a159,0xf8a1a159
+.long 0x80898909,0x80898909
+.long 0x170d0d1a,0x170d0d1a
+.long 0xdabfbf65,0xdabfbf65
+.long 0x31e6e6d7,0x31e6e6d7
+.long 0xc6424284,0xc6424284
+.long 0xb86868d0,0xb86868d0
+.long 0xc3414182,0xc3414182
+.long 0xb0999929,0xb0999929
+.long 0x772d2d5a,0x772d2d5a
+.long 0x110f0f1e,0x110f0f1e
+.long 0xcbb0b07b,0xcbb0b07b
+.long 0xfc5454a8,0xfc5454a8
+.long 0xd6bbbb6d,0xd6bbbb6d
+.long 0x3a16162c,0x3a16162c
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
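+# AES key-schedule round constants (rcon: 0x01..0x36), followed by the
+# 0x80808080/0xfefefefe/0x1b1b1b1b masks used to double four GF(2^8)
+# elements at once (the vectorized "xtime" trick).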
+.long 0x00000001, 0x00000002, 0x00000004, 0x00000008
+.long 0x00000010, 0x00000020, 0x00000040, 0x00000080
+.long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
+.long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
+.align 64
+.LAES_Td:
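+# AES decryption table (Td). As in the tables above, every 32-bit entry
+# is stored twice back to back, which lets byte-offset loads within an
+# 8-byte slot fetch rotated variants of the same table word.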
+.long 0x50a7f451,0x50a7f451
+.long 0x5365417e,0x5365417e
+.long 0xc3a4171a,0xc3a4171a
+.long 0x965e273a,0x965e273a
+.long 0xcb6bab3b,0xcb6bab3b
+.long 0xf1459d1f,0xf1459d1f
+.long 0xab58faac,0xab58faac
+.long 0x9303e34b,0x9303e34b
+.long 0x55fa3020,0x55fa3020
+.long 0xf66d76ad,0xf66d76ad
+.long 0x9176cc88,0x9176cc88
+.long 0x254c02f5,0x254c02f5
+.long 0xfcd7e54f,0xfcd7e54f
+.long 0xd7cb2ac5,0xd7cb2ac5
+.long 0x80443526,0x80443526
+.long 0x8fa362b5,0x8fa362b5
+.long 0x495ab1de,0x495ab1de
+.long 0x671bba25,0x671bba25
+.long 0x980eea45,0x980eea45
+.long 0xe1c0fe5d,0xe1c0fe5d
+.long 0x02752fc3,0x02752fc3
+.long 0x12f04c81,0x12f04c81
+.long 0xa397468d,0xa397468d
+.long 0xc6f9d36b,0xc6f9d36b
+.long 0xe75f8f03,0xe75f8f03
+.long 0x959c9215,0x959c9215
+.long 0xeb7a6dbf,0xeb7a6dbf
+.long 0xda595295,0xda595295
+.long 0x2d83bed4,0x2d83bed4
+.long 0xd3217458,0xd3217458
+.long 0x2969e049,0x2969e049
+.long 0x44c8c98e,0x44c8c98e
+.long 0x6a89c275,0x6a89c275
+.long 0x78798ef4,0x78798ef4
+.long 0x6b3e5899,0x6b3e5899
+.long 0xdd71b927,0xdd71b927
+.long 0xb64fe1be,0xb64fe1be
+.long 0x17ad88f0,0x17ad88f0
+.long 0x66ac20c9,0x66ac20c9
+.long 0xb43ace7d,0xb43ace7d
+.long 0x184adf63,0x184adf63
+.long 0x82311ae5,0x82311ae5
+.long 0x60335197,0x60335197
+.long 0x457f5362,0x457f5362
+.long 0xe07764b1,0xe07764b1
+.long 0x84ae6bbb,0x84ae6bbb
+.long 0x1ca081fe,0x1ca081fe
+.long 0x942b08f9,0x942b08f9
+.long 0x58684870,0x58684870
+.long 0x19fd458f,0x19fd458f
+.long 0x876cde94,0x876cde94
+.long 0xb7f87b52,0xb7f87b52
+.long 0x23d373ab,0x23d373ab
+.long 0xe2024b72,0xe2024b72
+.long 0x578f1fe3,0x578f1fe3
+.long 0x2aab5566,0x2aab5566
+.long 0x0728ebb2,0x0728ebb2
+.long 0x03c2b52f,0x03c2b52f
+.long 0x9a7bc586,0x9a7bc586
+.long 0xa50837d3,0xa50837d3
+.long 0xf2872830,0xf2872830
+.long 0xb2a5bf23,0xb2a5bf23
+.long 0xba6a0302,0xba6a0302
+.long 0x5c8216ed,0x5c8216ed
+.long 0x2b1ccf8a,0x2b1ccf8a
+.long 0x92b479a7,0x92b479a7
+.long 0xf0f207f3,0xf0f207f3
+.long 0xa1e2694e,0xa1e2694e
+.long 0xcdf4da65,0xcdf4da65
+.long 0xd5be0506,0xd5be0506
+.long 0x1f6234d1,0x1f6234d1
+.long 0x8afea6c4,0x8afea6c4
+.long 0x9d532e34,0x9d532e34
+.long 0xa055f3a2,0xa055f3a2
+.long 0x32e18a05,0x32e18a05
+.long 0x75ebf6a4,0x75ebf6a4
+.long 0x39ec830b,0x39ec830b
+.long 0xaaef6040,0xaaef6040
+.long 0x069f715e,0x069f715e
+.long 0x51106ebd,0x51106ebd
+.long 0xf98a213e,0xf98a213e
+.long 0x3d06dd96,0x3d06dd96
+.long 0xae053edd,0xae053edd
+.long 0x46bde64d,0x46bde64d
+.long 0xb58d5491,0xb58d5491
+.long 0x055dc471,0x055dc471
+.long 0x6fd40604,0x6fd40604
+.long 0xff155060,0xff155060
+.long 0x24fb9819,0x24fb9819
+.long 0x97e9bdd6,0x97e9bdd6
+.long 0xcc434089,0xcc434089
+.long 0x779ed967,0x779ed967
+.long 0xbd42e8b0,0xbd42e8b0
+.long 0x888b8907,0x888b8907
+.long 0x385b19e7,0x385b19e7
+.long 0xdbeec879,0xdbeec879
+.long 0x470a7ca1,0x470a7ca1
+.long 0xe90f427c,0xe90f427c
+.long 0xc91e84f8,0xc91e84f8
+.long 0x00000000,0x00000000
+.long 0x83868009,0x83868009
+.long 0x48ed2b32,0x48ed2b32
+.long 0xac70111e,0xac70111e
+.long 0x4e725a6c,0x4e725a6c
+.long 0xfbff0efd,0xfbff0efd
+.long 0x5638850f,0x5638850f
+.long 0x1ed5ae3d,0x1ed5ae3d
+.long 0x27392d36,0x27392d36
+.long 0x64d90f0a,0x64d90f0a
+.long 0x21a65c68,0x21a65c68
+.long 0xd1545b9b,0xd1545b9b
+.long 0x3a2e3624,0x3a2e3624
+.long 0xb1670a0c,0xb1670a0c
+.long 0x0fe75793,0x0fe75793
+.long 0xd296eeb4,0xd296eeb4
+.long 0x9e919b1b,0x9e919b1b
+.long 0x4fc5c080,0x4fc5c080
+.long 0xa220dc61,0xa220dc61
+.long 0x694b775a,0x694b775a
+.long 0x161a121c,0x161a121c
+.long 0x0aba93e2,0x0aba93e2
+.long 0xe52aa0c0,0xe52aa0c0
+.long 0x43e0223c,0x43e0223c
+.long 0x1d171b12,0x1d171b12
+.long 0x0b0d090e,0x0b0d090e
+.long 0xadc78bf2,0xadc78bf2
+.long 0xb9a8b62d,0xb9a8b62d
+.long 0xc8a91e14,0xc8a91e14
+.long 0x8519f157,0x8519f157
+.long 0x4c0775af,0x4c0775af
+.long 0xbbdd99ee,0xbbdd99ee
+.long 0xfd607fa3,0xfd607fa3
+.long 0x9f2601f7,0x9f2601f7
+.long 0xbcf5725c,0xbcf5725c
+.long 0xc53b6644,0xc53b6644
+.long 0x347efb5b,0x347efb5b
+.long 0x7629438b,0x7629438b
+.long 0xdcc623cb,0xdcc623cb
+.long 0x68fcedb6,0x68fcedb6
+.long 0x63f1e4b8,0x63f1e4b8
+.long 0xcadc31d7,0xcadc31d7
+.long 0x10856342,0x10856342
+.long 0x40229713,0x40229713
+.long 0x2011c684,0x2011c684
+.long 0x7d244a85,0x7d244a85
+.long 0xf83dbbd2,0xf83dbbd2
+.long 0x1132f9ae,0x1132f9ae
+.long 0x6da129c7,0x6da129c7
+.long 0x4b2f9e1d,0x4b2f9e1d
+.long 0xf330b2dc,0xf330b2dc
+.long 0xec52860d,0xec52860d
+.long 0xd0e3c177,0xd0e3c177
+.long 0x6c16b32b,0x6c16b32b
+.long 0x99b970a9,0x99b970a9
+.long 0xfa489411,0xfa489411
+.long 0x2264e947,0x2264e947
+.long 0xc48cfca8,0xc48cfca8
+.long 0x1a3ff0a0,0x1a3ff0a0
+.long 0xd82c7d56,0xd82c7d56
+.long 0xef903322,0xef903322
+.long 0xc74e4987,0xc74e4987
+.long 0xc1d138d9,0xc1d138d9
+.long 0xfea2ca8c,0xfea2ca8c
+.long 0x360bd498,0x360bd498
+.long 0xcf81f5a6,0xcf81f5a6
+.long 0x28de7aa5,0x28de7aa5
+.long 0x268eb7da,0x268eb7da
+.long 0xa4bfad3f,0xa4bfad3f
+.long 0xe49d3a2c,0xe49d3a2c
+.long 0x0d927850,0x0d927850
+.long 0x9bcc5f6a,0x9bcc5f6a
+.long 0x62467e54,0x62467e54
+.long 0xc2138df6,0xc2138df6
+.long 0xe8b8d890,0xe8b8d890
+.long 0x5ef7392e,0x5ef7392e
+.long 0xf5afc382,0xf5afc382
+.long 0xbe805d9f,0xbe805d9f
+.long 0x7c93d069,0x7c93d069
+.long 0xa92dd56f,0xa92dd56f
+.long 0xb31225cf,0xb31225cf
+.long 0x3b99acc8,0x3b99acc8
+.long 0xa77d1810,0xa77d1810
+.long 0x6e639ce8,0x6e639ce8
+.long 0x7bbb3bdb,0x7bbb3bdb
+.long 0x097826cd,0x097826cd
+.long 0xf418596e,0xf418596e
+.long 0x01b79aec,0x01b79aec
+.long 0xa89a4f83,0xa89a4f83
+.long 0x656e95e6,0x656e95e6
+.long 0x7ee6ffaa,0x7ee6ffaa
+.long 0x08cfbc21,0x08cfbc21
+.long 0xe6e815ef,0xe6e815ef
+.long 0xd99be7ba,0xd99be7ba
+.long 0xce366f4a,0xce366f4a
+.long 0xd4099fea,0xd4099fea
+.long 0xd67cb029,0xd67cb029
+.long 0xafb2a431,0xafb2a431
+.long 0x31233f2a,0x31233f2a
+.long 0x3094a5c6,0x3094a5c6
+.long 0xc066a235,0xc066a235
+.long 0x37bc4e74,0x37bc4e74
+.long 0xa6ca82fc,0xa6ca82fc
+.long 0xb0d090e0,0xb0d090e0
+.long 0x15d8a733,0x15d8a733
+.long 0x4a9804f1,0x4a9804f1
+.long 0xf7daec41,0xf7daec41
+.long 0x0e50cd7f,0x0e50cd7f
+.long 0x2ff69117,0x2ff69117
+.long 0x8dd64d76,0x8dd64d76
+.long 0x4db0ef43,0x4db0ef43
+.long 0x544daacc,0x544daacc
+.long 0xdf0496e4,0xdf0496e4
+.long 0xe3b5d19e,0xe3b5d19e
+.long 0x1b886a4c,0x1b886a4c
+.long 0xb81f2cc1,0xb81f2cc1
+.long 0x7f516546,0x7f516546
+.long 0x04ea5e9d,0x04ea5e9d
+.long 0x5d358c01,0x5d358c01
+.long 0x737487fa,0x737487fa
+.long 0x2e410bfb,0x2e410bfb
+.long 0x5a1d67b3,0x5a1d67b3
+.long 0x52d2db92,0x52d2db92
+.long 0x335610e9,0x335610e9
+.long 0x1347d66d,0x1347d66d
+.long 0x8c61d79a,0x8c61d79a
+.long 0x7a0ca137,0x7a0ca137
+.long 0x8e14f859,0x8e14f859
+.long 0x893c13eb,0x893c13eb
+.long 0xee27a9ce,0xee27a9ce
+.long 0x35c961b7,0x35c961b7
+.long 0xede51ce1,0xede51ce1
+.long 0x3cb1477a,0x3cb1477a
+.long 0x59dfd29c,0x59dfd29c
+.long 0x3f73f255,0x3f73f255
+.long 0x79ce1418,0x79ce1418
+.long 0xbf37c773,0xbf37c773
+.long 0xeacdf753,0xeacdf753
+.long 0x5baafd5f,0x5baafd5f
+.long 0x146f3ddf,0x146f3ddf
+.long 0x86db4478,0x86db4478
+.long 0x81f3afca,0x81f3afca
+.long 0x3ec468b9,0x3ec468b9
+.long 0x2c342438,0x2c342438
+.long 0x5f40a3c2,0x5f40a3c2
+.long 0x72c31d16,0x72c31d16
+.long 0x0c25e2bc,0x0c25e2bc
+.long 0x8b493c28,0x8b493c28
+.long 0x41950dff,0x41950dff
+.long 0x7101a839,0x7101a839
+.long 0xdeb30c08,0xdeb30c08
+.long 0x9ce4b4d8,0x9ce4b4d8
+.long 0x90c15664,0x90c15664
+.long 0x6184cb7b,0x6184cb7b
+.long 0x70b632d5,0x70b632d5
+.long 0x745c6c48,0x745c6c48
+.long 0x4257b8d0,0x4257b8d0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index a545e89..34cbb5d 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl
@@ -36,7 +36,8 @@
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$verticalspin=1; # unlike 32-bit version $verticalspin performs
# ~15% better on both AMD and Intel cores
@@ -588,6 +589,9 @@ ()
.globl AES_encrypt
.type AES_encrypt,\@function,3
.align 16
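+# asm_AES_encrypt is a second, non-exported entry point to the same
+# code; .hidden keeps it out of the dynamic symbol table, so other
+# modules can call the assembly implementation directly even if the
+# public AES_encrypt symbol is interposed.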
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
AES_encrypt:
push %rbx
push %rbp
@@ -1184,6 +1188,9 @@ ()
.globl AES_decrypt
.type AES_decrypt,\@function,3
.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
AES_decrypt:
push %rbx
push %rbp
@@ -1277,13 +1284,13 @@ ()
___
}
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function,3
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function,3
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
push %rbx
push %rbp
push %r12 # redundant, but allows to share
@@ -1304,7 +1311,7 @@ ()
add \$56,%rsp
.Lenc_key_epilogue:
ret
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align 16
@@ -1547,13 +1554,13 @@ ()
___
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function,3
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function,3
.align 16
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
push %rbx
push %rbp
push %r12
@@ -1622,7 +1629,7 @@ ()
add \$56,%rsp
.Ldec_key_epilogue:
ret
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
@@ -1648,6 +1655,9 @@ ()
.type AES_cbc_encrypt,\@function,6
.align 16
.extern OPENSSL_ia32cap_P
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
AES_cbc_encrypt:
cmp \$0,%rdx # check length
je .Lcbc_epilogue
@@ -2766,13 +2776,13 @@ ()
.rva .LSEH_end_AES_decrypt
.rva .LSEH_info_AES_decrypt
- .rva .LSEH_begin_AES_set_encrypt_key
- .rva .LSEH_end_AES_set_encrypt_key
- .rva .LSEH_info_AES_set_encrypt_key
+ .rva .LSEH_begin_private_AES_set_encrypt_key
+ .rva .LSEH_end_private_AES_set_encrypt_key
+ .rva .LSEH_info_private_AES_set_encrypt_key
- .rva .LSEH_begin_AES_set_decrypt_key
- .rva .LSEH_end_AES_set_decrypt_key
- .rva .LSEH_info_AES_set_decrypt_key
+ .rva .LSEH_begin_private_AES_set_decrypt_key
+ .rva .LSEH_end_private_AES_set_decrypt_key
+ .rva .LSEH_info_private_AES_set_decrypt_key
.rva .LSEH_begin_AES_cbc_encrypt
.rva .LSEH_end_AES_cbc_encrypt
@@ -2788,11 +2798,11 @@ ()
.byte 9,0,0,0
.rva block_se_handler
.rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
+.LSEH_info_private_AES_set_encrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
+.LSEH_info_private_AES_set_decrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.S b/crypto/aes/asm/aesni-sha1-x86_64.S
new file mode 100644
index 0000000..32fd600
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha1-x86_64.S
@@ -0,0 +1,1396 @@
+.text
+
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,@function
+.align 16
+aesni_cbc_sha1_enc:
+
+ movl OPENSSL_ia32cap_P+0(%rip),%r10d
+ movl OPENSSL_ia32cap_P+4(%rip),%r11d
+ jmp aesni_cbc_sha1_enc_ssse3
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
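+# Note: this pre-generated file carries only the SSSE3 code path, so the
+# dispatcher above reads OPENSSL_ia32cap_P but unconditionally jumps to
+# aesni_cbc_sha1_enc_ssse3; per the generator script, the caller is
+# expected to have verified the SSSE3 and AES-NI feature bits already.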
+.type aesni_cbc_sha1_enc_ssse3,@function
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ movq 8(%rsp),%r10
+
+
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -104(%rsp),%rsp
+
+
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqu (%r8),%xmm11
+ movq %r8,88(%rsp)
+ shlq $6,%r14
+ subq %r12,%r13
+ movl 240(%r15),%r8d
+ addq %r10,%r14
+
+ leaq K_XX_XX(%rip),%r11
+ movl 0(%r9),%eax
+ movl 4(%r9),%ebx
+ movl 8(%r9),%ecx
+ movl 12(%r9),%edx
+ movl %ebx,%esi
+ movl 16(%r9),%ebp
+
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm0
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ movups (%r15),%xmm13
+ movups 16(%r15),%xmm14
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ movdqa %xmm1,%xmm4
+ addl 0(%rsp),%ebp
+ movups 0(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm3,%xmm8
+.byte 102,15,58,15,224,8
+ movl %eax,%edi
+ roll $5,%eax
+ paddd %xmm3,%xmm9
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrldq $4,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ pxor %xmm0,%xmm4
+ rorl $2,%ebx
+ addl %esi,%ebp
+ pxor %xmm2,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pxor %xmm8,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm9,48(%rsp)
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %ebp,%edx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm4,%xmm8
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ roll $5,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrld $31,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ movdqa %xmm10,%xmm9
+ rorl $7,%ebp
+ addl %esi,%ecx
+ psrld $30,%xmm10
+ por %xmm8,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa 0(%r11),%xmm10
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm4
+ rorl $7,%edx
+ addl %edi,%ebx
+ movdqa %xmm2,%xmm5
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ movdqa %xmm4,%xmm9
+.byte 102,15,58,15,233,8
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm4,%xmm10
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrldq $4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ pxor %xmm1,%xmm5
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm3,%xmm9
+ addl 20(%rsp),%ebp
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pxor %xmm9,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ movdqa %xmm5,%xmm8
+ movdqa %xmm5,%xmm9
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ roll $5,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ psrld $31,%xmm9
+ xorl %ecx,%esi
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %ebp,%edx
+ movdqa %xmm8,%xmm10
+ rorl $7,%eax
+ addl %esi,%edx
+ psrld $30,%xmm8
+ por %xmm9,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ roll $5,%edx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ movdqa 16(%r11),%xmm8
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ pxor %xmm10,%xmm5
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movdqa %xmm3,%xmm6
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ movdqa %xmm5,%xmm10
+.byte 102,15,58,15,242,8
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ paddd %xmm5,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ psrldq $4,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ pxor %xmm2,%xmm6
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm4,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ pxor %xmm10,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ movdqa %xmm6,%xmm9
+ movdqa %xmm6,%xmm10
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ roll $5,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrld $31,%xmm10
+ xorl %edx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ psrld $30,%xmm9
+ por %xmm10,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %ebp,%edx
+ pxor %xmm8,%xmm6
+ rorl $7,%eax
+ addl %edi,%edx
+ movdqa %xmm4,%xmm7
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm8
+.byte 102,15,58,15,251,8
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm6,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrldq $4,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ pxor %xmm3,%xmm7
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm5,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ pxor %xmm8,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ movdqa %xmm7,%xmm10
+ movdqa %xmm7,%xmm8
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ roll $5,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrld $31,%xmm8
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ movdqa %xmm10,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ psrld $30,%xmm10
+ por %xmm8,%xmm7
+ addl 60(%rsp),%ebp
+ cmpl $11,%r8d
+ jb .Laesenclast1
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast1
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast1:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ pxor %xmm9,%xmm7
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movdqa %xmm7,%xmm9
+ addl 0(%rsp),%edx
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,206,8
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm7,%xmm10
+ xorl %ecx,%esi
+ movups 16(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,0(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ roll $5,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ pslld $2,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ psrld $30,%xmm9
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ por %xmm9,%xmm0
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ movdqa %xmm0,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 16(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,215,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm8,%xmm9
+ paddd %xmm0,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ movdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm10,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ movdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 32(%rsp),%eax
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,192,8
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ movdqa 32(%r11),%xmm10
+ paddd %xmm1,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ movdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ pslld $2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ psrld $30,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ por %xmm8,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ movdqa %xmm2,%xmm9
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 48(%rsp),%ebx
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,201,8
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm2,%xmm10
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ movdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ pslld $2,%xmm3
+ addl 56(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ psrld $30,%xmm9
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ por %xmm9,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ movdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 0(%rsp),%ecx
+ pxor %xmm0,%xmm4
+.byte 102,68,15,58,15,210,8
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ movdqa %xmm8,%xmm9
+ paddd %xmm3,%xmm8
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ pslld $2,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ psrld $30,%xmm10
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ por %xmm10,%xmm4
+ addl 12(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movdqa %xmm4,%xmm8
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 16(%rsp),%edx
+ pxor %xmm1,%xmm5
+.byte 102,68,15,58,15,195,8
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm4,%xmm9
+ rorl $7,%eax
+ addl %esi,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ movdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast2
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast2
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast2:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ pslld $2,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ psrld $30,%xmm8
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ por %xmm8,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ movdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ecx,%edi
+ movups 32(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,16(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ pxor %xmm2,%xmm6
+.byte 102,68,15,58,15,204,8
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ pxor %xmm7,%xmm6
+ andl %ecx,%esi
+ rorl $7,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm5,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ pxor %xmm9,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movdqa %xmm6,%xmm9
+ movdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ pslld $2,%xmm6
+ andl %ebx,%edi
+ rorl $7,%eax
+ psrld $30,%xmm9
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ por %xmm9,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ rorl $7,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%edi
+ pxor %xmm3,%xmm7
+.byte 102,68,15,58,15,213,8
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ pxor %xmm0,%xmm7
+ andl %edx,%esi
+ rorl $7,%ecx
+ movdqa 48(%r11),%xmm9
+ paddd %xmm6,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ pxor %xmm10,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movdqa %xmm7,%xmm10
+ movdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ pslld $2,%xmm7
+ andl %ecx,%edi
+ rorl $7,%ebx
+ psrld $30,%xmm10
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ por %xmm10,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%edi
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,198,8
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ pxor %xmm1,%xmm0
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ rorl $7,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm7,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ pxor %xmm8,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movdqa %xmm0,%xmm8
+ movdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ pslld $2,%xmm0
+ andl %edx,%edi
+ rorl $7,%ecx
+ psrld $30,%xmm8
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ por %xmm8,%xmm0
+ movl %ecx,%edi
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ rorl $7,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%edi
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,207,8
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ pxor %xmm2,%xmm1
+ andl %eax,%esi
+ rorl $7,%ebp
+ movdqa %xmm10,%xmm8
+ paddd %xmm0,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ pxor %xmm9,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movdqa %xmm1,%xmm9
+ movdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ pslld $2,%xmm1
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ rorl $7,%edx
+ psrld $30,%xmm9
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ por %xmm9,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ cmpl $11,%r8d
+ jb .Laesenclast3
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast3
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast3:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ rorl $7,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%edi
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,208,8
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ pxor %xmm3,%xmm2
+ andl %ebx,%esi
+ rorl $7,%eax
+ movdqa %xmm8,%xmm9
+ paddd %xmm1,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ pxor %xmm10,%xmm2
+ roll $5,%ebp
+ movups 48(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,32(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movdqa %xmm2,%xmm10
+ movdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ pslld $2,%xmm2
+ andl %eax,%edi
+ rorl $7,%ebp
+ psrld $30,%xmm10
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ por %xmm10,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ rorl $7,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ rorl $7,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ addl 48(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,193,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm10
+ paddd %xmm2,%xmm9
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ movdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm8
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm8,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ paddd %xmm3,%xmm10
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ movdqa %xmm10,48(%rsp)
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ cmpq %r14,%r10
+ je .Ldone_ssse3
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+.byte 102,15,56,0,206
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm9,%xmm0
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ movdqa %xmm0,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ psubd %xmm9,%xmm0
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+.byte 102,15,56,0,214
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm9,%xmm1
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movdqa %xmm1,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ psubd %xmm9,%xmm1
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+.byte 102,15,56,0,222
+ movl %ebp,%edi
+ roll $5,%ebp
+ paddd %xmm9,%xmm2
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ movdqa %xmm2,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ psubd %xmm9,%xmm2
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast4
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast4
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast4:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ leaq 64(%r12),%r12
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ addl 12(%r9),%edx
+ movl %eax,0(%r9)
+ addl 16(%r9),%ebp
+ movl %esi,4(%r9)
+ movl %esi,%ebx
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast5
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast5
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast5:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ movq 88(%rsp),%r8
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ movl %eax,0(%r9)
+ addl 12(%r9),%edx
+ movl %esi,4(%r9)
+ addl 16(%r9),%ebp
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ movups %xmm11,(%r8)
+ leaq 104(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.align 64
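+# SHA1 round constants K_00_19..K_60_79 (broadcast to all four lanes)
+# and, in the fifth row, the pshufb byte-swap mask: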
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+
+.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000..3c8f6c1
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1250 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibits *very* low instruction-level
+# parallelism, interleaving it with another algorithm allows the
+# processor's resources to be utilized better, for better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and the
+# AESNI code is woven into them. Below are performance numbers in
+# cycles per processed byte (less is better) for standalone AESNI-CBC
+# encrypt, for the sum of the latter and standalone SHA1, and for the
+# "stitched" subroutine:
+#
+# AES-128-CBC +SHA1 stitch gain
+# Westmere 3.77[+5.6] 9.37 6.65 +41%
+# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
+#
+# AES-192-CBC
+# Westmere 4.51 10.11 6.97 +45%
+# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
+#
+# AES-256-CBC
+# Westmere 5.25 10.85 7.25 +50%
+# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
+#
+# (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
+#	background information. The numbers in parentheses above are
+#	SSSE3 results collected on an AVX-capable CPU, i.e. they apply
+#	to OSes that don't support AVX.
+#
+# Needless to mention, it makes no sense to implement a "stitched"
+# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already fully
+# utilize available parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+# AES-128-CBC AES-192-CBC AES-256-CBC
+# Westmere 1.31 1.55 1.80
+# Sandy Bridge 0.93 1.06 1.22
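+#
+# As a rough illustration of the "stitch" (a sketch of the generated
+# code's shape, not literal output): each SHA1 round below is a handful
+# of scalar integer instructions, and every few rounds one aesenc step
+# of the CBC chain is dropped in between them, e.g.
+#
+#	addl 0(%rsp),%ebp	# SHA1 round work on %eax..%ebp
+#	...
+#	aesenc %xmm14,%xmm11	# one AES round on the current CBC block
+#	movups 32(%r15),%xmm15	# preload next round key
+#	...			# SHA1 round work continues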
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void aesni_cbc_sha1_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA_CTX *ctx,
+# const void *in0);
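+#
+# Hedged usage sketch (not part of this patch; note that, judging by the
+# "shl \$6,$len" below, length is measured in 64-byte blocks, and the
+# 7th argument is the buffer to be hashed, typically the same data that
+# is being encrypted):
+#
+#	aesni_cbc_sha1_enc(in, out, blocks, &aes_key, iv, &sha_ctx, in);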
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 16
+aesni_cbc_sha1_enc:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+ jmp aesni_cbc_sha1_enc_ssse3
+ ret
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
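+# Any otherwise-undefined sub call lands in the AUTOLOAD above and is
+# appended to $code as a literal instruction: a numeric final argument
+# gets a "$" immediate prefix, and the operands are emitted in reversed
+# (AT&T) order, so &rol("%eax",5) yields "rol $5,%eax".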
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_ssse3 # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ movdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movups ($key),$rndkey0 # $key[0]
+ movups 16($key),$rndkey[0] # forward reference
+ jmp .Loop_ssse3
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
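+# The $aesenc closure above drives the CBC-encrypt half of the stitch: $r
+# counts aesenc slots emitted so far, and ($n,$k)=($r/10,$r%10) name the
+# current 16-byte block and the slot within it. Slot 0 loads plaintext
+# block $n, folds it into $iv together with the whitening key (CBC
+# chaining) and stores the previous ciphertext block; slots 1..8 are plain
+# aesenc rounds; slot 9 finishes with aesenclast, detouring through extra
+# aesenc pairs for the longer 192/256-bit key schedules (the cmp \$11
+# test). @rndkey ping-pongs between %xmm14/%xmm15 so key loads and aesenc
+# steps can overlap.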
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
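+# The sub above vectorizes the SHA-1 message schedule for rounds 16..31,
+# W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1, four dwords at a
+# time. The newest lane would need W[t-3] from the very vector being
+# computed; the psrldq supplies only the three available dwords and the
+# final "X[0]"^=("X[0]">>96)<<<2 term patches the fourth lane afterwards.
+# The ~40 scalar instructions of four round bodies are threaded through
+# the vector code via eval(shift(@insns)) to keep both pipes busy.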
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
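+# Rounds 32..79 above use the equivalent recurrence
+# W[t] = (W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32]) <<< 2 (the 16..31 rule
+# applied to itself), whose nearest back-reference, W[t-6], is far enough
+# away that all four lanes come out in one shot: hence the single
+# pslld/psrld/por rotate by 2 and no per-lane fixup.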
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
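+# Each body_00_19 entry expands to one full SHA-1 round:
+#	e += X[j]+K + (a <<< 5) + Ch(b,c,d), Ch(b,c,d) = (b & (c ^ d)) ^ d,
+# with Ch computed by the xor/and/xor triple on @T[0], the saved copy of
+# $b. The register holding $b is rotated in place: ror 2 in the first
+# round, ror 7 afterwards, because by then it holds the previous round's
+# a<<<5 and (a<<<5)>>>7 = a<<<30, the value SHA-1 needs two rounds later.
+# The computed index $k splices the 12 &$aesenc() calls evenly across
+# these 20 rounds.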
+
+sub body_20_39 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_40_59 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
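+# body_40_59 handles the majority rounds: Maj(b,c,d) is split into the two
+# disjoint terms (b & (c ^ d)) and (c & d), which are added into $e
+# separately, reusing the same c^=d masking trick as the Ch rounds above.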
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups $iv,($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_avx,\@function,6
+.align 16
+aesni_cbc_sha1_enc_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_avx # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ vmovdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add \$112,$key # size optimization
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
+ vpaddd @Tx[1],@X[-3&7],@X[1]
+ vpaddd @Tx[1],@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey0 # $key[0]
+ vmovups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_avx
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ vmovups `16*$n`($in0),$in # load input
+ vxorps $rndkey0,$in,$in
+___
+ $code.=<<___ if ($n);
+ vmovups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ vxorps $in,$iv,$iv
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+0)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+1)-112`($key),$rndkey[0]
+ je .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+2)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+ vaesenclast $rndkey[0],$iv,$iv
+ vmovups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_avx");
+
+ unshift(@Tx,pop(@Tx));
+
+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups $iv,($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
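+
+# The four K rows above are the standard SHA-1 round constants,
+# floor(2^30*sqrt(n)) for n = 2, 3, 5, 10, each replicated across all four
+# dword lanes so a single paddd computes X[]+K; the last row is the
+# big-endian byte-swap mask fed to pshufb.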
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 96(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `104+10*16`(%rax),%rax # adjust stack pointer
+
+ mov 0(%rax),%r15
+ mov 8(%rax),%r14
+ mov 16(%rax),%r13
+ mov 24(%rax),%r12
+ mov 32(%rax),%rbp
+ mov 40(%rax),%rbx
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ssse3_handler,.-ssse3_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_end_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
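+# rex() and aesni() hand-assemble the AES-NI mnemonics for assemblers that
+# predate them. A worked example (encoding computed by hand from the rules
+# above): "aesenc %xmm14,%xmm11" needs REX 0x45 (both registers >= xmm8)
+# and ModR/M 0xc0|(14&7)|((11&7)<<3) = 0xde, so it becomes
+#	.byte	102,69,15,56,220,222	# aesenc %xmm14,%xmm11
+# with opcode byte 0xdd in place of 0xdc for aesenclast.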
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/crypto/aes/asm/aesni-x86.S b/crypto/aes/asm/aesni-x86.S
new file mode 100644
index 0000000..0766bb5
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86.S
@@ -0,0 +1,2143 @@
+.file "crypto/aes/asm/aesni-x86.s"
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+.L_aesni_encrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L000enc1_loop_1:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L000enc1_loop_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_encrypt,.-.L_aesni_encrypt_begin
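+# Note on this generated file: the ".byte 102,15,56,xx,yy" runs are AES-NI
+# instructions emitted in raw form for pre-AES-NI assemblers; the
+# 102,15,56,220,209 above is aesenc %xmm1,%xmm2 (221 = aesenclast,
+# 222 = aesdec, 223 = aesdeclast). Arguments arrive cdecl-style at
+# 4/8/12(%esp) as in, out, key, and 240(key) is the loop counter stored by
+# the AES-NI key setup (9/11/13 for 128/192/256-bit keys, judging by the
+# loop structure here), so the loop runs all middle rounds and aesenclast
+# applies the final round key.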
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+.L_aesni_decrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L001dec1_loop_2:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L001dec1_loop_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_decrypt,.-.L_aesni_decrypt_begin
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L002enc3_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+ movups (%edx),%xmm0
+ jnz .L002enc3_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L003dec3_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+ movups (%edx),%xmm0
+ jnz .L003dec3_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L004enc4_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%edx),%xmm0
+ jnz .L004enc4_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L005dec4_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%edx),%xmm0
+ jnz .L005dec4_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ jmp .L_aesni_encrypt6_enter
+.align 16
+.L006enc6_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.align 16
+.L_aesni_encrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%edx),%xmm0
+ jnz .L006enc6_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ jmp .L_aesni_decrypt6_enter
+.align 16
+.L007dec6_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.align 16
+.L_aesni_decrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%edx),%xmm0
+ jnz .L007dec6_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+.L_aesni_ecb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ andl $-16,%eax
+ jz .L008ecb_ret
+ movl 240(%edx),%ecx
+ testl %ebx,%ebx
+ jz .L009ecb_decrypt
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L010ecb_enc_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L011ecb_enc_loop6_enter
+.align 16
+.L012ecb_enc_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L011ecb_enc_loop6_enter:
+ call _aesni_encrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L012ecb_enc_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L010ecb_enc_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L013ecb_enc_one
+ movups 16(%esi),%xmm3
+ je .L014ecb_enc_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L015ecb_enc_three
+ movups 48(%esi),%xmm5
+ je .L016ecb_enc_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L013ecb_enc_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L017enc1_loop_3:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L017enc1_loop_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L014ecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L015ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L016ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L009ecb_decrypt:
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L018ecb_dec_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L019ecb_dec_loop6_enter
+.align 16
+.L020ecb_dec_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L019ecb_dec_loop6_enter:
+ call _aesni_decrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L020ecb_dec_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L018ecb_dec_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L021ecb_dec_one
+ movups 16(%esi),%xmm3
+ je .L022ecb_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L023ecb_dec_three
+ movups 48(%esi),%xmm5
+ je .L024ecb_dec_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L021ecb_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L025dec1_loop_4:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L025dec1_loop_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L022ecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L023ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L024ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L008ecb_ret:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ecb_encrypt,.-.L_aesni_ecb_encrypt_begin
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+.L_aesni_ccm64_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
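+# The four immediates above lay down, in little-endian memory order, the
+# byte sequence 0f 0e 0d 0c ... 03 02 01 00: a pshufb mask that reverses
+# all 16 bytes of an XMM register (CCM counters are big-endian). The 1
+# stored just below at 16(%esp) is the 64-bit increment used by paddq to
+# bump the counter block.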
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ shrl $1,%ecx
+ leal (%edx),%ebp
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+.L026ccm64_enc_outer:
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups (%esi),%xmm6
+ xorps %xmm0,%xmm2
+ movups 16(%ebp),%xmm1
+ xorps %xmm6,%xmm0
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm3
+ movups (%edx),%xmm0
+.L027ccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L027ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ decl %eax
+ leal 16(%esi),%esi
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ jnz .L026ccm64_enc_outer
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_encrypt_blocks,.-.L_aesni_ccm64_encrypt_blocks_begin
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+.L_aesni_ccm64_decrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %edx,%ebp
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L028enc1_loop_5:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L028enc1_loop_5
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+ leal 16(%esi),%esi
+ jmp .L029ccm64_dec_outer
+.align 16
+.L029ccm64_dec_outer:
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movl %ebx,%ecx
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ subl $1,%eax
+ jz .L030ccm64_dec_break
+ movups (%ebp),%xmm0
+ shrl $1,%ecx
+ movups 16(%ebp),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm2
+ xorps %xmm6,%xmm3
+ movups (%edx),%xmm0
+.L031ccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L031ccm64_dec2_loop
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leal 16(%esi),%esi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .L029ccm64_dec_outer
+.align 16
+.L030ccm64_dec_break:
+ movl %ebp,%edx
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%edx),%edx
+ xorps %xmm6,%xmm3
+.L032enc1_loop_6:
+.byte 102,15,56,220,217
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L032enc1_loop_6
+.byte 102,15,56,221,217
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_decrypt_blocks,.-.L_aesni_ccm64_decrypt_blocks_begin
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+.L_aesni_ctr32_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $88,%esp
+ andl $-16,%esp
+ movl %ebp,80(%esp)
+ cmpl $1,%eax
+ je .L033ctr32_one_shortcut
+ movdqu (%ebx),%xmm7
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $6,%ecx
+ xorl %ebp,%ebp
+ movl %ecx,16(%esp)
+ movl %ecx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %ebp,28(%esp)
+.byte 102,15,58,22,251,3
+.byte 102,15,58,34,253,3
+ movl 240(%edx),%ecx
+ bswap %ebx
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa (%esp),%xmm2
+.byte 102,15,58,34,203,0
+ leal 3(%ebx),%ebp
+.byte 102,15,58,34,197,0
+ incl %ebx
+.byte 102,15,58,34,203,1
+ incl %ebp
+.byte 102,15,58,34,197,1
+ incl %ebx
+.byte 102,15,58,34,203,2
+ incl %ebp
+.byte 102,15,58,34,197,2
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ pshufd $192,%xmm1,%xmm2
+ pshufd $128,%xmm1,%xmm3
+ cmpl $6,%eax
+ jb .L034ctr32_tail
+ movdqa %xmm7,32(%esp)
+ shrl $1,%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $6,%eax
+ jmp .L035ctr32_loop6
+.align 16
+.L035ctr32_loop6:
+ pshufd $64,%xmm1,%xmm4
+ movdqa 32(%esp),%xmm1
+ pshufd $192,%xmm0,%xmm5
+ por %xmm1,%xmm2
+ pshufd $128,%xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufd $64,%xmm0,%xmm7
+ por %xmm1,%xmm4
+ por %xmm1,%xmm5
+ por %xmm1,%xmm6
+ por %xmm1,%xmm7
+ movups (%ebp),%xmm0
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ decl %ecx
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups %xmm2,(%edi)
+ movdqa 16(%esp),%xmm0
+ xorps %xmm1,%xmm4
+ movdqa 48(%esp),%xmm1
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ paddd %xmm0,%xmm1
+ paddd 64(%esp),%xmm0
+ movdqa (%esp),%xmm2
+ movups 48(%esi),%xmm3
+ movups 64(%esi),%xmm4
+ xorps %xmm3,%xmm5
+ movups 80(%esi),%xmm3
+ leal 96(%esi),%esi
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ xorps %xmm4,%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ movups %xmm6,64(%edi)
+ pshufd $192,%xmm1,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movl %ebx,%ecx
+ pshufd $128,%xmm1,%xmm3
+ subl $6,%eax
+ jnc .L035ctr32_loop6
+ addl $6,%eax
+ jz .L036ctr32_ret
+ movl %ebp,%edx
+ leal 1(,%ecx,2),%ecx
+ movdqa 32(%esp),%xmm7
+.L034ctr32_tail:
+ por %xmm7,%xmm2
+ cmpl $2,%eax
+ jb .L037ctr32_one
+ pshufd $64,%xmm1,%xmm4
+ por %xmm7,%xmm3
+ je .L038ctr32_two
+ pshufd $192,%xmm0,%xmm5
+ por %xmm7,%xmm4
+ cmpl $4,%eax
+ jb .L039ctr32_three
+ pshufd $128,%xmm0,%xmm6
+ por %xmm7,%xmm5
+ je .L040ctr32_four
+ por %xmm7,%xmm6
+ call _aesni_encrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm4
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm5
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L033ctr32_one_shortcut:
+ movups (%ebx),%xmm2
+ movl 240(%edx),%ecx
+.L037ctr32_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L041enc1_loop_7:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L041enc1_loop_7
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ xorps %xmm2,%xmm6
+ movups %xmm6,(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L038ctr32_two:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L039ctr32_three:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ movups 32(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L040ctr32_four:
+ call _aesni_encrypt4
+ movups (%esi),%xmm6
+ movups 16(%esi),%xmm7
+ movups 32(%esi),%xmm1
+ xorps %xmm6,%xmm2
+ movups 48(%esi),%xmm0
+ xorps %xmm7,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L036ctr32_ret:
+ movl 80(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ctr32_encrypt_blocks,.-.L_aesni_ctr32_encrypt_blocks_begin
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+.L_aesni_xts_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L042enc1_loop_8:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L042enc1_loop_8
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ movl 240(%edx),%ecx
+ andl $-16,%esp
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
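+# 135 = 0x87, the low byte of the XTS reduction polynomial
+# x^128 + x^7 + x^2 + x + 1, so 96(%esp) holds the qword pair {0x87, 1}.
+# Each pshufd $19 / pcmpgtd / pand / paddq / pxor group below doubles the
+# tweak in GF(2^128): paddq shifts both 64-bit halves left by one, and the
+# masked sign bits feed the carry into the high half and the 0x87
+# reduction into the low half.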
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $96,%eax
+ jc .L043xts_enc_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L044xts_enc_loop6
+.align 16
+.L044xts_enc_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,220,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,220,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L044xts_enc_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L043xts_enc_short:
+ addl $96,%eax
+ jz .L045xts_enc_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L046xts_enc_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L047xts_enc_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L048xts_enc_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L049xts_enc_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_encrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L050xts_enc_done
+.align 16
+.L046xts_enc_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L051enc1_loop_9:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L051enc1_loop_9
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L047xts_enc_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L048xts_enc_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L049xts_enc_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_encrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L045xts_enc_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ movdqa %xmm1,%xmm5
+ movl %eax,112(%esp)
+ jmp .L053xts_enc_steal
+.align 16
+.L050xts_enc_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm5
+ paddq %xmm1,%xmm1
+ pand 96(%esp),%xmm5
+ pxor %xmm1,%xmm5
+.L053xts_enc_steal:
+ movzbl (%esi),%ecx
+ movzbl -16(%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,-16(%edi)
+ movb %dl,(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L053xts_enc_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups -16(%edi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L054enc1_loop_10:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L054enc1_loop_10
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,-16(%edi)
+.L052xts_enc_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_encrypt,.-.L_aesni_xts_encrypt_begin
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+.L_aesni_xts_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L055enc1_loop_11:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L055enc1_loop_11
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ andl $-16,%esp
+ xorl %ebx,%ebx
+ testl $15,%eax
+ setnz %bl
+ shll $4,%ebx
+ subl %ebx,%eax
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ subl $96,%eax
+ jc .L056xts_dec_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L057xts_dec_loop6
+.align 16
+.L057xts_dec_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,222,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,222,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ call .L_aesni_decrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L057xts_dec_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L056xts_dec_short:
+ addl $96,%eax
+ jz .L058xts_dec_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L059xts_dec_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L060xts_dec_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L061xts_dec_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L062xts_dec_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_decrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L063xts_dec_done
+.align 16
+.L059xts_dec_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L064dec1_loop_12:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L064dec1_loop_12
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L060xts_dec_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L061xts_dec_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L062xts_dec_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_decrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L058xts_dec_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ movl %eax,112(%esp)
+ jmp .L066xts_dec_only_one_more
+.align 16
+.L063xts_dec_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+.L066xts_dec_only_one_more:
+ pshufd $19,%xmm0,%xmm5
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm5
+ pxor %xmm1,%xmm5
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%esi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L067dec1_loop_13:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L067dec1_loop_13
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+.L068xts_dec_steal:
+ movzbl 16(%esi),%ecx
+ movzbl (%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,(%edi)
+ movb %dl,16(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L068xts_dec_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%edi),%xmm2
+ xorps %xmm6,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L069dec1_loop_14:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L069dec1_loop_14
+.byte 102,15,56,223,209
+ xorps %xmm6,%xmm2
+ movups %xmm2,(%edi)
+.L065xts_dec_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+.L_aesni_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl %esp,%ebx
+ movl 24(%esp),%edi
+ subl $24,%ebx
+ movl 28(%esp),%eax
+ andl $-16,%ebx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebp
+ testl %eax,%eax
+ jz .L070cbc_abort
+ cmpl $0,40(%esp)
+ xchgl %esp,%ebx
+ movups (%ebp),%xmm7
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ebx,16(%esp)
+ movl %ecx,%ebx
+ je .L071cbc_decrypt
+ movaps %xmm7,%xmm2
+ cmpl $16,%eax
+ jb .L072cbc_enc_tail
+ subl $16,%eax
+ jmp .L073cbc_enc_loop
+.align 16
+.L073cbc_enc_loop:
+ movups (%esi),%xmm7
+ leal 16(%esi),%esi
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm7
+ leal 32(%edx),%edx
+ xorps %xmm7,%xmm2
+.L074enc1_loop_15:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L074enc1_loop_15
+.byte 102,15,56,221,209
+ movl %ebx,%ecx
+ movl %ebp,%edx
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ subl $16,%eax
+ jnc .L073cbc_enc_loop
+ addl $16,%eax
+ jnz .L072cbc_enc_tail
+ movaps %xmm2,%xmm7
+ jmp .L075cbc_ret
+.L072cbc_enc_tail:
+ movl %eax,%ecx
+.long 2767451785
+ movl $16,%ecx
+ subl %eax,%ecx
+ xorl %eax,%eax
+.long 2868115081
+ leal -16(%edi),%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+ movl %ebp,%edx
+ jmp .L073cbc_enc_loop
+.align 16
+.L071cbc_decrypt:
+ cmpl $80,%eax
+ jbe .L076cbc_dec_tail
+ movaps %xmm7,(%esp)
+ subl $80,%eax
+ jmp .L077cbc_dec_loop6_enter
+.align 16
+.L078cbc_dec_loop6:
+ movaps %xmm0,(%esp)
+ movups %xmm7,(%edi)
+ leal 16(%edi),%edi
+.L077cbc_dec_loop6_enter:
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%esi),%xmm0
+ xorps %xmm1,%xmm7
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 96(%esi),%esi
+ movups %xmm4,32(%edi)
+ movl %ebx,%ecx
+ movups %xmm5,48(%edi)
+ movl %ebp,%edx
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ subl $96,%eax
+ ja .L078cbc_dec_loop6
+ movaps %xmm7,%xmm2
+ movaps %xmm0,%xmm7
+ addl $80,%eax
+ jle .L079cbc_dec_tail_collected
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+.L076cbc_dec_tail:
+ movups (%esi),%xmm2
+ movaps %xmm2,%xmm6
+ cmpl $16,%eax
+ jbe .L080cbc_dec_one
+ movups 16(%esi),%xmm3
+ movaps %xmm3,%xmm5
+ cmpl $32,%eax
+ jbe .L081cbc_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $48,%eax
+ jbe .L082cbc_dec_three
+ movups 48(%esi),%xmm5
+ cmpl $64,%eax
+ jbe .L083cbc_dec_four
+ movups 64(%esi),%xmm6
+ movaps %xmm7,(%esp)
+ movups (%esi),%xmm2
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm7
+ xorps %xmm0,%xmm6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movaps %xmm6,%xmm2
+ subl $80,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L080cbc_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L084dec1_loop_16:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L084dec1_loop_16
+.byte 102,15,56,223,209
+ xorps %xmm7,%xmm2
+ movaps %xmm6,%xmm7
+ subl $16,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L081cbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movaps %xmm3,%xmm2
+ leal 16(%edi),%edi
+ movaps %xmm5,%xmm7
+ subl $32,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L082cbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm5,%xmm4
+ movups %xmm2,(%edi)
+ movaps %xmm4,%xmm2
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movups 32(%esi),%xmm7
+ subl $48,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_four:
+ call _aesni_decrypt4
+ movups 16(%esi),%xmm1
+ movups 32(%esi),%xmm0
+ xorps %xmm7,%xmm2
+ movups 48(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movaps %xmm5,%xmm2
+ subl $64,%eax
+.L079cbc_dec_tail_collected:
+ andl $15,%eax
+ jnz .L085cbc_dec_tail_partial
+ movups %xmm2,(%edi)
+ jmp .L075cbc_ret
+.align 16
+.L085cbc_dec_tail_partial:
+ movaps %xmm2,(%esp)
+ movl $16,%ecx
+ movl %esp,%esi
+ subl %eax,%ecx
+.long 2767451785
+.L075cbc_ret:
+ movl 16(%esp),%esp
+ movl 36(%esp),%ebp
+ movups %xmm7,(%ebp)
+.L070cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_cbc_encrypt,.-.L_aesni_cbc_encrypt_begin
+.type _aesni_set_encrypt_key,@function
+.align 16
+_aesni_set_encrypt_key:
+ testl %eax,%eax
+ jz .L086bad_pointer
+ testl %edx,%edx
+ jz .L086bad_pointer
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ leal 16(%edx),%edx
+ cmpl $256,%ecx
+ je .L08714rounds
+ cmpl $192,%ecx
+ je .L08812rounds
+ cmpl $128,%ecx
+ jne .L089bad_keybits
+.align 16
+.L09010rounds:
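+# Each ".byte 102,15,58,223,200,N" below encodes
+# "aeskeygenassist $N,%xmm0,%xmm1"; N steps through the AES round
+# constants 1,2,4,...,128,27,54 used by the 128-bit key schedule
+# (annotation added for reference).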
+ movl $9,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,200,1
+ call .L091key_128_cold
+.byte 102,15,58,223,200,2
+ call .L092key_128
+.byte 102,15,58,223,200,4
+ call .L092key_128
+.byte 102,15,58,223,200,8
+ call .L092key_128
+.byte 102,15,58,223,200,16
+ call .L092key_128
+.byte 102,15,58,223,200,32
+ call .L092key_128
+.byte 102,15,58,223,200,64
+ call .L092key_128
+.byte 102,15,58,223,200,128
+ call .L092key_128
+.byte 102,15,58,223,200,27
+ call .L092key_128
+.byte 102,15,58,223,200,54
+ call .L092key_128
+ movups %xmm0,(%edx)
+ movl %ecx,80(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L092key_128:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.L091key_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L08812rounds:
+ movq 16(%eax),%xmm2
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L093key_192a_cold
+.byte 102,15,58,223,202,2
+ call .L094key_192b
+.byte 102,15,58,223,202,4
+ call .L095key_192a
+.byte 102,15,58,223,202,8
+ call .L094key_192b
+.byte 102,15,58,223,202,16
+ call .L095key_192a
+.byte 102,15,58,223,202,32
+ call .L094key_192b
+.byte 102,15,58,223,202,64
+ call .L095key_192a
+.byte 102,15,58,223,202,128
+ call .L094key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L095key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 16
+.L093key_192a_cold:
+ movaps %xmm2,%xmm5
+.L096key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 16
+.L094key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp .L096key_192b_warm
+.align 16
+.L08714rounds:
+ movups 16(%eax),%xmm2
+ movl $13,%ecx
+ leal 16(%edx),%edx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L097key_256a_cold
+.byte 102,15,58,223,200,1
+ call .L098key_256b
+.byte 102,15,58,223,202,2
+ call .L099key_256a
+.byte 102,15,58,223,200,2
+ call .L098key_256b
+.byte 102,15,58,223,202,4
+ call .L099key_256a
+.byte 102,15,58,223,200,4
+ call .L098key_256b
+.byte 102,15,58,223,202,8
+ call .L099key_256a
+.byte 102,15,58,223,200,8
+ call .L098key_256b
+.byte 102,15,58,223,202,16
+ call .L099key_256a
+.byte 102,15,58,223,200,16
+ call .L098key_256b
+.byte 102,15,58,223,202,32
+ call .L099key_256a
+.byte 102,15,58,223,200,32
+ call .L098key_256b
+.byte 102,15,58,223,202,64
+ call .L099key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L099key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+.L097key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L098key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.align 4
+.L086bad_pointer:
+ movl $-1,%eax
+ ret
+.align 4
+.L089bad_keybits:
+ movl $-2,%eax
+ ret
+.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+.L_aesni_set_encrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ ret
+.size aesni_set_encrypt_key,.-.L_aesni_set_encrypt_key_begin
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.L_aesni_set_decrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ movl 12(%esp),%edx
+ shll $4,%ecx
+ testl %eax,%eax
+ jnz .L100dec_key_ret
+ leal 16(%edx,%ecx,1),%eax
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+ movups %xmm0,(%eax)
+ movups %xmm1,(%edx)
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+.L101dec_key_inverse:
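+# ".byte 102,15,56,219,192" and ".byte 102,15,56,219,201" encode
+# "aesimc %xmm0,%xmm0" and "aesimc %xmm1,%xmm1": the loop swaps round
+# keys end-for-end while applying InvMixColumns, producing the
+# equivalent-inverse-cipher decryption schedule (annotation added for
+# reference).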
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+ movups %xmm0,16(%eax)
+ movups %xmm1,-16(%edx)
+ cmpl %edx,%eax
+ ja .L101dec_key_inverse
+ movups (%edx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%edx)
+ xorl %eax,%eax
+.L100dec_key_ret:
+ ret
+.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte 115,108,46,111,114,103,62,0
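+# (The .byte lines above spell the module's ID string in ASCII:
+# "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>".)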
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
new file mode 100644
index 0000000..3dc345b
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,2189 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
+# below for details].
+#
+# Performance.
+#
+# To start with, see the corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling in a table similar to the one found there, I've
+# chosen to summarize *comparison* results for raw ECB, CTR and CBC
+# benchmarks. The simplified table below shows 32-bit performance
+# relative to 64-bit performance at each point. Ratios vary across
+# encryption modes, hence the interval values.
+#
+#	16-byte	64-byte	256-byte	1-KB	8-KB
+#	53-67%	67-84%	91-94%		95-98%	97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. At the
+# largest, 8-KB, block size performance is virtually the same: 32-bit
+# code is less than 1% slower for ECB, CBC and CCM, and ~3% slower
+# otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike the x86_64 version, this
+# module interleaves at most 6 aes[enc|dec] instructions, because
+# there are not enough registers for an 8x interleave [which should be
+# optimal for Sandy Bridge]. In fact, the performance results for the
+# 6x interleave factor presented in aesni-x86_64.pl (except for CTR)
+# come from this module.
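+#
+# A quick register budget makes the 6x ceiling concrete (a sketch,
+# based on the register assignments further down this file): 32-bit
+# mode exposes only xmm0-xmm7, two registers shuttle round keys and
+# the remaining six carry data blocks:
+#
+#	xmm0,xmm1	$rndkey0,$rndkey1 (round keys)
+#	xmm2-xmm7	$inout0-$inout5   (6 blocks in flight)
+#
+# so an 8x interleave simply has no registers left to live in.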
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of an 8KB buffer with a 128-bit key; Sandy Bridge
+# spends 1.09.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-586.pl:-)
+$inline=1; # inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+if ($PREFIX eq "aesni") { $movekey=*movups; }
+else { $movekey=*movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx"; # backup copy for $rounds
+$key_="ebp"; # backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5"; $in1="xmm5";
+$inout4="xmm6"; $in0="xmm6";
+$inout5="xmm7"; $ivec="xmm7";
+
+# AES-NI extension
+sub aeskeygenassist
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
+}
+sub aescommon
+{ my($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
+}
+sub aesimc { aescommon(0xdb,@_); }
+sub aesenc { aescommon(0xdc,@_); }
+sub aesenclast { aescommon(0xdd,@_); }
+sub aesdec { aescommon(0xde,@_); }
+sub aesdeclast { aescommon(0xdf,@_); }
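+
+# A worked example of the encoding above (for reference only): aesenc
+# is the byte sequence 0x66,0x0F,0x38,0xDC followed by a ModRM byte
+# 0xC0|dst<<3|src, so
+#
+#	&aesenc("xmm2","xmm1");		# emits .byte 102,15,56,220,209
+#
+# i.e. "aesenc %xmm1,%xmm2", with 0xD1 = 0xC0|(2<<3)|1. This is
+# exactly the .byte pattern visible in the generated aesni-x86.s.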
+
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ $sn++;
+
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($ivec,$rndkey0) if (defined($ivec));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout,$ivec) if (defined($ivec));
+ &xorps ($inout,$rndkey0) if (!defined($ivec));
+ &set_label("${p}1_loop_$sn");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &dec ($rounds);
+ &$movekey ($rndkey1,&QWP(0,$key));
+ &lea ($key,&DWP(16,$key));
+ &jnz (&label("${p}1_loop_$sn"));
+ eval"&aes${p}last ($inout,$rndkey1)";
+}}
+
+sub aesni_generate1 # fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+ &function_begin_B("_aesni_${p}rypt1");
+ &movups ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(0x10,$key));
+ &xorps ($inout,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0x20,$key));
+ &lea ($key,&DWP(0x30,$key));
+ &cmp ($rounds,11);
+ &jb (&label("${p}128"));
+ &lea ($key,&DWP(0x20,$key));
+ &je (&label("${p}192"));
+ &lea ($key,&DWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x30,$key));
+ &set_label("${p}192");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x10,$key));
+ &set_label("${p}128");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x10,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x30,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x50,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x60,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x70,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ eval"&aes${p}last ($inout,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline);
+&function_begin_B("${PREFIX}_encrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if(!$inline);
+&function_begin_B("${PREFIX}_decrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, where N denotes the
+# interleave factor. Why were 3x subroutines originally used in loops?
+# Even though aes[enc|dec] latency was originally 6, it could be
+# scheduled only every *2nd* cycle. A 3x interleave therefore provided
+# optimal utilization, i.e. the subroutine's throughput was virtually
+# the same as that of the non-interleaved subroutine [for up to 3
+# input blocks]. This is why it makes no sense to implement a 2x
+# subroutine. In the next processor generation aes[enc|dec] latency
+# is 8, but the instructions can be scheduled every cycle. The optimal
+# interleave for the new processor is therefore 8x, but it's
+# infeasible to accommodate that in the XMM registers addressable in
+# 32-bit mode, so 6x is used instead...
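+#
+# Spelled out, the arithmetic behind those interleave factors: with
+# latency 6 and one issue per 2 cycles, at most 6/2 = 3 operations are
+# in flight at once, so 3 blocks saturate the unit and deeper
+# interleave buys nothing. With latency 8 and one issue per cycle the
+# ideal depth is 8/1 = 8 blocks; the 32-bit register file only holds
+# 6 (see the register budget note above), hence 6x.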
+
+sub aesni_generate3
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt3");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}3_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}3_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt3");
+}
+
+# A 4x interleave is implemented to improve small-block performance,
+# most notably [and naturally] the 4-block case, by ~30%. One could
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%, so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt4");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &shr ($rounds,1);
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}4_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}4_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt6");
+ &static_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0); # pxor does better here
+ eval"&aes${p} ($inout0,$rndkey1)";
+ &pxor ($inout2,$rndkey0);
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &pxor ($inout3,$rndkey0);
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &pxor ($inout4,$rndkey0);
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &pxor ($inout5,$rndkey0);
+ eval"&aes${p} ($inout4,$rndkey1)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &jmp (&label("_aesni_${p}rypt6_enter"));
+
+ &set_label("${p}6_loop",16);
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_enter",16);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ eval"&aes${p} ($inout4,$rndkey0)";
+ eval"&aes${p} ($inout5,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}6_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ eval"&aes${p}last ($inout4,$rndkey0)";
+ eval"&aes${p}last ($inout5,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+&function_begin("aesni_ecb_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &and ($len,-16);
+ &jz (&label("ecb_ret"));
+ &mov ($rounds,&DWP(240,$key));
+ &test ($rounds_,$rounds_);
+ &jz (&label("ecb_decrypt"));
+
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_enc_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+ &call ("_aesni_encrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_enc_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_enc_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_enc_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_enc_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_enc_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+ &call ("_aesni_encrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &jmp (&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_dec_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+ &call ("_aesni_decrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_dec_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_dec_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_dec_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_dec_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_dec_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on a 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details)
+#
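+# Per iteration the interleaved pair of key schedules below computes,
+# in effect (a sketch of the CCM recurrence, cf. the "cmac^=inp" and
+# "inp^=E(ivec)" annotations in the body; decryption MACs the
+# plaintext it recovers):
+#
+#	cmac   = E_K(cmac ^ inp[i])	# CBC-MAC half
+#	out[i] = inp[i] ^ E_K(ivec)	# CTR half
+#	ivec   = ivec + 1		# 64-bit counter
+#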
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &shr ($rounds,1);
+ &lea ($key_,&DWP(0,$key));
+ &movdqa ($inout3,&QWP(0,"esp"));
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds_,$rounds);
+ &pshufb ($ivec,$inout3);
+
+&set_label("ccm64_enc_outer");
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &mov ($rounds,$rounds_);
+ &movups ($in0,&QWP(0,$inp));
+
+ &xorps ($inout0,$rndkey0);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($rndkey0,$in0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($cmac,$rndkey0); # cmac^=inp
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_enc2_loop"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+
+ &dec ($len);
+ &lea ($inp,&DWP(16,$inp));
+ &xorps ($in0,$inout0); # inp^=E(ivec)
+ &movdqa ($inout0,$ivec);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+ &jnz (&label("ccm64_enc_outer"));
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
+ &movdqa ($inout0,$ivec);
+
+ &mov ($key_,$key);
+ &mov ($rounds_,$rounds);
+
+ &pshufb ($ivec,$inout3);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &lea ($inp,&QWP(16,$inp));
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+ &xorps ($in0,$inout0); # inp ^= E(ivec)
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds,$rounds_);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+
+ &sub ($len,1);
+ &jz (&label("ccm64_dec_break"));
+
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($in0,$rndkey0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($inout0,$rndkey0);
+ &xorps ($cmac,$in0); # cmac^=out
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_dec2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_dec2_loop"));
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &lea ($inp,&QWP(16,$inp));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+ &mov ($key,$key_);
+ if ($inline)
+ { &aesni_inline_generate1("enc",$cmac,$in0); }
+ else
+ { &call ("_aesni_encrypt1",$cmac); }
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# stack layout:
+# 0 pshufb mask
+# 16 vector addend: 0,6,6,6
+# 32 counter-less ivec
+# 48 1st triplet of counter vector
+# 64 2nd triplet of counter vector
+# 80 saved %esp
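+#
+# Illustrative counter trace (assuming the IV's big-endian 32-bit
+# counter starts at n): six counters live as two xmm triplets,
+#
+#	1st triplet: n, n+1, n+2	2nd triplet: n+3, n+4, n+5
+#
+# and each loop iteration paddd's the 6,6,6 addend into both, giving
+# n+6..n+11 the next time around.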
+
+&function_begin("aesni_ctr32_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($key_,"esp");
+ &sub ("esp",88);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(80,"esp"),$key_);
+
+ &cmp ($len,1);
+ &je (&label("ctr32_one_shortcut"));
+
+ &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds,6);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$rounds);
+ &mov (&DWP(24,"esp"),$rounds);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
+ &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
+
+ &mov ($rounds,&DWP(240,$key)); # key->rounds
+
+ # compose 2 vectors of 3x32-bit counters
+ &bswap ($rounds_);
+ &pxor ($rndkey1,$rndkey1);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
+ &pinsrd ($rndkey1,$rounds_,0);
+ &lea ($key_,&DWP(3,$rounds_));
+ &pinsrd ($rndkey0,$key_,0);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,1);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,1);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,2);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,2);
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+
+	&pshufd	($inout0,$rndkey1,3<<6);	# place counter in upper dword
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &cmp ($len,6);
+ &jb (&label("ctr32_tail"));
+ &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
+ &shr ($rounds,1);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,6);
+ &jmp (&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout0,$rndkey1); # merge counter-less ivec
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout1,$rndkey1);
+ &pshufd ($inout5,$rndkey0,1<<6);
+ &por ($inout2,$rndkey1);
+ &por ($inout3,$rndkey1);
+ &por ($inout4,$rndkey1);
+ &por ($inout5,$rndkey1);
+
+ # inlining _aesni_encrypt6's prologue gives ~4% improvement...
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &dec ($rounds);
+ &pxor ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,$rndkey0);
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,$rndkey0);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,$rndkey0);
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
+ &xorps ($inout2,$rndkey1);
+ &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+
+ &paddd ($rndkey1,$rndkey0); # 1st triplet increment
+ &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
+
+ &movups ($inout1,&QWP(0x30,$inp));
+ &movups ($inout2,&QWP(0x40,$inp));
+ &xorps ($inout3,$inout1);
+ &movups ($inout1,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &xorps ($inout4,$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &xorps ($inout5,$inout1);
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+ &movups (&QWP(0x40,$out),$inout4);
+ &pshufd ($inout0,$rndkey1,3<<6);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+
+ &mov ($rounds,$rounds_);
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &sub ($len,6);
+ &jnc (&label("ctr32_loop6"));
+
+ &add ($len,6);
+ &jz (&label("ctr32_ret"));
+ &mov ($key,$key_);
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
+
+&set_label("ctr32_tail");
+ &por ($inout0,$inout5);
+ &cmp ($len,2);
+ &jb (&label("ctr32_one"));
+
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &por ($inout1,$inout5);
+ &je (&label("ctr32_two"));
+
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout2,$inout5);
+ &cmp ($len,4);
+ &jb (&label("ctr32_three"));
+
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout3,$inout5);
+ &je (&label("ctr32_four"));
+
+ &por ($inout4,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout2,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout4,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+ &movups ($inout0,&QWP(0,$rounds_)); # load ivec
+ &mov ($rounds,&DWP(240,$key));
+
+&set_label("ctr32_one");
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp));
+ &xorps ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &movups ($inout5,&QWP(0x20,$inp));
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+ &call ("_aesni_encrypt4");
+ &movups ($inout4,&QWP(0,$inp));
+ &movups ($inout5,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout0,$inout4);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout1,$inout5);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+ &mov ("esp",&DWP(80,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
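+# The recurring tweak update in both functions is multiplication by x
+# in GF(2^128) modulo x^128+x^7+x^2+x+1; the pcmpgtd/pshufd/pand/pxor
+# sequences below are a branch-free SSE rendering of
+#
+#	T' = (T << 1) ^ (msb(T) ? 0x87 : 0)
+#
+# where paddq doubles each 64-bit half and the 0x87/1 "magic constant"
+# re-injects the carry out of the low half and folds the 0x87
+# reduction back in when the high half overflows.
+#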
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &and ("esp",-16); # align stack
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,16*6);
+ &jc (&label("xts_enc_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_enc_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+ &add ($len,16*6);
+ &jz (&label("xts_enc_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_enc_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_enc_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_enc_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_enc_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_encrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout2);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+ &movdqa ($inout3,$tweak);
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($inout3,$twtmp,0x13);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+ &movz ($rounds,&BP(0,$inp));
+ &movz ($key,&BP(-16,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(-16,$out),&LB($rounds));
+ &mov (&BP(0,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_enc_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(-16,$out)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(-16,$out),$inout0); # write output
+
+&set_label("xts_enc_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &and ("esp",-16); # align stack
+
+ &xor ($rounds_,$rounds_); # if(len%16) len-=16;
+ &test ($len,15);
+ &setnz (&LB($rounds_));
+ &shl ($rounds_,4);
+ &sub ($len,$rounds_);
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &sub ($len,16*6);
+ &jc (&label("xts_dec_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+	# inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesdec ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesdec ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesdec ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesdec ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesdec ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesdec ($inout5,$rndkey1);
+ &call (&label("_aesni_decrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_dec_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+ &add ($len,16*6);
+ &jz (&label("xts_dec_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_dec_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_dec_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_dec_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_dec_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($inout0,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_decrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_decrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(16*6,"esp"));
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+ &pshufd ($inout3,$twtmp,0x13);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,$twmask); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_steal");
+ &movz ($rounds,&BP(16,$inp));
+ &movz ($key,&BP(0,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(0,$out),&LB($rounds));
+ &mov (&BP(16,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_dec_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$out)); # load input
+ &xorps ($inout0,$inout4); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout4); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_decrypt");
+}
+}
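# [Editor's note] Decrypt-side stealing (xts_dec_done through xts_dec_steal
# above) applies the last two tweaks in swapped order: the final full
# ciphertext block is decrypted with the later tweak, the partial ciphertext
# is grafted over its head, and the grafted block is decrypted with the
# earlier tweak. A C sketch under the same assumptions as the encrypt-side
# one (hypothetical aesni_decrypt_block helper; tweak_m is tweak_m1 doubled
# in GF(2^128)):

    #include <string.h>
    #include <wmmintrin.h>

    /* Hypothetical one-block AES-NI decrypt helper. */
    __m128i aesni_decrypt_block(__m128i in, const __m128i *rk, int nr);

    /* in: last full ciphertext block followed by j tail bytes; out receives
     * one full plaintext block followed by j tail bytes. */
    void xts_dec_tail(unsigned char *out, const unsigned char *in, size_t j,
                      __m128i tweak_m1, __m128i tweak_m,
                      const __m128i *rk, int nr)
    {
        unsigned char pp[16];
        __m128i b = _mm_loadu_si128((const __m128i *)in);
        b = _mm_xor_si128(aesni_decrypt_block(_mm_xor_si128(b, tweak_m), rk, nr),
                          tweak_m);                /* later tweak first        */
        _mm_storeu_si128((__m128i *)pp, b);
        memcpy(out + 16, pp, j);                   /* P[m] = head of PP        */
        memcpy(pp, in + 16, j);                    /* graft C[m] over the head */
        b = _mm_loadu_si128((const __m128i *)pp);
        b = _mm_xor_si128(aesni_decrypt_block(_mm_xor_si128(b, tweak_m1), rk, nr),
                          tweak_m1);               /* earlier tweak last       */
        _mm_storeu_si128((__m128i *)out, b);       /* P[m-1]                   */
    }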
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($rounds_,"esp");
+ &mov ($out,&wparam(1));
+ &sub ($rounds_,24);
+ &mov ($len,&wparam(2));
+ &and ($rounds_,-16);
+ &mov ($key,&wparam(3));
+ &mov ($key_,&wparam(4));
+ &test ($len,$len);
+ &jz (&label("cbc_abort"));
+
+ &cmp (&wparam(5),0);
+ &xchg ($rounds_,"esp"); # alloca
+ &movups ($ivec,&QWP(0,$key_)); # load IV
+ &mov ($rounds,&DWP(240,$key));
+ &mov ($key_,$key); # backup $key
+ &mov (&DWP(16,"esp"),$rounds_); # save original %esp
+ &mov ($rounds_,$rounds); # backup $rounds
+ &je (&label("cbc_decrypt"));
+
+ &movaps ($inout0,$ivec);
+ &cmp ($len,16);
+ &jb (&label("cbc_enc_tail"));
+ &sub ($len,16);
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movups ($ivec,&QWP(0,$inp)); # input actually
+ &lea ($inp,&DWP(16,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc",$inout0,$ivec); }
+ else
+ { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0,$out),$inout0); # store output
+ &lea ($out,&DWP(16,$out));
+ &sub ($len,16);
+ &jnc (&label("cbc_enc_loop"));
+ &add ($len,16);
+ &jnz (&label("cbc_enc_tail"));
+ &movaps ($ivec,$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+ &mov ("ecx",$len); # zaps $rounds
+ &data_word(0xA4F3F689); # rep movsb
+ &mov ("ecx",16); # zero tail
+ &sub ("ecx",$len);
+ &xor ("eax","eax"); # zaps $len
+ &data_word(0xAAF3F689); # rep stosb
+ &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($inp,$out); # $inp and $out are the same
+ &mov ($key,$key_); # restore $key
+ &jmp (&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+ &cmp ($len,0x50);
+ &jbe (&label("cbc_dec_tail"));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+ &movaps (&QWP(0,"esp"),$rndkey0); # save IV
+ &movups (&QWP(0,$out),$inout5);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+
+ &call ("_aesni_decrypt6");
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^=IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout4,$rndkey0);
+ &movups ($rndkey0,&QWP(0x50,$inp)); # IV
+ &xorps ($inout5,$rndkey1);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($inp,&DWP(0x60,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &mov ($rounds,$rounds_) # restore $rounds
+ &movups (&QWP(0x30,$out),$inout3);
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0x40,$out),$inout4);
+ &lea ($out,&DWP(0x50,$out));
+ &sub ($len,0x60);
+ &ja (&label("cbc_dec_loop6"));
+
+ &movaps ($inout0,$inout5);
+ &movaps ($ivec,$rndkey0);
+ &add ($len,0x50);
+ &jle (&label("cbc_dec_tail_collected"));
+ &movups (&QWP(0,$out),$inout0);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &movaps ($in0,$inout0);
+ &cmp ($len,0x10);
+ &jbe (&label("cbc_dec_one"));
+
+ &movups ($inout1,&QWP(0x10,$inp));
+ &movaps ($in1,$inout1);
+ &cmp ($len,0x20);
+ &jbe (&label("cbc_dec_two"));
+
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x30);
+ &jbe (&label("cbc_dec_three"));
+
+ &movups ($inout3,&QWP(0x30,$inp));
+ &cmp ($len,0x40);
+ &jbe (&label("cbc_dec_four"));
+
+ &movups ($inout4,&QWP(0x40,$inp));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &movups ($inout0,&QWP(0,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^= IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($ivec,&QWP(0x40,$inp)); # IV
+ &xorps ($inout4,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &lea ($out,&DWP(0x40,$out));
+ &movaps ($inout0,$inout4);
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$ivec);
+ &movaps ($ivec,$in0);
+ &sub ($len,0x10);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout1);
+ &lea ($out,&DWP(0x10,$out));
+ &movaps ($ivec,$in1);
+ &sub ($len,0x20);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &xorps ($inout2,$in1);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout2);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($out,&DWP(0x20,$out));
+ &movups ($ivec,&QWP(0x20,$inp));
+ &sub ($len,0x30);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups ($rndkey1,&QWP(0x10,$inp));
+ &movups ($rndkey0,&QWP(0x20,$inp));
+ &xorps ($inout0,$ivec);
+ &movups ($ivec,&QWP(0x30,$inp));
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &lea ($out,&DWP(0x30,$out));
+ &movaps ($inout0,$inout3);
+ &sub ($len,0x40);
+
+&set_label("cbc_dec_tail_collected");
+ &and ($len,15);
+ &jnz (&label("cbc_dec_tail_partial"));
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+ &movaps (&QWP(0,"esp"),$inout0);
+ &mov ("ecx",16);
+ &mov ($inp,"esp");
+ &sub ("ecx",$len);
+ &data_word(0xA4F3F689); # rep movsb
+
+&set_label("cbc_ret");
+ &mov ("esp",&DWP(16,"esp")); # pull original %esp
+ &mov ($key_,&wparam(4));
+ &movups (&QWP(0,$key_),$ivec); # output IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is private interface,
+# input:
+# "eax" const unsigned char *userKey
+# $rounds int bits
+# $key AES_KEY *key
+# output:
+# "eax" return code
+#	$rounds	rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+ &test ("eax","eax");
+ &jz (&label("bad_pointer"));
+ &test ($key,$key);
+ &jz (&label("bad_pointer"));
+
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &lea ($key,&DWP(16,$key));
+ &cmp ($rounds,256);
+ &je (&label("14rounds"));
+ &cmp ($rounds,192);
+ &je (&label("12rounds"));
+ &cmp ($rounds,128);
+ &jne (&label("bad_keybits"));
+
+&set_label("10rounds",16);
+ &mov ($rounds,9);
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 1
+ &call (&label("key_128_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x2); # round 2
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 3
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 4
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 5
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 6
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x40); # round 7
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x80); # round 8
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x36); # round 10
+ &call (&label("key_128"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(80,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_128",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_128_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("12rounds",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &mov ($rounds,11);
+ &$movekey (&QWP(-16,$key),"xmm0") # round 0
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
+ &call (&label("key_192a_cold"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
+ &call (&label("key_192b"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(48,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_192a",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+ &movaps ("xmm5","xmm2");
+&set_label("key_192b_warm");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &movdqa ("xmm3","xmm2");
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &pslldq ("xmm3",4);
+ &xorps ("xmm0","xmm4");
+ &pshufd ("xmm1","xmm1",0b01010101); # critical path
+ &pxor ("xmm2","xmm3");
+ &pxor ("xmm0","xmm1");
+ &pshufd ("xmm3","xmm0",0b11111111);
+ &pxor ("xmm2","xmm3");
+ &ret();
+
+&set_label("key_192b",16);
+ &movaps ("xmm3","xmm0");
+ &shufps ("xmm5","xmm0",0b01000100);
+ &$movekey (&QWP(0,$key),"xmm5");
+ &shufps ("xmm3","xmm2",0b01001110);
+ &$movekey (&QWP(16,$key),"xmm3");
+ &lea ($key,&DWP(32,$key));
+ &jmp (&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
+ &mov ($rounds,13);
+ &lea ($key,&DWP(16,$key));
+ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
+ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
+ &call (&label("key_256a_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
+ &call (&label("key_256a"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(16,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_256a",16);
+ &$movekey (&QWP(0,$key),"xmm2");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("key_256b",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+
+ &shufps ("xmm4","xmm2",0b00010000);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm4","xmm2",0b10001100);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm1","xmm1",0b10101010); # critical path
+ &xorps ("xmm2","xmm1");
+ &ret();
+
+&set_label("bad_pointer",4);
+ &mov ("eax",-1);
+ &ret ();
+&set_label("bad_keybits",4);
+ &mov ("eax",-2);
+ &ret ();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &ret ();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &mov ($key,&wparam(2));
+ &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key
+ &test ("eax","eax");
+ &jnz (&label("dec_key_ret"));
+ &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
+
+ &$movekey ("xmm0",&QWP(0,$key)); # just swap
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &$movekey (&QWP(0,"eax"),"xmm0");
+ &$movekey (&QWP(0,$key),"xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+ &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &aesimc ("xmm0","xmm0");
+ &aesimc ("xmm1","xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+ &$movekey (&QWP(16,"eax"),"xmm0");
+ &$movekey (&QWP(-16,$key),"xmm1");
+ &cmp ("eax",$key);
+ &ja (&label("dec_key_inverse"));
+
+ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
+ &aesimc ("xmm0","xmm0");
+ &$movekey (&QWP(0,$key),"xmm0");
+
+ &xor ("eax","eax"); # return success
+&set_label("dec_key_ret");
+ &ret ();
+&function_end_B("${PREFIX}_set_decrypt_key");
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by ");
+
+&asm_finish();
diff --git a/crypto/aes/asm/aesni-x86_64.S b/crypto/aes/asm/aesni-x86_64.S
new file mode 100644
index 0000000..917c832
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86_64.S
@@ -0,0 +1,2535 @@
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_enc1_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_encrypt,.-aesni_encrypt
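# [Editor's note] The .Loop_enc1_1 body above is one aesenc per round key with
# aesenclast at the end; note the count read from 240(%rdx) is what this
# code's key setup stores, which is one less than the textbook round count
# (9/11/13). A sketch with intrinsics, written against the textbook count nr:

    #include <wmmintrin.h>

    /* rk[0..nr] is the expanded schedule; nr is 10, 12 or 14. */
    __m128i aesni_encrypt_block(__m128i in, const __m128i *rk, int nr)
    {
        __m128i b = _mm_xor_si128(in, rk[0]);      /* round-0 whitening */
        for (int i = 1; i < nr; i++)
            b = _mm_aesenc_si128(b, rk[i]);        /* middle rounds     */
        return _mm_aesenclast_si128(b, rk[nr]);    /* final round       */
    }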
+
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_dec1_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_decrypt, .-aesni_decrypt
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Lenc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ .byte 0xf3,0xc3
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Ldec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ .byte 0xf3,0xc3
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Lenc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ .byte 0xf3,0xc3
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Ldec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ .byte 0xf3,0xc3
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,220,249
+ jmp .Lenc_loop6_enter
+.align 16
+.Lenc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lenc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ .byte 0xf3,0xc3
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,222,249
+ jmp .Ldec_loop6_enter
+.align 16
+.Ldec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Ldec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ .byte 0xf3,0xc3
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.type _aesni_encrypt8,@function
+.align 16
+_aesni_encrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,220,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+ jmp .Lenc_loop8_enter
+.align 16
+.Lenc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+.Lenc_loop8_enter:
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ .byte 0xf3,0xc3
+.size _aesni_encrypt8,.-_aesni_encrypt8
+.type _aesni_decrypt8,@function
+.align 16
+_aesni_decrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+ jmp .Ldec_loop8_enter
+.align 16
+.Ldec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+.Ldec_loop8_enter:
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ .byte 0xf3,0xc3
+.size _aesni_decrypt8,.-_aesni_decrypt8
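# [Editor's note] The 3/4/6/8-block helpers exist because aesenc has
# multi-cycle latency but pipelines: issuing the same round over several
# independent blocks hides that latency, which is why the ECB, CTR and XTS
# code below batches six or eight blocks per iteration. The two-block shape
# of the idea, as a sketch:

    #include <wmmintrin.h>

    static void aesni_encrypt2(__m128i *b0, __m128i *b1,
                               const __m128i *rk, int nr)
    {
        __m128i x = _mm_xor_si128(*b0, rk[0]);
        __m128i y = _mm_xor_si128(*b1, rk[0]);
        for (int i = 1; i < nr; i++) {
            x = _mm_aesenc_si128(x, rk[i]);   /* the two aesenc chains are */
            y = _mm_aesenc_si128(y, rk[i]);   /* independent and overlap   */
        }
        *b0 = _mm_aesenclast_si128(x, rk[nr]);
        *b1 = _mm_aesenclast_si128(y, rk[nr]);
    }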
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+ andq $-16,%rdx
+ jz .Lecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz .Lecb_decrypt
+
+ cmpq $128,%rdx
+ jb .Lecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $128,%rdx
+ jnc .Lecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_enc_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_enc_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_enc_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp .Lecb_ret
+
+.align 16
+.Lecb_decrypt:
+ cmpq $128,%rdx
+ jb .Lecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $128,%rdx
+ jnc .Lecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_dec_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_dec_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_dec_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+
+.Lecb_ret:
+ .byte 0xf3,0xc3
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+ movl 240(%rcx),%eax
+ movdqu (%r8),%xmm9
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ shrl $1,%eax
+ leaq 0(%rcx),%r11
+ movdqu (%r9),%xmm3
+ movdqa %xmm9,%xmm2
+ movl %eax,%r10d
+.byte 102,68,15,56,0,207
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ movups (%r11),%xmm0
+ movl %r10d,%eax
+ movups (%rdi),%xmm8
+
+ xorps %xmm0,%xmm2
+ movups 16(%r11),%xmm1
+ xorps %xmm8,%xmm0
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq %xmm6,%xmm9
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+
+ decq %rdx
+ leaq 16(%rdi),%rdi
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+ jnz .Lccm64_enc_outer
+
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
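# [Editor's note] Per block, .Lccm64_enc2_loop drives two encryptions through
# the schedule together: the CTR block that becomes keystream and the CBC-MAC
# state freshened with the plaintext. One step in C, as a sketch
# (aesni_encrypt_block as sketched above; the counter byteswap/increment is
# elided):

    #include <wmmintrin.h>

    __m128i aesni_encrypt_block(__m128i in, const __m128i *rk, int nr);

    void ccm64_enc_step(__m128i pt, __m128i ctr_block, __m128i *cmac,
                        __m128i *ct, const __m128i *rk, int nr)
    {
        __m128i ks = aesni_encrypt_block(ctr_block, rk, nr);  /* keystream */
        *cmac = aesni_encrypt_block(_mm_xor_si128(*cmac, pt), /* CBC-MAC   */
                                    rk, nr);
        *ct = _mm_xor_si128(pt, ks);                          /* encrypt   */
    }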
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+ movl 240(%rcx),%eax
+ movups (%r8),%xmm9
+ movdqu (%r9),%xmm3
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ movaps %xmm9,%xmm2
+ movl %eax,%r10d
+ movq %rcx,%r11
+.byte 102,68,15,56,0,207
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_5:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_5
+.byte 102,15,56,221,209
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+ leaq 16(%rdi),%rdi
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movl %r10d,%eax
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+
+ subq $1,%rdx
+ jz .Lccm64_dec_break
+
+ movups (%r11),%xmm0
+ shrl $1,%eax
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm8,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_dec2_loop
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 16(%rdi),%rdi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+
+ movups (%r11),%xmm0
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%r11
+ xorps %xmm8,%xmm3
+.Loop_enc1_6:
+.byte 102,15,56,220,217
+ decl %eax
+ movups (%r11),%xmm1
+ leaq 16(%r11),%r11
+ jnz .Loop_enc1_6
+.byte 102,15,56,221,217
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ je .Lctr32_one_shortcut
+
+ movdqu (%r8),%xmm14
+ movdqa .Lbswap_mask(%rip),%xmm15
+ xorl %eax,%eax
+.byte 102,69,15,58,22,242,3
+.byte 102,68,15,58,34,240,3
+
+ movl 240(%rcx),%eax
+ bswapl %r10d
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+.byte 102,69,15,58,34,226,0
+ leaq 3(%r10),%r11
+.byte 102,69,15,58,34,235,0
+ incl %r10d
+.byte 102,69,15,58,34,226,1
+ incq %r11
+.byte 102,69,15,58,34,235,1
+ incl %r10d
+.byte 102,69,15,58,34,226,2
+ incq %r11
+.byte 102,69,15,58,34,235,2
+ movdqa %xmm12,-40(%rsp)
+.byte 102,69,15,56,0,231
+ movdqa %xmm13,-24(%rsp)
+.byte 102,69,15,56,0,239
+
+ pshufd $192,%xmm12,%xmm2
+ pshufd $128,%xmm12,%xmm3
+ pshufd $64,%xmm12,%xmm4
+ cmpq $6,%rdx
+ jb .Lctr32_tail
+ shrl $1,%eax
+ movq %rcx,%r11
+ movl %eax,%r10d
+ subq $6,%rdx
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm2
+ movups (%r11),%xmm0
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm3
+ movups 16(%r11),%xmm1
+ pshufd $64,%xmm13,%xmm7
+ por %xmm14,%xmm4
+ por %xmm14,%xmm5
+ xorps %xmm0,%xmm2
+ por %xmm14,%xmm6
+ por %xmm14,%xmm7
+
+
+
+
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ movdqa .Lincrement32(%rip),%xmm13
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ movdqa -40(%rsp),%xmm12
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lctr32_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lctr32_enc_loop6
+
+.byte 102,15,56,220,209
+ paddd %xmm13,%xmm12
+.byte 102,15,56,220,217
+ paddd -24(%rsp),%xmm13
+.byte 102,15,56,220,225
+ movdqa %xmm12,-40(%rsp)
+.byte 102,15,56,220,233
+ movdqa %xmm13,-24(%rsp)
+.byte 102,15,56,220,241
+.byte 102,69,15,56,0,231
+.byte 102,15,56,220,249
+.byte 102,69,15,56,0,239
+
+.byte 102,15,56,221,208
+ movups (%rdi),%xmm8
+.byte 102,15,56,221,216
+ movups 16(%rdi),%xmm9
+.byte 102,15,56,221,224
+ movups 32(%rdi),%xmm10
+.byte 102,15,56,221,232
+ movups 48(%rdi),%xmm11
+.byte 102,15,56,221,240
+ movups 64(%rdi),%xmm1
+.byte 102,15,56,221,248
+ movups 80(%rdi),%xmm0
+ leaq 96(%rdi),%rdi
+
+ xorps %xmm2,%xmm8
+ pshufd $192,%xmm12,%xmm2
+ xorps %xmm3,%xmm9
+ pshufd $128,%xmm12,%xmm3
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ pshufd $64,%xmm12,%xmm4
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ xorps %xmm7,%xmm0
+ movups %xmm1,64(%rsi)
+ movups %xmm0,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movl %r10d,%eax
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+ movq %r11,%rcx
+ leal 1(%rax,%rax,1),%eax
+
+.Lctr32_tail:
+ por %xmm14,%xmm2
+ movups (%rdi),%xmm8
+ cmpq $2,%rdx
+ jb .Lctr32_one
+
+ por %xmm14,%xmm3
+ movups 16(%rdi),%xmm9
+ je .Lctr32_two
+
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm4
+ movups 32(%rdi),%xmm10
+ cmpq $4,%rdx
+ jb .Lctr32_three
+
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm5
+ movups 48(%rdi),%xmm11
+ je .Lctr32_four
+
+ por %xmm14,%xmm6
+ xorps %xmm7,%xmm7
+
+ call _aesni_encrypt6
+
+ movups 64(%rdi),%xmm1
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ movups %xmm1,64(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm8
+ movl 240(%rcx),%eax
+.Lctr32_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ xorps %xmm2,%xmm8
+ movups %xmm8,(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ movups %xmm9,16(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ movups %xmm10,32(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ movups %xmm11,48(%rsi)
+
+.Lctr32_done:
+ .byte 0xf3,0xc3
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
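# [Editor's note] As the function name says, only the low 32 bits of the
# counter block participate: the pshufb/paddd pair above increments a
# big-endian 32-bit word while the upper 96 bits ride along unchanged.
# Byte-wise, the update is simply:

    /* Increment the big-endian 32-bit word in bytes 12..15 of the counter
     * block; bytes 0..11 are never touched, which is the ctr32 contract. */
    static void ctr32_inc(unsigned char counter[16])
    {
        for (int i = 15; i >= 12; i--)
            if (++counter[i] != 0)
                break;                 /* carry stops inside the low word */
    }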
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_8:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_8
+.byte 102,68,15,56,221,249
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_enc_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,220,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,220,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lxts_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_enc_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,221,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,221,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_enc_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_enc_short:
+ addq $96,%rdx
+ jz .Lxts_enc_done
+
+ cmpq $32,%rdx
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmpq $64,%rdx
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_encrypt6
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_9:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_9
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_encrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ andq $15,%r9
+ jz .Lxts_enc_ret
+ movq %r9,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%rdi),%eax
+ movzbl -16(%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,-16(%rsi)
+ movb %cl,0(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_enc_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups -16(%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_10:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_10
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,-16(%rsi)
+
+.Lxts_enc_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_11:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_11
+.byte 102,68,15,56,221,249
+ xorl %eax,%eax
+ testq $15,%rdx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%rdx
+
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
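+# Generate the first tweaks: each block below multiplies the tweak by x
+# in GF(2^128). paddq doubles both 64-bit halves, while the
+# pcmpgtd/pshufd/pand sequence recovers the carries (including the one
+# out of bit 127) so that pxor can fold them back in, reducing modulo
+# x^128 + x^7 + x^2 + x + 1 (the 0x87 in .Lxts_magic).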
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_dec_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,222,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,222,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,222,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Lxts_dec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_dec_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,223,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,223,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_dec_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_dec_short:
+ addq $96,%rdx
+ jz .Lxts_dec_done
+
+ cmpq $32,%rdx
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmpq $64,%rdx
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_decrypt6
+
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm14
+ movdqu %xmm5,48(%rsi)
+ pcmpgtd %xmm15,%xmm14
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ pshufd $19,%xmm14,%xmm11
+ andq $15,%r9
+ jz .Lxts_dec_ret
+
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm11
+ pxor %xmm15,%xmm11
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_12:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_12
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ movdqa %xmm12,%xmm11
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm13,%xmm11
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movups (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movups 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_decrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ andq $15,%r9
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ movq %r9,%rdx
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rdi),%xmm2
+ xorps %xmm11,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_13:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_13
+.byte 102,15,56,223,209
+ xorps %xmm11,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_steal:
+ movzbl 16(%rdi),%eax
+ movzbl (%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,(%rsi)
+ movb %cl,16(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_dec_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_14:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_14
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_dec_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+ testq %rdx,%rdx
+ jz .Lcbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz .Lcbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb .Lcbc_enc_tail
+ subq $16,%rdx
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+.Loop_enc1_15:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_15
+.byte 102,15,56,221,209
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc .Lcbc_enc_loop
+ addq $16,%rdx
+ jnz .Lcbc_enc_tail
+ movups %xmm2,(%r8)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
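+# Partial final block: the two .long constants below hand-encode
+# "rep movsb" (0x9066A4F3, copy the remaining input bytes) and
+# "rep stosb" (0x9066AAF3, zero-pad to a full 16 bytes), each padded
+# with a two-byte nop, before one more pass through the encryption
+# loop.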
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long 0x9066A4F3
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long 0x9066AAF3
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+ movups (%r8),%xmm9
+ movl %r10d,%eax
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_tail
+ shrl $1,%r10d
+ subq $112,%rdx
+ movl %r10d,%eax
+ movaps %xmm9,-24(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps %xmm0,-24(%rsp)
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+ movups (%rcx),%xmm0
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 16(%rcx),%xmm1
+
+ leaq 32(%rcx),%rcx
+ movdqu 32(%rdi),%xmm4
+ xorps %xmm0,%xmm2
+ movdqu 48(%rdi),%xmm5
+ xorps %xmm0,%xmm3
+ movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+ movdqu 80(%rdi),%xmm7
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqu 96(%rdi),%xmm8
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqu 112(%rdi),%xmm9
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+
+ call .Ldec_loop8_enter
+
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm1
+ xorps %xmm0,%xmm8
+ movups 112(%rdi),%xmm0
+ xorps %xmm1,%xmm9
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movq %r11,%rcx
+ movups %xmm7,80(%rsi)
+ leaq 128(%rdi),%rdi
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+ subq $128,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ movaps %xmm0,%xmm9
+ addq $112,%rdx
+ jle .Lcbc_dec_tail_collected
+ movups %xmm2,(%rsi)
+ leal 1(%r10,%r10,1),%eax
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+ movaps %xmm2,%xmm8
+ cmpq $16,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm3,%xmm7
+ cmpq $32,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm4,%xmm6
+ cmpq $48,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ cmpq $64,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_five
+
+ movups 80(%rdi),%xmm7
+ cmpq $96,%rdx
+ jbe .Lcbc_dec_six
+
+ movups 96(%rdi),%xmm8
+ movaps %xmm9,-24(%rsp)
+ call _aesni_decrypt8
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm9
+ xorps %xmm0,%xmm8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movaps %xmm8,%xmm2
+ subq $112,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ xorps %xmm9,%xmm2
+ movaps %xmm8,%xmm9
+ subq $16,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ movaps %xmm7,%xmm9
+ movaps %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+ subq $32,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ movaps %xmm6,%xmm9
+ movaps %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+ subq $48,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps %xmm9,%xmm2
+ movups 48(%rdi),%xmm9
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ xorps %xmm6,%xmm5
+ movups %xmm4,32(%rsi)
+ movaps %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+ subq $64,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm9
+ xorps %xmm1,%xmm6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movaps %xmm6,%xmm2
+ subq $80,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm0
+ xorps %xmm1,%xmm6
+ movups 80(%rdi),%xmm9
+ xorps %xmm0,%xmm7
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movaps %xmm7,%xmm2
+ subq $96,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ andq $15,%rdx
+ movups %xmm9,(%r8)
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps %xmm2,-24(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq -24(%rsp),%rsi
+.long 0x9066A4F3
+
+.Lcbc_dec_ret:
+.Lcbc_ret:
+ .byte 0xf3,0xc3
+.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.byte 0x48,0x83,0xEC,0x08
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz .Ldec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
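+# Build the decryption key schedule in place: walk the encryption
+# schedule from both ends, swapping round keys while applying
+# InvMixColumns (.byte 102,15,56,219,... encodes aesimc) to every round
+# key except the first and last.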
+.Ldec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja .Ldec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%rdi)
+.Ldec_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_decrypt_key:
+.size aesni_set_decrypt_key,.-aesni_set_decrypt_key
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
+.byte 0x48,0x83,0xEC,0x08
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz .Lenc_key_ret
+ testq %rdx,%rdx
+ jz .Lenc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je .L14rounds
+ cmpl $192,%esi
+ je .L12rounds
+ cmpl $128,%esi
+ jne .Lbad_keybits
+
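+# In the expansion below, each .byte 102,15,58,223,200,<rcon> encodes
+# aeskeygenassist $<rcon>,%xmm0,%xmm1 with the successive round
+# constants 1,2,4,...,0x1b,0x36; the .Lkey_expansion_128 helper folds
+# each result into the previous round key.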
+.L10rounds:
+ movl $9,%esi
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,64
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,128
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,27
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,54
+ call .Lkey_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,128
+ call .Lkey_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ movq $-2,%rax
+.Lenc_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ .byte 0xf3,0xc3
+.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long 6,6,6,0
+.Lincrement64:
+.long 1,0,0,0
+.Lxts_magic:
+.long 0x87,0,1,0
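+# Constants: .Lbswap_mask byte-reverses a counter for big-endian
+# arithmetic, .Lincrement32 advances the 32-bit CTR counters by the 6x
+# interleave factor, .Lincrement64 steps the 64-bit CCM counter, and
+# .Lxts_magic is the 0x87 reduction constant for the XTS tweak
+# schedule above.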
+
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
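+# (The .byte string above decodes to "AES for Intel AES-NI, CRYPTOGAMS
+# by <appro@openssl.org>".)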
+.align 64
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
new file mode 100644
index 0000000..0dbb194
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -0,0 +1,3069 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
+# [see below for
+# details].
+#
+# Performance.
+#
+# Given aes(enc|dec) instructions' latency, asymptotic performance for
+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+# processed with a 128-bit key. And given their throughput, asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# an asymptotic limit, it's not something you commonly achieve in
+# reality, but how close does one get? Below are results collected for
+# different modes and block sizes. Pairs of numbers are for en-/
+# decryption.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
+# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
+# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
+# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
+# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that 'openssl speed -evp aes-128-??? -engine aesni [-decrypt]', which
+# does go through EVP, will exhibit 10-15% worse results for smaller
+# blocks.
+# The results were collected with a specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All above results are consistently better. This
+# module also provides better performance for block sizes smaller than
+# 128 bytes, at points *not* represented in the above table.
+#
+# Looking at the results for 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because the implementation
+# uses the "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+# single-block aesni_encrypt, which is not the most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to the AES unit the way it's done in CBC mode. There
+# is nothing one can do and the result appears optimal. The CCM result
+# is identical to CBC, because CBC-MAC is essentially CBC encrypt
+# without saving output. The CCM CTR part "stays invisible," because
+# it's neatly interleaved with CBC-MAC. This provides ~30% improvement
+# over a "straightforward" CCM implementation with CTR and CBC-MAC
+# performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90%, of the "8-KB"
+# one. The CTR curve doesn't follow this pattern and is the one that
+# changes slowest, with the "256-byte" result at 87% of the "8-KB" one.
+# This is because the overhead in CTR mode is the most computationally
+# intensive. Small-block CCM decrypt is slower than encrypt, because
+# the first CTR and last CBC-MAC
+# iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with the number of
+# rounds for larger block sizes, i.e. the 192-bit result being 10/12
+# times lower and the 256-bit one 10/14. Well, in the CBC encrypt case
+# the differences are a tad smaller, because the above-mentioned
+# penalty biases all results by the same constant value. In a similar
+# way, function call
+# overhead affects small-block performance, as well as OFB and CFB
+# results. Differences are not large, most common coefficients are
+# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one can
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While Westmere processor features 6 cycles latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with
+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
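+#
+# (A quick sanity check of that arithmetic: 6 cycles x 10 rounds / 16
+# bytes = 3.75 cycles per byte on Westmere and 8 x 10 / 16 = 5.00 on
+# Sandy Bridge for serial CBC encrypt; with aes[enc|dec] issued every
+# other cycle on Westmere, fully parallel modes bottom out at
+# 2 x 10 / 16 = 1.25 cycles per byte.)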
+#
+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In which case asymptotic limit for such modes
+# can be obtained by dividing above mentioned numbers by AES
+# instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that optimal interleave factor is 3,
+# and that's where the "magic" number of 1.25 comes from. "Optimal
+# interleave factor" means that increase of interleave factor does
+# not improve performance. The formula has proven to reflect reality
+# pretty well on Westmere... Sandy Bridge on the other hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor 3x 6x 8x
+# theoretical asymptotic limit 1.67 0.83 0.625
+# measured performance for 8KB block 1.05 0.86 0.84
+#
+# "as if" interleave factor 4.7x 5.8x 6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt 1.16 0.93 0.93
+# CTR 1.14 0.91 n/a
+#
+# Well, given the 3x column, it's probably inappropriate to call the
+# limit asymptotic if it can be surpassed, isn't it? What happens there?
+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+# magic is responsible for this. Processor overlaps not only the
+# additional instructions with AES ones, but even AES instructions
+# processing adjacent triplets of independent blocks. In the 6x case
+# additional instructions still claim disproportionally small amount
+# of additional cycles, but in 8x case number of instructions must be
+# a tad too high for out-of-order logic to cope with, and AES unit
+# remains underutilized... As you can see, 8x interleave is hardly
+# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
+# utilizes 6x interleave because of limited register bank capacity.
+#
+# Higher interleave factors do have negative impact on Westmere
+# performance. While for ECB mode it's negligible, ~1.5%, other
+# parallelizable modes perform ~5% worse, which is outweighed by the
+# ~25% improvement on Sandy Bridge. To balance the regression on
+# Westmere, CTR mode was implemented with a 6x aesenc interleave
+# factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with a 128-bit key; Sandy Bridge - 0.97. Just
+# like in CTR mode, the AES instruction interleave factor was chosen
+# to be 6x.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+$code=".text\n";
+
+$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8"; # cbc, ctr, ...
+
+$rnds_="%r10d"; # backup copy for $rounds
+$key_="%r11"; # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0"; $rndkey1="%xmm1";
+$inout0="%xmm2"; $inout1="%xmm3";
+$inout2="%xmm4"; $inout3="%xmm5";
+$inout4="%xmm6"; $inout5="%xmm7";
+$inout6="%xmm8"; $inout7="%xmm9";
+
+$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+$in0="%xmm8"; $iv="%xmm9";
+
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why a folded loop? Because aes[enc|dec] is slow enough to accommodate
+# cycles which take care of loop variables...
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+___
+}}
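+# (With $p set to "enc" this is what emits the .Loop_enc1_<n> loops
+# visible throughout the pre-generated assembly earlier in this patch.)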
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why were 3x subroutines originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when the subroutine's throughput is virtually the
+# same as that of the non-interleaved subroutine [for up to 3 input
+# blocks]. This is why it makes no sense to implement a 2x subroutine.
+# aes[enc|dec] latency in the next processor generation is 8, but the
+# instructions can be scheduled every cycle. The optimal interleave for
+# the new processor is therefore 8x...
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] the 4-block case, by ~30%. One can
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout5
+ jmp .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+.L${dir}_loop6_enter: # happens to be 16-byte aligned
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+ jmp .L${dir}_loop8_enter
+.align 16
+.L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+.L${dir}_loop8_enter: # happens to be 16-byte aligned
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+ and \$-16,$len
+ jz .Lecb_ret
+
+ mov 240($key),$rounds # key->rounds
+ $movkey ($key),$rndkey0
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ test %r8d,%r8d # 5th argument
+ jz .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+ cmp \$0x80,$len
+ jb .Lecb_enc_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ sub \$0x80,$len
+ jnc .Lecb_enc_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_enc_one
+ movups 0x10($inp),$inout1
+ je .Lecb_enc_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_enc_three
+ movups 0x30($inp),$inout3
+ je .Lecb_enc_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_enc_five
+ movups 0x50($inp),$inout5
+ je .Lecb_enc_six
+ movdqu 0x60($inp),$inout6
+ call _aesni_encrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps $inout5,$inout5
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ jmp .Lecb_ret
+#--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+ cmp \$0x80,$len
+ jb .Lecb_dec_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ $movkey ($key_),$rndkey0
+ sub \$0x80,$len
+ jnc .Lecb_dec_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_dec_one
+ movups 0x10($inp),$inout1
+ je .Lecb_dec_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_dec_three
+ movups 0x30($inp),$inout3
+ je .Lecb_dec_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_dec_five
+ movups 0x50($inp),$inout5
+ je .Lecb_dec_six
+ movups 0x60($inp),$inout6
+ $movkey ($key),$rndkey0
+ call _aesni_decrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+
+.Lecb_ret:
+ ret
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{
+my $cmac="%r9"; # 6th argument
+
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_enc_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shr \$1,$rounds
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ mov $rounds,$rnds_
+ pshufb $bswap_mask,$iv
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov $rnds_,$rounds
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey ($key),$rndkey0
+
+.Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ dec $len
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+ jnz .Lccm64_enc_outer
+
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_dec_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ mov $rnds_,$rounds
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len
+ jz .Lccm64_dec_break
+
+ $movkey ($key_),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey ($key),$rndkey0
+
+.Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 16($inp),$inp
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out
+___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+{
+my $reserved = $win64?0:-0x28;
+my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
+my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
+my $bswap_mask="%xmm15";
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0xc8(%rsp),%rsp
+ movaps %xmm6,0x20(%rsp)
+ movaps %xmm7,0x30(%rsp)
+ movaps %xmm8,0x40(%rsp)
+ movaps %xmm9,0x50(%rsp)
+ movaps %xmm10,0x60(%rsp)
+ movaps %xmm11,0x70(%rsp)
+ movaps %xmm12,0x80(%rsp)
+ movaps %xmm13,0x90(%rsp)
+ movaps %xmm14,0xa0(%rsp)
+ movaps %xmm15,0xb0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+ cmp \$1,$len
+ je .Lctr32_one_shortcut
+
+ movdqu ($ivp),$ivec
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+ xor $rounds,$rounds
+ pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
+ pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
+
+ mov 240($key),$rounds # key->rounds
+ bswap $rnds_
+ pxor $iv0,$iv0 # vector of 3 32-bit counters
+ pxor $iv1,$iv1 # vector of 3 32-bit counters
+ pinsrd \$0,$rnds_,$iv0
+ lea 3($rnds_),$key_
+ pinsrd \$0,$key_,$iv1
+ inc $rnds_
+ pinsrd \$1,$rnds_,$iv0
+ inc $key_
+ pinsrd \$1,$key_,$iv1
+ inc $rnds_
+ pinsrd \$2,$rnds_,$iv0
+ inc $key_
+ pinsrd \$2,$key_,$iv1
+ movdqa $iv0,$reserved(%rsp)
+ pshufb $bswap_mask,$iv0
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ pshufb $bswap_mask,$iv1
+
+ pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
+ pshufd \$`2<<6`,$iv0,$inout1
+ pshufd \$`1<<6`,$iv0,$inout2
+ cmp \$6,$len
+ jb .Lctr32_tail
+ shr \$1,$rounds
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ sub \$6,$len
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout0 # merge counter-less ivec
+ $movkey ($key_),$rndkey0
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout1
+ $movkey 16($key_),$rndkey1
+ pshufd \$`1<<6`,$iv1,$inout5
+ por $ivec,$inout2
+ por $ivec,$inout3
+ xorps $rndkey0,$inout0
+ por $ivec,$inout4
+ por $ivec,$inout5
+
+ # inline _aesni_encrypt6 and interleave last rounds
+ # with own code...
+
+ pxor $rndkey0,$inout1
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ aesenc $rndkey1,$inout1
+ movdqa .Lincrement32(%rip),$iv1
+ pxor $rndkey0,$inout3
+ aesenc $rndkey1,$inout2
+ movdqa $reserved(%rsp),$iv0
+ pxor $rndkey0,$inout4
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lctr32_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lctr32_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ paddd $iv1,$iv0 # increment counter vector
+ aesenc $rndkey1,$inout1
+ paddd `$reserved+0x10`(%rsp),$iv1
+ aesenc $rndkey1,$inout2
+ movdqa $iv0,$reserved(%rsp) # save counter vector
+ aesenc $rndkey1,$inout3
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ aesenc $rndkey1,$inout4
+ pshufb $bswap_mask,$iv0 # byte swap
+ aesenc $rndkey1,$inout5
+ pshufb $bswap_mask,$iv1
+
+ aesenclast $rndkey0,$inout0
+ movups ($inp),$in0 # load input
+ aesenclast $rndkey0,$inout1
+ movups 0x10($inp),$in1
+ aesenclast $rndkey0,$inout2
+ movups 0x20($inp),$in2
+ aesenclast $rndkey0,$inout3
+ movups 0x30($inp),$in3
+ aesenclast $rndkey0,$inout4
+ movups 0x40($inp),$rndkey1
+ aesenclast $rndkey0,$inout5
+ movups 0x50($inp),$rndkey0
+ lea 0x60($inp),$inp
+
+ xorps $inout0,$in0 # xor
+ pshufd \$`3<<6`,$iv0,$inout0
+ xorps $inout1,$in1
+ pshufd \$`2<<6`,$iv0,$inout1
+ movups $in0,($out) # store output
+ xorps $inout2,$in2
+ pshufd \$`1<<6`,$iv0,$inout2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ xorps $inout5,$rndkey0
+ movups $rndkey1,0x40($out)
+ movups $rndkey0,0x50($out)
+ lea 0x60($out),$out
+ mov $rnds_,$rounds
+ sub \$6,$len
+ jnc .Lctr32_loop6
+
+ add \$6,$len
+ jz .Lctr32_done
+ mov $key_,$key # restore $key
+ lea 1($rounds,$rounds),$rounds # restore original value
+
+.Lctr32_tail:
+ por $ivec,$inout0
+ movups ($inp),$in0
+ cmp \$2,$len
+ jb .Lctr32_one
+
+ por $ivec,$inout1
+ movups 0x10($inp),$in1
+ je .Lctr32_two
+
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout2
+ movups 0x20($inp),$in2
+ cmp \$4,$len
+ jb .Lctr32_three
+
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout3
+ movups 0x30($inp),$in3
+ je .Lctr32_four
+
+ por $ivec,$inout4
+ xorps $inout5,$inout5
+
+ call _aesni_encrypt6
+
+ movups 0x40($inp),$rndkey1
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ movups $rndkey1,0x40($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups ($ivp),$inout0
+ movups ($inp),$in0
+ mov 240($key),$rounds # key->rounds
+.Lctr32_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps $inout0,$in0
+ movups $in0,($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ movups $in2,0x20($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ movups $in3,0x30($out)
+
+.Lctr32_done:
+___
+$code.=<<___ if ($win64);
+ movaps 0x20(%rsp),%xmm6
+ movaps 0x30(%rsp),%xmm7
+ movaps 0x40(%rsp),%xmm8
+ movaps 0x50(%rsp),%xmm9
+ movaps 0x60(%rsp),%xmm10
+ movaps 0x70(%rsp),%xmm11
+ movaps 0x80(%rsp),%xmm12
+ movaps 0x90(%rsp),%xmm13
+ movaps 0xa0(%rsp),%xmm14
+ movaps 0xb0(%rsp),%xmm15
+ lea 0xc8(%rsp),%rsp
+.Lctr32_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
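+# The tweak for block i is T_i = E_{key2}(iv) * x^i in GF(2^128): the
+# code generates T_0 by encrypting the IV under key2, then doubles it
+# once per block with the pshufd/pand/paddq/pxor ladder keyed by
+# .Lxts_magic.  A scalar sketch of the doubling, assuming two
+# little-endian 64-bit limbs and the reduction constant
+# 0x87 = x^7+x^2+x+1 (hypothetical helper, not part of this source):
+#
+#	void xts_double(uint64_t t[2]) {
+#		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
+#		t[1] = (t[1] << 1) | (t[0] >> 63);
+#		t[0] = (t[0] << 1) ^ carry;
+#	}
+#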
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x68 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240(%r8),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_enc_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+ # inline _aesni_encrypt6 and interleave first and last rounds
+ # with own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesenc $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lxts_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_enc_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenclast $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenclast $rndkey0,$inout3
+ aesenclast $rndkey0,$inout4
+ aesenclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_enc_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_enc_short:
+ add \$16*6,$len
+ jz .Lxts_enc_done
+
+ cmp \$0x20,$len
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmp \$0x40,$len
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_encrypt6
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ lea 16*1($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_encrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ and \$15,$len_
+ jz .Lxts_enc_ret
+ mov $len_,$len
+
+.Lxts_enc_steal:
+ movzb ($inp),%eax # borrow $rounds ...
+ movzb -16($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,-16($out)
+ mov %cl,0($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_enc_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups -16($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,-16($out)
+
+.Lxts_enc_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_enc_epilogue:
+ ret
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240($key2),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ xor %eax,%eax # if ($len%16) len-=16;
+ test \$15,$len
+ setnz %al
+ shl \$4,%rax
+ sub %rax,$len
+
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_dec_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+ # inline _aesni_decrypt6 and interleave first and last rounds
+ # with own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesdec $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesdec $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ dec $rounds
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+.Lxts_dec_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ lea 32($key),$key
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_dec_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdeclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdeclast $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdeclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdeclast $rndkey0,$inout3
+ aesdeclast $rndkey0,$inout4
+ aesdeclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_dec_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_dec_short:
+ add \$16*6,$len
+ jz .Lxts_dec_done
+
+ cmp \$0x20,$len
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmp \$0x40,$len
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_decrypt6
+
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ pxor $twtmp,$twtmp
+ movdqu $inout3,16*3($out)
+ pcmpgtd @tweak[5],$twtmp
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ pshufd \$0x13,$twtmp,@tweak[1] # $twres
+ and \$15,$len_
+ jz .Lxts_dec_ret
+
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,@tweak[1] # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ movdqa @tweak[2],@tweak[1]
+ lea 16*1($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[3],@tweak[1]
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movups ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movups 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_decrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ and \$15,$len_
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ mov $len_,$len
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($inp),$inout0
+ xorps @tweak[1],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[1],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp),%eax # borrow $rounds ...
+ movzb ($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,($out)
+ mov %cl,16($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_dec_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_dec_epilogue:
+ ret
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+} }}
+
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
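+#
+# For encryption each plaintext block is xored with the previous
+# ciphertext block (the IV for block 0) before being encrypted,
+# i.e. C_i = E_k(P_i ^ C_{i-1}); the xor is folded into the first AES
+# round inside aesni_generate1, which is why the xorps below is
+# commented out.  A scalar sketch with hypothetical helper names:
+#
+#	memcpy(iv, ivp, 16);
+#	for (i = 0; i < length/16; i++) {
+#		xor_block(tmp, inp + 16*i, iv);     /* tmp = P_i ^ C_{i-1} */
+#		aes_encrypt(key, tmp, out + 16*i);  /* C_i = E_k(tmp)      */
+#		memcpy(iv, out + 16*i, 16);
+#	}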
+{
+my $reserved = $win64?0x40:-0x18; # used in decrypt
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ test $len,$len # check length
+ jz .Lcbc_ret
+
+ mov 240($key),$rnds_ # key->rounds
+ mov $key,$key_ # backup $key
+ test %r9d,%r9d # 6th argument
+ jz .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+ movups ($ivp),$inout0 # load iv as initial state
+ mov $rnds_,$rounds
+ cmp \$16,$len
+ jb .Lcbc_enc_tail
+ sub \$16,$len
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups ($inp),$inout1 # load input
+ lea 16($inp),$inp
+ #xorps $inout1,$inout0
+___
+ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+ mov $rnds_,$rounds # restore $rounds
+ mov $key_,$key # restore $key
+ movups $inout0,0($out) # store output
+ lea 16($out),$out
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ add \$16,$len
+ jnz .Lcbc_enc_tail
+ movups $inout0,($ivp)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ mov $len,%rcx # zaps $key
+ xchg $inp,$out # $inp is %rsi and $out is %rdi now
+ .long 0x9066A4F3 # rep movsb
+ mov \$16,%ecx # zero tail
+ sub $len,%rcx
+ xor %eax,%eax
+ .long 0x9066AAF3 # rep stosb
+ lea -16(%rdi),%rdi # rewind $out by 1 block
+ mov $rnds_,$rounds # restore $rounds
+ mov %rdi,%rsi # $inp and $out are the same
+ mov $key_,$key # restore $key
+ xor $len,$len # len=16
+ jmp .Lcbc_enc_loop # one more spin
+#--------------------------- CBC DECRYPT ------------------------------#
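+ # (decryption, unlike encryption, is parallel: P_i = D_k(C_i) ^ C_{i-1}
+ # and every C_{i-1} is already in hand, so the main loop below runs
+ # eight blocks at a time and applies the xors after the fact)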
+.align 16
+.Lcbc_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lcbc_decrypt_body:
+___
+$code.=<<___;
+ movups ($ivp),$iv
+ mov $rnds_,$rounds
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_tail
+ shr \$1,$rnds_
+ sub \$0x70,$len
+ mov $rnds_,$rounds
+ movaps $iv,$reserved(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps $rndkey0,$reserved(%rsp) # save IV
+ movups $inout7,($out)
+ lea 0x10($out),$out
+.Lcbc_dec_loop8_enter:
+ $movkey ($key),$rndkey0
+ movups ($inp),$inout0 # load input
+ movups 0x10($inp),$inout1
+ $movkey 16($key),$rndkey1
+
+ lea 32($key),$key
+ movdqu 0x20($inp),$inout2
+ xorps $rndkey0,$inout0
+ movdqu 0x30($inp),$inout3
+ xorps $rndkey0,$inout1
+ movdqu 0x40($inp),$inout4
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ movdqu 0x50($inp),$inout5
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqu 0x60($inp),$inout6
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqu 0x70($inp),$inout7
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+
+ call .Ldec_loop8_enter
+
+ movups ($inp),$rndkey1 # re-load input
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$rndkey1
+ xorps $rndkey0,$inout6
+ movups 0x70($inp),$rndkey0 # IV
+ xorps $rndkey1,$inout7
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,0x40($out)
+ mov $key_,$key # restore $key
+ movups $inout5,0x50($out)
+ lea 0x80($inp),$inp
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
+ sub \$0x80,$len
+ ja .Lcbc_dec_loop8
+
+ movaps $inout7,$inout0
+ movaps $rndkey0,$iv
+ add \$0x70,$len
+ jle .Lcbc_dec_tail_collected
+ movups $inout0,($out)
+ lea 1($rnds_,$rnds_),$rounds
+ lea 0x10($out),$out
+.Lcbc_dec_tail:
+ movups ($inp),$inout0
+ movaps $inout0,$in0
+ cmp \$0x10,$len
+ jbe .Lcbc_dec_one
+
+ movups 0x10($inp),$inout1
+ movaps $inout1,$in1
+ cmp \$0x20,$len
+ jbe .Lcbc_dec_two
+
+ movups 0x20($inp),$inout2
+ movaps $inout2,$in2
+ cmp \$0x30,$len
+ jbe .Lcbc_dec_three
+
+ movups 0x30($inp),$inout3
+ cmp \$0x40,$len
+ jbe .Lcbc_dec_four
+
+ movups 0x40($inp),$inout4
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_five
+
+ movups 0x50($inp),$inout5
+ cmp \$0x60,$len
+ jbe .Lcbc_dec_six
+
+ movups 0x60($inp),$inout6
+ movaps $iv,$reserved(%rsp) # save IV
+ call _aesni_decrypt8
+ movups ($inp),$rndkey1
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$iv # IV
+ xorps $rndkey0,$inout6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ lea 0x60($out),$out
+ movaps $inout6,$inout0
+ sub \$0x70,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps $iv,$inout0
+ movaps $in0,$iv
+ sub \$0x10,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ movaps $in1,$iv
+ movaps $inout1,$inout0
+ lea 0x10($out),$out
+ sub \$0x20,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ movaps $in2,$iv
+ movaps $inout2,$inout0
+ lea 0x20($out),$out
+ sub \$0x30,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps $iv,$inout0
+ movups 0x30($inp),$iv
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ xorps $in2,$inout3
+ movups $inout2,0x20($out)
+ movaps $inout3,$inout0
+ lea 0x30($out),$out
+ sub \$0x40,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$iv
+ xorps $rndkey1,$inout4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ lea 0x40($out),$out
+ movaps $inout4,$inout0
+ sub \$0x50,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$rndkey0
+ xorps $rndkey1,$inout4
+ movups 0x50($inp),$iv
+ xorps $rndkey0,$inout5
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ lea 0x50($out),$out
+ movaps $inout5,$inout0
+ sub \$0x60,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ and \$15,$len
+ movups $iv,($ivp)
+ jnz .Lcbc_dec_tail_partial
+ movups $inout0,($out)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps $inout0,$reserved(%rsp)
+ mov \$16,%rcx
+ mov $out,%rdi
+ sub $len,%rcx
+ lea $reserved(%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+
+.Lcbc_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+___
+$code.=<<___;
+.Lcbc_ret:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+# int bits, AES_KEY *key)
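+#
+# ${PREFIX}_set_decrypt_key first builds the encrypt schedule, then
+# converts it in place: the round keys are reversed end-for-end and the
+# inner ones passed through aesimc (InvMixColumns), per the AES
+# equivalent-inverse-cipher construction.  Equivalent sketch
+# (hypothetical names, r = rounds):
+#
+#	dk[0] = ek[r]; dk[r] = ek[0];
+#	for (i = 1; i < r; i++)
+#		dk[i] = AESIMC(ek[r-i]);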
+{ my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ call __aesni_set_encrypt_key
+ shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+.Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ $movkey %xmm0,($inp)
+.Ldec_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_decrypt_key:
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+
+# This is based on a submission by
+#
+# Huang Ying
+# Vinodh Gopal
+# Kahraman Akdemir
+#
+# Aggressively optimized with respect to aeskeygenassist's critical path,
+# and confined to %xmm0-5 to meet the Win64 ABI requirement.
+#
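+# Each aeskeygenassist + .Lkey_expansion_* pair below implements one
+# step of the FIPS-197 recurrence, shown here for the 128-bit case
+# (w[] = 32-bit words of the schedule):
+#
+#	w[i] = w[i-4] ^ (i % 4 ? w[i-1]
+#	                       : SubWord(RotWord(w[i-1])) ^ rcon);
+#
+# the shufps/xorps ladder accumulates the running xor of the previous
+# words entirely inside the XMM registers; the 192- and 256-bit paths
+# follow the analogous Nk=6/8 recurrences.
+#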
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ lea 16($key),%rax
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+.L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax, %rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16($inp),%xmm2 # remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ mov \$-2,%rax
+.Lenc_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+ .long 6,6,6,0
+.Lincrement64:
+ .long 1,0,0,0
+.Lxts_magic:
+ .long 0x87,0,1,0
+
+.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
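+
+# Notes on the constants above: .Lbswap_mask feeds pshufb to convert
+# the big-endian CTR counter to host order and back; .Lincrement32
+# advances the 32-bit counter lanes of a CTR vector by 6, the number of
+# blocks each iteration of the main loop consumes; .Lincrement64 bumps
+# a 64-bit counter lane by one (used by the CCM code); and .Lxts_magic
+# holds the XTS reduction constant 0x87 (x^7+x^2+x+1) together with the
+# bit used to carry between the two 64-bit halves of the tweak.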
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
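+#
+# Each handler below checks whether context->Rip lies between the
+# function's prologue and epilogue labels; if so, it copies the
+# %xmm6-%xmm15 spill area from the stack back into the CONTEXT record
+# and recomputes Rsp, then falls through to .Lcommon_seh_tail, which
+# restores %rsi/%rdi and hands off to RtlVirtualUnwind to continue the
+# unwind.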
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type ecb_se_handler,\@abi-omnipotent
+.align 16
+ecb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+
+ jmp .Lcommon_seh_tail
+.size ecb_se_handler,.-ecb_se_handler
+
+.type ccm64_se_handler,\@abi-omnipotent
+.align 16
+ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ccm64_se_handler,.-ccm64_se_handler
+
+.type ctr32_se_handler,\@abi-omnipotent
+.align 16
+ctr32_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lctr32_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lctr32_ret(%rip),%r10
+ cmp %r10,%rbx
+ jae .Lcommon_seh_tail
+
+ lea 0x20(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xc8(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ctr32_se_handler,.-ctr32_se_handler
+
+.type xts_se_handler,\@abi-omnipotent
+.align 16
+xts_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0x60(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x68+160(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size xts_se_handler,.-xts_se_handler
+___
+$code.=<<___;
+.type cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_decrypt(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ lea .Lcbc_decrypt_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<cbc decrypt body
+ jb .Lrestore_cbc_rax
+
+ lea .Lcbc_ret(%rip),%r10
+ cmp %r10,%rbx # context->Rip>="epilogue" label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # top of stack
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+ mov 120($context),%rax
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size cbc_se_handler,.-cbc_se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ecb_encrypt
+ .rva .LSEH_end_aesni_ecb_encrypt
+ .rva .LSEH_info_ecb
+
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+
+ .rva .LSEH_begin_aesni_xts_encrypt
+ .rva .LSEH_end_aesni_xts_encrypt
+ .rva .LSEH_info_xts_enc
+
+ .rva .LSEH_begin_aesni_xts_decrypt
+ .rva .LSEH_end_aesni_xts_decrypt
+ .rva .LSEH_info_xts_dec
+___
+$code.=<<___;
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_cbc
+
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+ .byte 9,0,0,0
+ .rva ecb_se_handler
+.LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+.LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+.LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr32_se_handler
+.LSEH_info_xts_enc:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.LSEH_info_xts_dec:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.LSEH_info_cbc:
+ .byte 9,0,0,0
+ .rva cbc_se_handler
+.LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
+___
+}
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
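+
+# The s///gem pass below feeds every AES-NI mnemonic through aesni(),
+# which re-encodes matching instructions as raw bytes for assemblers
+# that predate these mnemonics, e.g. the hypothetical call
+#
+#	aesni("aesenc\t%xmm1,%xmm0")
+#
+# returns ".byte\t102,15,56,220,193", i.e. 66 0f 38 dc c1.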
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/aes/asm/bsaes-x86_64.S b/crypto/aes/asm/bsaes-x86_64.S
new file mode 100644
index 0000000..6ceb3da
--- /dev/null
+++ b/crypto/aes/asm/bsaes-x86_64.S
@@ -0,0 +1,2561 @@
+.text
+
+
+
+
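+# _bsaes_encrypt8 processes eight blocks (in %xmm15,%xmm0-%xmm6) in
+# bit-sliced form: the psrlq/pand/pxor/psllq ladders transpose the
+# eight 128-bit blocks so that each register holds one bit plane of all
+# eight, after which the AES S-box and linear layers are evaluated with
+# plain boolean SSE ops, one bit-sliced round key loaded from (%rax)
+# per round and the round count in %r10d.  The ".byte 102,..,15,56,0,.."
+# sequences are pshufb instructions emitted numerically for old
+# assemblers.
+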
+.type _bsaes_encrypt8,@function
+.align 64
+_bsaes_encrypt8:
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa 80(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Lenc_sbox:
+ pxor %xmm5,%xmm4
+ pxor %xmm0,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm1,%xmm5
+ pxor %xmm15,%xmm4
+
+ pxor %xmm2,%xmm5
+ pxor %xmm6,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm3,%xmm2
+ pxor %xmm4,%xmm3
+ pxor %xmm0,%xmm2
+
+ pxor %xmm6,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm10
+ movdqa %xmm0,%xmm9
+ movdqa %xmm4,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm11
+
+ pxor %xmm3,%xmm10
+ pxor %xmm1,%xmm9
+ pxor %xmm2,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm3,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm15,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm2,%xmm11
+ pxor %xmm15,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm6,%xmm12
+ movdqa %xmm4,%xmm11
+ pxor %xmm0,%xmm12
+ pxor %xmm5,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm1,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm3,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm0,%xmm13
+ pand %xmm2,%xmm11
+ movdqa %xmm6,%xmm14
+ pand %xmm15,%xmm12
+ pand %xmm4,%xmm13
+ por %xmm5,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm5,%xmm11
+ movdqa %xmm4,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm5,%xmm9
+ pxor %xmm4,%xmm5
+ pand %xmm14,%xmm4
+ pand %xmm13,%xmm5
+ pxor %xmm4,%xmm5
+ pxor %xmm9,%xmm4
+ pxor %xmm15,%xmm11
+ pxor %xmm2,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm2,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm2
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm2,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm11,%xmm5
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm2
+
+ movdqa %xmm6,%xmm11
+ movdqa %xmm0,%xmm7
+ pxor %xmm3,%xmm11
+ pxor %xmm1,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm3,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm1,%xmm3
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm1
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm3
+ pxor %xmm11,%xmm7
+ pxor %xmm1,%xmm3
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm1
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm6,%xmm10
+ pxor %xmm0,%xmm6
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm6
+ pxor %xmm0,%xmm6
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm6
+ pxor %xmm11,%xmm3
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm1
+ pxor %xmm15,%xmm6
+ pxor %xmm5,%xmm0
+ pxor %xmm6,%xmm3
+ pxor %xmm15,%xmm5
+ pxor %xmm0,%xmm15
+
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ pxor %xmm2,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm4,%xmm3
+
+ pxor %xmm2,%xmm5
+ decl %r10d
+ jl .Lenc_done
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm3,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm5,%xmm10
+ pxor %xmm9,%xmm3
+ pshufd $147,%xmm2,%xmm11
+ pxor %xmm10,%xmm5
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm2
+ pshufd $147,%xmm1,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm1
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm2,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm7
+ pxor %xmm1,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm3,%xmm10
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm1,%xmm5
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm12,%xmm8
+
+ pxor %xmm10,%xmm2
+ pxor %xmm14,%xmm6
+ pxor %xmm13,%xmm5
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm1
+ movdqa %xmm8,%xmm4
+ movdqa 48(%r11),%xmm7
+ jnz .Lenc_loop
+ movdqa 64(%r11),%xmm7
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm2,%xmm10
+ psrlq $1,%xmm2
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm2
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm2
+ pxor %xmm1,%xmm4
+ psllq $1,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $1,%xmm2
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm2
+ movdqa %xmm3,%xmm9
+ psrlq $1,%xmm3
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm5,%xmm3
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm15
+ pxor %xmm3,%xmm5
+ psllq $1,%xmm3
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm2,%xmm10
+ psrlq $2,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm1,%xmm2
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm2
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm2,%xmm1
+ psllq $2,%xmm2
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm2
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm5,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm5
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm5,%xmm9
+ psrlq $4,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $4,%xmm3
+ pxor %xmm4,%xmm5
+ pxor %xmm1,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm4
+ psllq $4,%xmm5
+ pxor %xmm3,%xmm1
+ psllq $4,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm2,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm2
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,@function
+.align 64
+_bsaes_decrypt8:
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa -48(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Ldec_sbox:
+ pxor %xmm3,%xmm2
+
+ pxor %xmm6,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm3,%xmm5
+ pxor %xmm5,%xmm6
+ pxor %xmm6,%xmm0
+
+ pxor %xmm0,%xmm15
+ pxor %xmm4,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm15,%xmm4
+ pxor %xmm2,%xmm0
+ movdqa %xmm2,%xmm10
+ movdqa %xmm6,%xmm9
+ movdqa %xmm0,%xmm8
+ movdqa %xmm3,%xmm12
+ movdqa %xmm4,%xmm11
+
+ pxor %xmm15,%xmm10
+ pxor %xmm3,%xmm9
+ pxor %xmm5,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm15,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm1,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm5,%xmm11
+ pxor %xmm1,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm2,%xmm12
+ movdqa %xmm0,%xmm11
+ pxor %xmm6,%xmm12
+ pxor %xmm4,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm3,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm15,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm6,%xmm13
+ pand %xmm5,%xmm11
+ movdqa %xmm2,%xmm14
+ pand %xmm1,%xmm12
+ pand %xmm0,%xmm13
+ por %xmm4,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm4,%xmm11
+ movdqa %xmm0,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm4,%xmm9
+ pxor %xmm0,%xmm4
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm4
+ pxor %xmm0,%xmm4
+ pxor %xmm9,%xmm0
+ pxor %xmm1,%xmm11
+ pxor %xmm5,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm1,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm5,%xmm1
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm5
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm1
+ pxor %xmm11,%xmm7
+ pxor %xmm5,%xmm1
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm5
+ pxor %xmm11,%xmm4
+ pxor %xmm11,%xmm1
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm5
+
+ movdqa %xmm2,%xmm11
+ movdqa %xmm6,%xmm7
+ pxor %xmm15,%xmm11
+ pxor %xmm3,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm3,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm3
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm3,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm3
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm2,%xmm10
+ pxor %xmm6,%xmm2
+ pand %xmm14,%xmm6
+ pand %xmm13,%xmm2
+ pxor %xmm6,%xmm2
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm3
+ pxor %xmm6,%xmm0
+ pxor %xmm4,%xmm5
+
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm6,%xmm4
+ pxor %xmm1,%xmm3
+ pxor %xmm15,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm5,%xmm2
+ pxor %xmm0,%xmm5
+ pxor %xmm3,%xmm2
+
+ pxor %xmm15,%xmm3
+ pxor %xmm2,%xmm6
+ decl %r10d
+ jl .Ldec_done
+
+ pshufd $147,%xmm4,%xmm14
+ movdqa %xmm5,%xmm9
+ pxor %xmm6,%xmm4
+ pxor %xmm6,%xmm5
+ pshufd $147,%xmm15,%xmm7
+ movdqa %xmm6,%xmm12
+ pxor %xmm15,%xmm6
+ pxor %xmm0,%xmm15
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm5,%xmm0
+ pxor %xmm2,%xmm15
+ pxor %xmm3,%xmm0
+ pshufd $147,%xmm3,%xmm10
+ pxor %xmm15,%xmm5
+ pxor %xmm4,%xmm3
+ pxor %xmm2,%xmm4
+ pshufd $147,%xmm2,%xmm13
+ movdqa %xmm1,%xmm11
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm3
+ pxor %xmm12,%xmm2
+ pxor %xmm9,%xmm3
+ pxor %xmm11,%xmm3
+ pshufd $147,%xmm12,%xmm12
+
+ pxor %xmm4,%xmm6
+ pxor %xmm7,%xmm4
+ pxor %xmm8,%xmm6
+ pshufd $147,%xmm9,%xmm9
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm6
+ pxor %xmm14,%xmm4
+ pshufd $147,%xmm11,%xmm11
+ pxor %xmm13,%xmm14
+ pxor %xmm4,%xmm6
+
+ pxor %xmm7,%xmm5
+ pshufd $147,%xmm7,%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm15
+ pshufd $147,%xmm8,%xmm8
+ pxor %xmm9,%xmm5
+ pxor %xmm9,%xmm3
+ pxor %xmm14,%xmm15
+ pshufd $147,%xmm9,%xmm9
+ pxor %xmm10,%xmm5
+ pxor %xmm10,%xmm1
+ pxor %xmm10,%xmm0
+ pshufd $147,%xmm10,%xmm10
+ pxor %xmm11,%xmm2
+ pxor %xmm11,%xmm3
+ pxor %xmm14,%xmm2
+ pxor %xmm12,%xmm5
+ pxor %xmm11,%xmm0
+ pxor %xmm12,%xmm14
+
+ pxor %xmm14,%xmm3
+ pshufd $147,%xmm11,%xmm11
+ pxor %xmm14,%xmm1
+ pxor %xmm14,%xmm0
+
+ pxor %xmm12,%xmm14
+ pshufd $147,%xmm12,%xmm12
+ pxor %xmm13,%xmm14
+
+
+ pxor %xmm2,%xmm0
+ pxor %xmm11,%xmm2
+ pshufd $147,%xmm13,%xmm13
+ pxor %xmm7,%xmm15
+ pxor %xmm12,%xmm2
+ pxor %xmm9,%xmm15
+ pshufd $147,%xmm14,%xmm14
+
+ pxor %xmm6,%xmm5
+ pxor %xmm8,%xmm6
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm5
+ pxor %xmm12,%xmm6
+ pxor %xmm12,%xmm4
+ pxor %xmm14,%xmm6
+ pshufd $147,%xmm7,%xmm7
+ pxor %xmm13,%xmm4
+ pxor %xmm6,%xmm5
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm8,%xmm8
+
+ pxor %xmm14,%xmm2
+ pxor %xmm9,%xmm0
+ pxor %xmm9,%xmm3
+ pshufd $147,%xmm9,%xmm9
+ pxor %xmm13,%xmm15
+ pxor %xmm10,%xmm13
+ pxor %xmm2,%xmm0
+ pxor %xmm13,%xmm5
+
+ pxor %xmm13,%xmm1
+ pxor %xmm12,%xmm3
+ pxor %xmm11,%xmm1
+ pshufd $147,%xmm11,%xmm11
+ pxor %xmm13,%xmm3
+ pxor %xmm14,%xmm1
+ pxor %xmm10,%xmm13
+
+ pshufd $147,%xmm12,%xmm12
+ pshufd $147,%xmm13,%xmm13
+ pshufd $147,%xmm14,%xmm14
+ pshufd $147,%xmm10,%xmm10
+
+
+ pxor %xmm6,%xmm0
+ pxor %xmm6,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm8
+ pxor %xmm7,%xmm5
+ pxor %xmm4,%xmm7
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm2
+ pxor %xmm0,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm5,%xmm10
+ pxor %xmm9,%xmm3
+ pxor %xmm15,%xmm9
+ pxor %xmm14,%xmm10
+ pxor %xmm3,%xmm12
+ pxor %xmm13,%xmm9
+ pxor %xmm13,%xmm12
+ pxor %xmm1,%xmm13
+ pxor %xmm2,%xmm14
+
+ movdqa %xmm7,%xmm15
+ movdqa %xmm8,%xmm0
+ movdqa %xmm9,%xmm1
+ movdqa %xmm10,%xmm2
+ movdqa %xmm11,%xmm3
+ movdqa %xmm12,%xmm4
+ movdqa %xmm13,%xmm5
+ movdqa %xmm14,%xmm6
+ movdqa -16(%r11),%xmm7
+ jnz .Ldec_loop
+ movdqa -32(%r11),%xmm7
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm2,%xmm9
+ psrlq $1,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $1,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm6,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm4
+ psllq $1,%xmm2
+ pxor %xmm1,%xmm6
+ psllq $1,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm3,%xmm5
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm15
+ pxor %xmm5,%xmm3
+ psllq $1,%xmm5
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm1,%xmm10
+ psrlq $2,%xmm1
+ pxor %xmm4,%xmm6
+ pxor %xmm2,%xmm1
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm1
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm1,%xmm2
+ psllq $2,%xmm1
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm3
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm5
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm3,%xmm9
+ psrlq $4,%xmm3
+ movdqa %xmm5,%xmm10
+ psrlq $4,%xmm5
+ pxor %xmm4,%xmm3
+ pxor %xmm2,%xmm5
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $4,%xmm3
+ pxor %xmm5,%xmm2
+ psllq $4,%xmm5
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm5
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+.type _bsaes_key_convert,@function
+.align 16
+_bsaes_key_convert:
+ leaq .Lmasks(%rip),%r11
+ movdqu (%rcx),%xmm7
+ leaq 16(%rcx),%rcx
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ movdqa 48(%r11),%xmm3
+ movdqa 64(%r11),%xmm4
+ pcmpeqd %xmm5,%xmm5
+
+ movdqu (%rcx),%xmm6
+ movdqa %xmm7,(%rax)
+ leaq 16(%rax),%rax
+ decl %r10d
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+.byte 102,15,56,0,244
+
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+
+ pand %xmm6,%xmm8
+ pand %xmm6,%xmm9
+ movdqa %xmm2,%xmm10
+ pcmpeqb %xmm0,%xmm8
+ psllq $4,%xmm0
+ movdqa %xmm3,%xmm11
+ pcmpeqb %xmm1,%xmm9
+ psllq $4,%xmm1
+
+ pand %xmm6,%xmm10
+ pand %xmm6,%xmm11
+ movdqa %xmm0,%xmm12
+ pcmpeqb %xmm2,%xmm10
+ psllq $4,%xmm2
+ movdqa %xmm1,%xmm13
+ pcmpeqb %xmm3,%xmm11
+ psllq $4,%xmm3
+
+ movdqa %xmm2,%xmm14
+ movdqa %xmm3,%xmm15
+ pxor %xmm5,%xmm8
+ pxor %xmm5,%xmm9
+
+ pand %xmm6,%xmm12
+ pand %xmm6,%xmm13
+ movdqa %xmm8,0(%rax)
+ pcmpeqb %xmm0,%xmm12
+ psrlq $4,%xmm0
+ movdqa %xmm9,16(%rax)
+ pcmpeqb %xmm1,%xmm13
+ psrlq $4,%xmm1
+ leaq 16(%rcx),%rcx
+
+ pand %xmm6,%xmm14
+ pand %xmm6,%xmm15
+ movdqa %xmm10,32(%rax)
+ pcmpeqb %xmm2,%xmm14
+ psrlq $4,%xmm2
+ movdqa %xmm11,48(%rax)
+ pcmpeqb %xmm3,%xmm15
+ psrlq $4,%xmm3
+ movdqu (%rcx),%xmm6
+
+ pxor %xmm5,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm12,64(%rax)
+ movdqa %xmm13,80(%rax)
+ movdqa %xmm14,96(%rax)
+ movdqa %xmm15,112(%rax)
+ leaq 128(%rax),%rax
+ decl %r10d
+ jnz .Lkey_loop
+
+ movdqa 80(%r11),%xmm7
+
+ .byte 0xf3,0xc3
+.size _bsaes_key_convert,.-_bsaes_key_convert
+
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,@function
+.align 16
+bsaes_cbc_encrypt:
+ cmpl $0,%r9d
+ jne asm_AES_cbc_encrypt
+ cmpq $128,%rdx
+ jb asm_AES_cbc_encrypt
+
+ movq %rsp,%rax
+.Lcbc_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movq %r8,%rbx
+ shrq $4,%r14
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx),%xmm14
+ subq $8,%r14
+.Lcbc_dec_loop:
+ movdqu 0(%r12),%xmm15
+ movdqu 16(%r12),%xmm0
+ movdqu 32(%r12),%xmm1
+ movdqu 48(%r12),%xmm2
+ movdqu 64(%r12),%xmm3
+ movdqu 80(%r12),%xmm4
+ movq %rsp,%rax
+ movdqu 96(%r12),%xmm5
+ movl %edx,%r10d
+ movdqu 112(%r12),%xmm6
+ movdqa %xmm14,32(%rbp)
+
+ call _bsaes_decrypt8
+
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm2
+ movdqu 112(%r12),%xmm14
+ pxor %xmm13,%xmm4
+ movdqu %xmm15,0(%r13)
+ leaq 128(%r12),%r12
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ subq $8,%r14
+ jnc .Lcbc_dec_loop
+
+ addq $8,%r14
+ jz .Lcbc_dec_done
+
+ movdqu 0(%r12),%xmm15
+ movq %rsp,%rax
+ movl %edx,%r10d
+ cmpq $2,%r14
+ jb .Lcbc_dec_one
+ movdqu 16(%r12),%xmm0
+ je .Lcbc_dec_two
+ movdqu 32(%r12),%xmm1
+ cmpq $4,%r14
+ jb .Lcbc_dec_three
+ movdqu 48(%r12),%xmm2
+ je .Lcbc_dec_four
+ movdqu 64(%r12),%xmm3
+ cmpq $6,%r14
+ jb .Lcbc_dec_five
+ movdqu 80(%r12),%xmm4
+ je .Lcbc_dec_six
+ movdqu 96(%r12),%xmm5
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm14
+ pxor %xmm12,%xmm2
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm14
+ pxor %xmm11,%xmm6
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm14
+ pxor %xmm10,%xmm1
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm14
+ pxor %xmm9,%xmm3
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm14
+ pxor %xmm8,%xmm5
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm14
+ pxor %xmm7,%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ leaq (%r12),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm14
+ movdqu %xmm14,(%r13)
+ movdqa %xmm15,%xmm14
+
+.Lcbc_dec_done:
+ movdqu %xmm14,(%rbx)
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lcbc_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lcbc_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lcbc_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,@function
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ movq %rsp,%rax
+.Lctr_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movdqu (%r8),%xmm0
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqa %xmm0,32(%rbp)
+ cmpq $8,%rdx
+ jb .Lctr_enc_short
+
+ movl %eax,%ebx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %ebx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ movdqa (%rsp),%xmm8
+ leaq .LADD1(%rip),%r11
+ movdqa 32(%rbp),%xmm15
+ movdqa -32(%r11),%xmm7
+.byte 102,68,15,56,0,199
+.byte 102,68,15,56,0,255
+ movdqa %xmm8,(%rsp)
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa %xmm15,32(%rbp)
+ movdqa %xmm15,%xmm0
+ movdqa %xmm15,%xmm1
+ paddd 0(%r11),%xmm0
+ movdqa %xmm15,%xmm2
+ paddd 16(%r11),%xmm1
+ movdqa %xmm15,%xmm3
+ paddd 32(%r11),%xmm2
+ movdqa %xmm15,%xmm4
+ paddd 48(%r11),%xmm3
+ movdqa %xmm15,%xmm5
+ paddd 64(%r11),%xmm4
+ movdqa %xmm15,%xmm6
+ paddd 80(%r11),%xmm5
+ paddd 96(%r11),%xmm6
+
+
+
+ movdqa (%rsp),%xmm8
+ leaq 16(%rsp),%rax
+ movdqa -16(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+ leaq .LBS0(%rip),%r11
+.byte 102,15,56,0,247
+ movl %ebx,%r10d
+
+ call _bsaes_encrypt8_bitslice
+
+ subq $8,%r14
+ jc .Lctr_enc_loop_done
+
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ movdqu 32(%r12),%xmm9
+ movdqu 48(%r12),%xmm10
+ movdqu 64(%r12),%xmm11
+ movdqu 80(%r12),%xmm12
+ movdqu 96(%r12),%xmm13
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ pxor %xmm15,%xmm7
+ movdqa 32(%rbp),%xmm15
+ pxor %xmm8,%xmm0
+ movdqu %xmm7,0(%r13)
+ pxor %xmm9,%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor %xmm10,%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor %xmm11,%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor %xmm12,%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor %xmm13,%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor %xmm14,%xmm4
+ movdqu %xmm1,96(%r13)
+ leaq .LADD1(%rip),%r11
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ paddd 112(%r11),%xmm15
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ addq $8,%r14
+ movdqu 0(%r12),%xmm7
+ pxor %xmm7,%xmm15
+ movdqu %xmm15,0(%r13)
+ cmpq $2,%r14
+ jb .Lctr_enc_done
+ movdqu 16(%r12),%xmm8
+ pxor %xmm8,%xmm0
+ movdqu %xmm0,16(%r13)
+ je .Lctr_enc_done
+ movdqu 32(%r12),%xmm9
+ pxor %xmm9,%xmm3
+ movdqu %xmm3,32(%r13)
+ cmpq $4,%r14
+ jb .Lctr_enc_done
+ movdqu 48(%r12),%xmm10
+ pxor %xmm10,%xmm5
+ movdqu %xmm5,48(%r13)
+ je .Lctr_enc_done
+ movdqu 64(%r12),%xmm11
+ pxor %xmm11,%xmm2
+ movdqu %xmm2,64(%r13)
+ cmpq $6,%r14
+ jb .Lctr_enc_done
+ movdqu 80(%r12),%xmm12
+ pxor %xmm12,%xmm6
+ movdqu %xmm6,80(%r13)
+ je .Lctr_enc_done
+ movdqu 96(%r12),%xmm13
+ pxor %xmm13,%xmm1
+ movdqu %xmm1,96(%r13)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ leaq 32(%rbp),%rdi
+ leaq 48(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ movdqu (%r12),%xmm0
+ leaq 16(%r12),%r12
+ movl 44(%rbp),%eax
+ bswapl %eax
+ pxor 48(%rbp),%xmm0
+ incl %eax
+ movdqu %xmm0,(%r13)
+ bswapl %eax
+ leaq 16(%r13),%r13
+ movl %eax,44(%rsp)
+ decq %r14
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lctr_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lctr_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lctr_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,@function
+.align 16
+bsaes_xts_encrypt:
+ movq %rsp,%rax
+.Lxts_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ andq $-16,%r14
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm1,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ addq $128,%r14
+ jz .Lxts_enc_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_enc_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_enc_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_enc_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_enc_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_enc_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_enc_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm1,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ movdqu %xmm2,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ movdqu %xmm5,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm3,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_enc_done:
+ andl $15,%ebx
+ jz .Lxts_enc_ret
+ movq %r13,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%r12),%eax
+ movzbl -16(%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,-16(%rdx)
+ movb %cl,0(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16(%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm6
+ movdqu %xmm6,-16(%r13)
+
+.Lxts_enc_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,@function
+.align 16
+bsaes_xts_decrypt:
+ movq %rsp,%rax
+.Lxts_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ xorl %eax,%eax
+ andq $-16,%r14
+ testl $15,%ebx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%r14
+
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ addq $128,%r14
+ jz .Lxts_dec_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_dec_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_dec_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_dec_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_dec_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_dec_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_dec_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_dec_done:
+ andl $15,%ebx
+ jz .Lxts_dec_ret
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ movdqa %xmm6,%xmm5
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ movdqu (%r12),%xmm15
+ pxor %xmm13,%xmm6
+
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm6
+ movq %r13,%rdx
+ movdqu %xmm6,(%r13)
+
+.Lxts_dec_steal:
+ movzbl 16(%r12),%eax
+ movzbl (%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,(%rdx)
+ movb %cl,16(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu (%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm5,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm5
+ movdqu %xmm5,(%r13)
+
+.Lxts_dec_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+.type _bsaes_const,@object
+.align 64
+_bsaes_const:
+.LM0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:
+.quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+.quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+.quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:
+.quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+.quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+.quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+.quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+.quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+.quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+.quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+.quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+.long 0x87,0,1,0
+.Lmasks:
+.quad 0x0101010101010101, 0x0101010101010101
+.quad 0x0202020202020202, 0x0202020202020202
+.quad 0x0404040404040404, 0x0404040404040404
+.quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+.quad 0x6363636363636363, 0x6363636363636363
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.align 64
+.size _bsaes_const,.-_bsaes_const
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 0000000..ceb02b5
--- /dev/null
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3045 @@
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode] ###
+### bitsliced implementation for Intel Core 2 processors ###
+### requires support of SSE extensions up to SSSE3 ###
+### Author: Emilia Käsper and Peter Schwabe ###
+### Date: 2009-03-19 ###
+### Public domain ###
+### ###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
+### further information. ###
+###################################################################
+#
+# September 2011.
+#
+# The original code started as a transliteration to "perlasm" and has
+# since undergone the following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop, resulting in a >5x size reduction
+# from 12.5KB to 2.2KB;
+# - the above was possible thanks to a mixcolumns() modification that
+# allows its output to be fed back to aesenc[last]; this was achieved
+# at the cost of two additional inter-register moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement a key setup subroutine; instead it
+# relies on conversion of the "conventional" key schedule as returned
+# by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which made it
+# possible to skip one shiftrows(), reduce the bit-sliced key schedule
+# and speed up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# The resulting performance, in CPU cycles spent to encrypt one byte
+# of a 4096-byte buffer with a 128-bit key, is:
+#
+# Emilia's this(*) difference
+#
+# Core 2 9.30 8.69 +7%
+# Nehalem(**) 7.63 6.98 +9%
+# Atom 17.1 17.4 -2%(***)
+#
+# (*)	The comparison is not completely fair, because "this" is ECB,
+#	i.e. no extra processing such as counter value calculation and
+#	xor-ing of the input, as in Emilia's CTR implementation, is
+#	performed. However, the CTR calculations account for no more
+#	than 1% of total time, so the comparison is *rather* fair.
+#
+# (**) Results were collected on Westmere, which is considered to
+# be equivalent to Nehalem for this code.
+#
+# (***) The slowdown on Atom is rather strange per se, because the
+#	original implementation has a number of 9+-byte instructions,
+#	which are bad for the Atom front-end, and which I eliminated
+#	completely. In an attempt to address the deterioration, sbox()
+#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
+#	xorps instead of pxor, etc.). While this resulted in a nominal
+#	4% improvement on Atom, it hurt Westmere by more than a 2x factor.
+#
+# As for the key schedule conversion subroutine: the OpenSSL interface
+# relies on per-invocation on-the-fly conversion. This naturally has
+# an impact on performance, especially for short inputs. Conversion
+# time in CPU cycles, and its ratio to the CPU cycles spent in the 8x
+# block function, is:
+#
+# conversion conversion/8x block
+# Core 2 240 0.22
+# Nehalem 180 0.20
+# Atom 430 0.19
+#
+# The ratio values mean that 128-byte inputs will be processed 16-18%
+# slower, 256-byte inputs 9-10% slower, 384-byte inputs 6-7% slower,
+# etc. Also keep in mind that input sizes not divisible by 128 are
+# *effectively* slower still, especially the shortest ones: e.g.
+# consecutive 144-byte blocks are processed 44% slower than one would
+# expect, 272-byte ones 29%, 400-byte ones 22%, etc. Yet, despite all
+# these "shortcomings", it's still faster than the ["hyper-threading-safe"
+# code path in] aes-x86_64.pl on all lengths above 64 bytes...
+#
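+# A hedged sketch of the arithmetic behind the figures above, assuming
+# the simple cost model total = conversion + k*(8x block), where k is
+# the input length in 128-byte units and r the conversion/8x ratio
+# (illustrative helper only, not part of this module):
+#
+#	sub conversion_overhead {
+#		my ($bytes,$r)=@_;
+#		my $k=$bytes/128;	# number of 8x-block invocations
+#		return $r/($k+$r);	# e.g. 0.22/(1+0.22), ~18% on Core 2
+#	}
+#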
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte of a 4096-byte buffer with a 128-bit key is:
+#
+# Core 2 11.0
+# Nehalem 9.16
+# Atom 20.9
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Performance on blocks of less than 80
+# bytes is suboptimal, but XTS is meant to be used with larger
+# blocks...
+#
+#
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
+my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
+my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[5]
+ pxor @b[1], @b[2]
+ pxor @b[0], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[0], @b[5]
+
+ pxor @b[3], @b[6]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[1], @b[3]
+
+ pxor @b[7], @b[2]
+ pxor @b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[1], @b[6]
+
+ pxor @b[5], @b[1]
+ pxor @b[3], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+
+ pxor @b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ pxor @b[7], @b[4]
+
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+ pxor @b[7], @b[3]
+ pxor @b[3], @b[5]
+ pxor @b[5], @b[1]
+
+ pxor @b[1], @b[6]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ pxor @b[5], @b[1]
+ pxor @b[7], @b[2]
+
+ pxor @b[1], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[0], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[1], @b[2]
+ pxor @b[3], @b[6]
+
+ pxor @b[0], @b[3]
+ pxor @b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
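+# Net effect of the sequence below, traced from the and/xor chain
+# (consistent with a GF(2^2) product in a normal basis):
+#	x0' = x0&y1 ^ x1&y1 ^ x1&y0
+#	x1' = x0&y0 ^ x0&y1 ^ x1&y0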
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x1, $x0
+ pxor $t0, $x1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x0, $x1
+ pxor $t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ movdqa $y2, $t1
+ pxor $y1, $t0
+ pxor $y3, $t1
+ pand $x0, $t0
+ pand $x2, $t1
+ pxor $x1, $x0
+ pxor $x3, $x2
+ pand $y0, $x1
+ pand $y2, $x3
+ pand $y1, $x0
+ pand $y3, $x2
+ pxor $x0, $x1
+ pxor $x3, $x2
+ pxor $t0, $x0
+ pxor $t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ movdqa @x[0], @t[0]
+ movdqa @x[1], @t[1]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+ pxor @x[2], @t[0]
+ pxor @x[3], @t[1]
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @t[0], @x[0]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[1]
+ pxor @t[1], @x[3]
+
+ movdqa @x[4], @t[0]
+ movdqa @x[5], @t[1]
+ pxor @x[6], @t[0]
+ pxor @x[7], @t[1]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+ pxor @t[0], @x[4]
+ pxor @t[0], @x[6]
+ pxor @t[1], @x[5]
+ pxor @t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
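+# The inversion follows the tower-field decomposition of GF(2^8) down
+# to GF(2^2) used in the original Käsper-Schwabe code: Mul_GF4 and
+# Mul_GF16_2 above are the subfield multiplications, and the "new
+# smaller inversion" block below stands in for Inv_GF16.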
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ movdqa @x[4], @t[3]
+ movdqa @x[5], @t[2]
+ movdqa @x[1], @t[1]
+ movdqa @x[7], @s[1]
+ movdqa @x[0], @s[0]
+
+ pxor @x[6], @t[3]
+ pxor @x[7], @t[2]
+ pxor @x[3], @t[1]
+ movdqa @t[3], @s[2]
+ pxor @x[6], @s[1]
+ movdqa @t[2], @t[0]
+ pxor @x[2], @s[0]
+ movdqa @t[3], @s[3]
+
+ por @t[1], @t[2]
+ por @s[0], @t[3]
+ pxor @t[0], @s[3]
+ pand @s[0], @s[2]
+ pxor @t[1], @s[0]
+ pand @t[1], @t[0]
+ pand @s[0], @s[3]
+ movdqa @x[3], @s[0]
+ pxor @x[2], @s[0]
+ pand @s[0], @s[1]
+ pxor @s[1], @t[3]
+ pxor @s[1], @t[2]
+ movdqa @x[4], @s[1]
+ movdqa @x[1], @s[0]
+ pxor @x[5], @s[1]
+ pxor @x[0], @s[0]
+ movdqa @s[1], @t[1]
+ pand @s[0], @s[1]
+ por @s[0], @t[1]
+ pxor @s[1], @t[0]
+ pxor @s[3], @t[3]
+ pxor @s[2], @t[2]
+ pxor @s[3], @t[1]
+ movdqa @x[7], @s[0]
+ pxor @s[2], @t[0]
+ movdqa @x[6], @s[1]
+ pxor @s[2], @t[1]
+ movdqa @x[5], @s[2]
+ pand @x[3], @s[0]
+ movdqa @x[4], @s[3]
+ pand @x[2], @s[1]
+ pand @x[1], @s[2]
+ por @x[0], @s[3]
+ pxor @s[0], @t[3]
+ pxor @s[1], @t[2]
+ pxor @s[2], @t[1]
+ pxor @s[3], @t[0]
+
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ # new smaller inversion
+
+ movdqa @t[3], @s[0]
+ pand @t[1], @t[3]
+ pxor @t[2], @s[0]
+
+ movdqa @t[0], @s[2]
+ movdqa @s[0], @s[3]
+ pxor @t[3], @s[2]
+ pand @s[2], @s[3]
+
+ movdqa @t[1], @s[1]
+ pxor @t[2], @s[3]
+ pxor @t[0], @s[1]
+
+ pxor @t[2], @t[3]
+
+ pand @t[3], @s[1]
+
+ movdqa @s[2], @t[2]
+ pxor @t[0], @s[1]
+
+ pxor @s[1], @t[2]
+ pxor @s[1], @t[1]
+
+ pand @t[0], @t[2]
+
+ pxor @t[2], @s[2]
+ pxor @t[2], @t[1]
+
+ pand @s[3], @s[2]
+
+ pxor @s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+ pxor 0x00($key),@x[0]
+ pxor 0x10($key),@x[1]
+ pshufb $mask,@x[0]
+ pxor 0x20($key),@x[2]
+ pshufb $mask,@x[1]
+ pxor 0x30($key),@x[3]
+ pshufb $mask,@x[2]
+ pxor 0x40($key),@x[4]
+ pshufb $mask,@x[3]
+ pxor 0x50($key),@x[5]
+ pshufb $mask,@x[4]
+ pxor 0x60($key),@x[6]
+ pshufb $mask,@x[5]
+ pxor 0x70($key),@x[7]
+ pshufb $mask,@x[6]
+ lea 0x80($key),$key
+ pshufb $mask,@x[7]
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
+ pshufd \$0x93, @x[2], @t[2]
+ pxor @t[1], @x[1]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @t[2], @x[2]
+ pshufd \$0x93, @x[4], @t[4]
+ pxor @t[3], @x[3]
+ pshufd \$0x93, @x[5], @t[5]
+ pxor @t[4], @x[4]
+ pshufd \$0x93, @x[6], @t[6]
+ pxor @t[5], @x[5]
+ pshufd \$0x93, @x[7], @t[7]
+ pxor @t[6], @x[6]
+ pxor @t[7], @x[7]
+
+ pxor @x[0], @t[1]
+ pxor @x[7], @t[0]
+ pxor @x[7], @t[1]
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
+ pxor @x[1], @t[2]
+ pshufd \$0x4E, @x[1], @x[1]
+ pxor @x[4], @t[5]
+ pxor @t[0], @x[0]
+ pxor @x[5], @t[6]
+ pxor @t[1], @x[1]
+ pxor @x[3], @t[4]
+ pshufd \$0x4E, @x[4], @t[0]
+ pxor @x[6], @t[7]
+ pshufd \$0x4E, @x[5], @t[1]
+ pxor @x[2], @t[3]
+ pshufd \$0x4E, @x[3], @x[4]
+ pxor @x[7], @t[3]
+ pshufd \$0x4E, @x[7], @x[5]
+ pxor @x[7], @t[4]
+ pshufd \$0x4E, @x[6], @x[3]
+ pxor @t[4], @t[0]
+ pshufd \$0x4E, @x[2], @x[6]
+ pxor @t[5], @t[1]
+
+ pxor @t[3], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[6], @x[3]
+ movdqa @t[0], @x[2]
+ pxor @t[2], @x[6]
+ movdqa @t[1], @x[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ # multiplication by 0x0e
+ pshufd \$0x93, @x[7], @t[7]
+ movdqa @x[2], @t[2]
+ pxor @x[5], @x[7] # 7 5
+ pxor @x[5], @x[2] # 2 5
+ pshufd \$0x93, @x[0], @t[0]
+ movdqa @x[5], @t[5]
+ pxor @x[0], @x[5] # 5 0 [1]
+ pxor @x[1], @x[0] # 0 1
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @x[2], @x[1] # 1 25
+ pxor @x[6], @x[0] # 01 6 [2]
+ pxor @x[3], @x[1] # 125 3 [4]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @x[0], @x[2] # 25 016 [3]
+ pxor @x[7], @x[3] # 3 75
+ pxor @x[6], @x[7] # 75 6 [0]
+ pshufd \$0x93, @x[6], @t[6]
+ movdqa @x[4], @t[4]
+ pxor @x[4], @x[6] # 6 4
+ pxor @x[3], @x[4] # 4 375 [6]
+ pxor @x[7], @x[3] # 375 756=36
+ pxor @t[5], @x[6] # 64 5 [7]
+ pxor @t[2], @x[3] # 36 2
+ pxor @t[4], @x[3] # 362 4 [5]
+ pshufd \$0x93, @t[5], @t[5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ # multiplication by 0x0b
+ pxor @y[0], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[1], @y[1]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[5], @y[0]
+ pxor @t[6], @y[1]
+ pxor @t[7], @y[0]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @t[7] # clobber t[7]
+ pxor @y[0], @y[1]
+
+ pxor @t[0], @y[3]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[1], @y[2]
+ pxor @t[1], @y[4]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[1], @t[1]
+ pxor @t[2], @y[3]
+ pxor @t[2], @y[5]
+ pxor @t[7], @y[2]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[3], @y[3]
+ pxor @t[3], @y[6]
+ pxor @t[3], @y[4]
+ pshufd \$0x93, @t[3], @t[3]
+ pxor @t[4], @y[7]
+ pxor @t[4], @y[5]
+ pxor @t[7], @y[7]
+ pxor @t[5], @y[3]
+ pxor @t[4], @y[4]
+ pxor @t[5], @t[7] # clobber t[7] even more
+
+ pxor @t[7], @y[5]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[7], @y[6]
+ pxor @t[7], @y[4]
+
+ pxor @t[5], @t[7]
+ pshufd \$0x93, @t[5], @t[5]
+ pxor @t[6], @t[7] # restore t[7]
+
+ # multiplication by 0x0d
+ pxor @y[7], @y[4]
+ pxor @t[4], @y[7]
+ pshufd \$0x93, @t[6], @t[6]
+ pxor @t[0], @y[2]
+ pxor @t[5], @y[7]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[7], @t[7]
+
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[0], @y[3]
+ pxor @t[5], @y[1]
+ pxor @t[5], @y[0]
+ pxor @t[7], @y[1]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[6], @y[0]
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[4]
+ pshufd \$0x93, @t[1], @t[1]
+
+ pxor @t[7], @y[7]
+ pxor @t[2], @y[4]
+ pxor @t[2], @y[5]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[6], @y[2]
+ pxor @t[3], @t[6] # clobber t[6]
+ pxor @y[7], @y[4]
+ pxor @t[6], @y[3]
+
+ pxor @t[6], @y[6]
+ pxor @t[5], @y[5]
+ pxor @t[4], @y[6]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @y[5]
+ pxor @t[7], @y[6]
+ pxor @t[3], @t[6] # restore t[6]
+
+ pshufd \$0x93, @t[5], @t[5]
+ pshufd \$0x93, @t[6], @t[6]
+ pshufd \$0x93, @t[7], @t[7]
+ pshufd \$0x93, @t[3], @t[3]
+
+ # multiplication by 0x09
+ pxor @y[1], @y[4]
+ pxor @y[1], @t[1] # t[1]=y[1]
+ pxor @t[5], @t[0] # clobber t[0]
+ pxor @t[5], @t[1]
+ pxor @t[0], @y[3]
+ pxor @y[0], @t[0] # t[0]=y[0]
+ pxor @t[6], @t[1]
+ pxor @t[7], @t[6] # clobber t[6]
+ pxor @t[1], @y[4]
+ pxor @t[4], @y[7]
+ pxor @y[4], @t[4] # t[4]=y[4]
+ pxor @t[3], @y[6]
+ pxor @y[3], @t[3] # t[3]=y[3]
+ pxor @t[2], @y[5]
+ pxor @y[2], @t[2] # t[2]=y[2]
+ pxor @t[7], @t[3]
+ pxor @y[5], @t[5] # t[5]=y[5]
+ pxor @t[6], @t[2]
+ pxor @t[6], @t[5]
+ pxor @y[6], @t[6] # t[6]=y[6]
+ pxor @y[7], @t[7] # t[7]=y[7]
+
+ movdqa @t[0],@XMM[0]
+ movdqa @t[1],@XMM[1]
+ movdqa @t[2],@XMM[2]
+ movdqa @t[3],@XMM[3]
+ movdqa @t[4],@XMM[4]
+ movdqa @t[5],@XMM[5]
+ movdqa @t[6],@XMM[6]
+ movdqa @t[7],@XMM[7]
+___
+}
+
+sub aesenc { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x30($const),@t[0] # .LSR
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+ &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x40($const),@t[0] # .LSRM0
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+$code.=<<___
+ pxor 0x00($key),@b[0]
+ pxor 0x10($key),@b[1]
+ pxor 0x20($key),@b[4]
+ pxor 0x30($key),@b[6]
+ pxor 0x40($key),@b[3]
+ pxor 0x50($key),@b[7]
+ pxor 0x60($key),@b[2]
+ pxor 0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ movdqa $b,$t
+ psrlq \$$n,$b
+ pxor $a,$b
+ pand $mask,$b
+ pxor $b,$a
+ psllq \$$n,$b
+ pxor $t,$b
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ movdqa $b0,$t0
+ psrlq \$$n,$b0
+ movdqa $b1,$t1
+ psrlq \$$n,$b1
+ pxor $a0,$b0
+ pxor $a1,$b1
+ pand $mask,$b0
+ pand $mask,$b1
+ pxor $b0,$a0
+ psllq \$$n,$b0
+ pxor $b1,$a1
+ psllq \$$n,$b1
+ pxor $t0,$b0
+ pxor $t1,$b1
+___
+}
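+
+# Illustrative scalar model of swapmove, not used by the generator
+# (assumes a 64-bit perl): exchanges the bits of $a selected by $mask
+# with the bits of $b lying $n positions to their left, via a single
+# "delta swap" -- the same network the movdqa/psrlq/pxor/pand/psllq
+# sequence above applies to each 64-bit lane.
+sub swapmove_scalar {
+my ($a,$b,$n,$mask)=@_;
+my $t=(($b>>$n)^$a)&$mask;	# bits that differ, aligned down to $a
+	($a^$t, $b^($t<<$n));	# patch both words with the delta
+}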
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ movdqa 0x00($const),$t0 # .LBS0
+ movdqa 0x10($const),$t1 # .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ movdqa 0x20($const),$t0 # .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
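+
+# Illustrative scalar model of the bitslice network above, not used by
+# the generator: transposes the 8x8 bit matrix held in eight 64-bit
+# words by pairing words at strides 1, 2 and 4 with the .LBS0/.LBS1/
+# .LBS2 masks (the real routine additionally reverses the register
+# order on entry).
+sub bitslice_scalar {
+my @x=@_;			# eight 64-bit words, 64-bit perl assumed
+	@x[@$_]=swapmove_scalar(@x[@$_],1,0x5555555555555555)
+		for ([0,1],[2,3],[4,5],[6,7]);
+	@x[@$_]=swapmove_scalar(@x[@$_],2,0x3333333333333333)
+		for ([0,2],[1,3],[4,6],[5,7]);
+	@x[@$_]=swapmove_scalar(@x[@$_],4,0x0f0f0f0f0f0f0f0f)
+		for ([0,4],[1,5],[2,6],[3,7]);
+	@x;
+}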
+
+$code.=<<___;
+.text
+
+.extern asm_AES_encrypt
+.extern asm_AES_decrypt
+
+.type _bsaes_encrypt8,\@abi-omnipotent
+.align 64
+_bsaes_encrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa 0x50($const), @XMM[8] # .LM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ movdqa 0x30($const), @XMM[8] # .LSR
+ jnz .Lenc_loop
+ movdqa 0x40($const), @XMM[8] # .LSRM0
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,\@abi-omnipotent
+.align 64
+_bsaes_decrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ movdqa -0x10($const), @XMM[8] # .LISR
+ jnz .Ldec_loop
+ movdqa -0x20($const), @XMM[8] # .LISRM0
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
+ movdqa @x[0], @x[2]
+ movdqa @x[1], @x[3]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ movdqa @x[0], @x[4]
+ movdqa @x[2], @x[6]
+ movdqa @x[1], @x[5]
+ movdqa @x[3], @x[7]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
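+# Note: bitslicing a round key is cheaper than bitslicing a state block,
+# because all eight "blocks" are the same 16 bytes -- the commented-out
+# swapmove calls above collapse into plain movdqa register copies.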
+
+$code.=<<___;
+.type _bsaes_key_convert,\@abi-omnipotent
+.align 16
+_bsaes_key_convert:
+ lea .Lmasks(%rip), $const
+ movdqu ($inp), %xmm7 # load round 0 key
+ lea 0x10($inp), $inp
+ movdqa 0x00($const), %xmm0 # 0x01...
+ movdqa 0x10($const), %xmm1 # 0x02...
+ movdqa 0x20($const), %xmm2 # 0x04...
+ movdqa 0x30($const), %xmm3 # 0x08...
+ movdqa 0x40($const), %xmm4 # .LM0
+ pcmpeqd %xmm5, %xmm5 # .LNOT
+
+ movdqu ($inp), %xmm6 # load round 1 key
+ movdqa %xmm7, ($out) # save round 0 key
+ lea 0x10($out), $out
+ dec $rounds
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+ pshufb %xmm4, %xmm6 # .LM0
+
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm9
+
+ pand %xmm6, %xmm8
+ pand %xmm6, %xmm9
+ movdqa %xmm2, %xmm10
+ pcmpeqb %xmm0, %xmm8
+ psllq \$4, %xmm0 # 0x10...
+ movdqa %xmm3, %xmm11
+ pcmpeqb %xmm1, %xmm9
+ psllq \$4, %xmm1 # 0x20...
+
+ pand %xmm6, %xmm10
+ pand %xmm6, %xmm11
+ movdqa %xmm0, %xmm12
+ pcmpeqb %xmm2, %xmm10
+ psllq \$4, %xmm2 # 0x40...
+ movdqa %xmm1, %xmm13
+ pcmpeqb %xmm3, %xmm11
+ psllq \$4, %xmm3 # 0x80...
+
+ movdqa %xmm2, %xmm14
+ movdqa %xmm3, %xmm15
+ pxor %xmm5, %xmm8 # "pnot"
+ pxor %xmm5, %xmm9
+
+ pand %xmm6, %xmm12
+ pand %xmm6, %xmm13
+ movdqa %xmm8, 0x00($out) # write bit-sliced round key
+ pcmpeqb %xmm0, %xmm12
+ psrlq \$4, %xmm0 # 0x01...
+ movdqa %xmm9, 0x10($out)
+ pcmpeqb %xmm1, %xmm13
+ psrlq \$4, %xmm1 # 0x02...
+ lea 0x10($inp), $inp
+
+ pand %xmm6, %xmm14
+ pand %xmm6, %xmm15
+ movdqa %xmm10, 0x20($out)
+ pcmpeqb %xmm2, %xmm14
+ psrlq \$4, %xmm2 # 0x04...
+ movdqa %xmm11, 0x30($out)
+ pcmpeqb %xmm3, %xmm15
+ psrlq \$4, %xmm3 # 0x08...
+ movdqu ($inp), %xmm6 # load next round key
+
+ pxor %xmm5, %xmm13 # "pnot"
+ pxor %xmm5, %xmm14
+ movdqa %xmm12, 0x40($out)
+ movdqa %xmm13, 0x50($out)
+ movdqa %xmm14, 0x60($out)
+ movdqa %xmm15, 0x70($out)
+ lea 0x80($out),$out
+ dec $rounds
+ jnz .Lkey_loop
+
+ movdqa 0x50($const), %xmm7 # .L63
+ #movdqa %xmm6, ($out) # don't save last round key
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
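+# _bsaes_key_convert expands each 16-byte round key (after a .LM0 shuffle
+# to match the state layout) into eight 16-byte masks: bit b of every key
+# byte becomes a full 0x00/0xff byte in slice b (pand against the
+# 0x01..0x08 patterns from .Lmasks, then pcmpeqb), so AddRoundKey on
+# bit-sliced state is just eight pxor's.  Slices 0, 1, 5 and 6 are
+# complemented on top (the "pnot" pxor's), which is the same as XORing
+# 0x63 -- the S-box affine constant -- into the key and lets the
+# bit-sliced Sbox omit those NOTs.  The last round key is left
+# unconverted in %xmm6, with the .L63 constant in %xmm7, for the caller
+# to fix up.  A C sketch of the expansion (naming ours):
+#
+#	for (int b = 0; b < 8; b++)          /* one slice per key bit */
+#		for (int i = 0; i < 16; i++)
+#			slice[b][i] = (key[i] >> b) & 1 ? 0xff : 0x00;
+#
+# The converted schedule is 16 + 128*(rounds-1) + 16 = 128*rounds - 96
+# bytes, which is the "shl 7 / sub (128-32)" arithmetic every caller
+# below performs when carving it off the stack.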
+
+if (0 && !$win64) { # the following four functions are an unsupported interface
+ # used only for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,\@function,2
+.align 16
+bsaes_enc_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+ ret
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,\@function,4
+.align 16
+bsaes_encrypt_128:
+.Lenc128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Lenc128_loop
+ ret
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,\@function,2
+.align 16
+bsaes_dec_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor ($out),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,($out)
+ ret
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,\@function,4
+.align 16
+bsaes_decrypt_128:
+.Ldec128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Ldec128_loop
+ ret
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+
+if ($ecb) {
+$code.=<<___;
+.globl bsaes_ecb_encrypt_blocks
+.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_encrypt_blocks:
+ mov %rsp, %rax
+.Lecb_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_enc_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ sub \$8,$len
+.Lecb_enc_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_enc_loop
+
+ add \$8,$len
+ jz .Lecb_enc_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_enc_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_enc_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_enc_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_enc_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_enc_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_enc_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_six:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_five:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_four:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_three:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_two:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_one:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_enc_short
+
+.Lecb_enc_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_enc_epilogue:
+ ret
+.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
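+ # A 1..7-block tail is handled by the cmp/jb/je ladder above: only the
+ # valid blocks are loaded, _bsaes_encrypt8 is still called eight-wide
+ # (the unused lanes hold don't-care data), and only the valid outputs
+ # are stored, in the permuted order 0,1,4,6,3,7,2,5.  C sketch (helper
+ # names illustrative):
+ #
+ #	n = len & 7;                       /* leftover blocks       */
+ #	for (i = 0; i < n; i++) load(b[i], inp + 16*i);
+ #	encrypt8(b);                       /* lanes n..7 ignored    */
+ #	for (i = 0; i < n; i++) store(out + 16*i, b[perm[i]]);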
+
+.globl bsaes_ecb_decrypt_blocks
+.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_decrypt_blocks:
+ mov %rsp, %rax
+.Lecb_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_dec_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ sub \$8,$len
+.Lecb_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_dec_loop
+
+ add \$8,$len
+ jz .Lecb_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_six:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_five:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_four:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_three:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_two:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_one:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_dec_short
+
+.Lecb_dec_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_dec_epilogue:
+ ret
+.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern asm_AES_cbc_encrypt
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,\@abi-omnipotent
+.align 16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+ mov 48(%rsp),$arg6 # pull direction flag
+___
+$code.=<<___;
+ cmp \$0,$arg6
+ jne asm_AES_cbc_encrypt
+ cmp \$128,$arg3
+ jb asm_AES_cbc_encrypt
+
+ mov %rsp, %rax
+.Lcbc_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ mov $arg5, %rbx
+ shr \$4, $len # bytes to blocks
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx), @XMM[15] # load IV
+ sub \$8,$len
+.Lcbc_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %edx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+
+ call _bsaes_decrypt8
+
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[13], @XMM[3]
+ movdqu 0x70($inp), @XMM[15] # IV
+ pxor @XMM[14], @XMM[5]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x80($inp), $inp
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lcbc_dec_loop
+
+ add \$8,$len
+ jz .Lcbc_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+ cmp \$2,$len
+ jb .Lcbc_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lcbc_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lcbc_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lcbc_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lcbc_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lcbc_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[15] # IV
+ pxor @XMM[13], @XMM[3]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[15] # IV
+ pxor @XMM[12], @XMM[7]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[15] # IV
+ pxor @XMM[11], @XMM[2]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[15] # IV
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[15] # IV
+ pxor @XMM[9], @XMM[6]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[15] # IV
+ pxor @XMM[8], @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ lea ($inp), $arg1
+ lea 0x20(%rbp), $arg2 # buffer output
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[15] # ^= IV
+ movdqu @XMM[15], ($out) # write output
+ movdqa @XMM[0], @XMM[15] # IV
+
+.Lcbc_dec_done:
+ movdqu @XMM[15], (%rbx) # return IV
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lcbc_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lcbc_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lcbc_dec_epilogue:
+ ret
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
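+ # Only CBC *de*cryption takes the bit-sliced path -- the encrypt
+ # direction and sub-128-byte inputs tail-call asm_AES_cbc_encrypt at the
+ # top -- because CBC decryption parallelizes: each plaintext block
+ # depends on just two ciphertext blocks.  Per 8-block iteration, as a C
+ # sketch (helper name ours):
+ #
+ #	for (i = 0; i < 8; i++) p[i] = aes_decrypt_block(c[i]);
+ #	p[0] ^= iv;
+ #	for (i = 1; i < 8; i++) p[i] ^= c[i-1];
+ #	iv = c[7];                         /* carried in xmm15 */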
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ mov %rsp, %rax
+.Lctr_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ movdqu ($arg5), %xmm0 # load counter
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ movdqa %xmm0, 0x20(%rbp) # copy counter
+ cmp \$8, $arg3
+ jb .Lctr_enc_short
+
+ mov %eax, %ebx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %ebx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ movdqa (%rsp), @XMM[9] # load round0 key
+ lea .LADD1(%rip), %r11
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
+ pshufb @XMM[8], @XMM[0]
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa @XMM[0], 0x20(%rbp) # save counter
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
+ movdqa @XMM[0], @XMM[2]
+ paddd 0x00(%r11), @XMM[1] # .LADD1
+ movdqa @XMM[0], @XMM[3]
+ paddd 0x10(%r11), @XMM[2] # .LADD2
+ movdqa @XMM[0], @XMM[4]
+ paddd 0x20(%r11), @XMM[3] # .LADD3
+ movdqa @XMM[0], @XMM[5]
+ paddd 0x30(%r11), @XMM[4] # .LADD4
+ movdqa @XMM[0], @XMM[6]
+ paddd 0x40(%r11), @XMM[5] # .LADD5
+ movdqa @XMM[0], @XMM[7]
+ paddd 0x50(%r11), @XMM[6] # .LADD6
+ paddd 0x60(%r11), @XMM[7] # .LADD7
+
+ # Borrow the prologue from _bsaes_encrypt8 to take the opportunity
+ # to flip the byte order in the 32-bit counter
+ movdqa (%rsp), @XMM[9] # round 0 key
+ lea 0x10(%rsp), %rax # pass key schedule
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ lea .LBS0(%rip), %r11 # constants table
+ pshufb @XMM[8], @XMM[7]
+ mov %ebx,%r10d # pass rounds
+
+ call _bsaes_encrypt8_bitslice
+
+ sub \$8,$len
+ jc .Lctr_enc_loop_done
+
+ movdqu 0x00($inp), @XMM[8] # load input
+ movdqu 0x10($inp), @XMM[9]
+ movdqu 0x20($inp), @XMM[10]
+ movdqu 0x30($inp), @XMM[11]
+ movdqu 0x40($inp), @XMM[12]
+ movdqu 0x50($inp), @XMM[13]
+ movdqu 0x60($inp), @XMM[14]
+ movdqu 0x70($inp), @XMM[15]
+ lea 0x80($inp),$inp
+ pxor @XMM[0], @XMM[8]
+ movdqa 0x20(%rbp), @XMM[0] # load counter
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[8], 0x00($out) # write output
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor @XMM[15], @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ lea .LADD1(%rip), %r11
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ paddd 0x70(%r11), @XMM[0] # .LADD8
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ add \$8, $len
+ movdqu 0x00($inp), @XMM[8] # load input
+ pxor @XMM[8], @XMM[0]
+ movdqu @XMM[0], 0x00($out) # write output
+ cmp \$2,$len
+ jb .Lctr_enc_done
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[1], 0x10($out)
+ je .Lctr_enc_done
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[4], 0x20($out)
+ cmp \$4,$len
+ jb .Lctr_enc_done
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[6], 0x30($out)
+ je .Lctr_enc_done
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[3], 0x40($out)
+ cmp \$6,$len
+ jb .Lctr_enc_done
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[7], 0x50($out)
+ je .Lctr_enc_done
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ lea 0x20(%rbp), $arg1
+ lea 0x30(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ movdqu ($inp), @XMM[1]
+ lea 16($inp), $inp
+ mov 0x2c(%rbp), %eax # load 32-bit counter
+ bswap %eax
+ pxor 0x30(%rbp), @XMM[1]
+ inc %eax # increment
+ movdqu @XMM[1], ($out)
+ bswap %eax
+ lea 16($out), $out
+ mov %eax, 0x2c(%rsp) # save 32-bit counter (%rsp==%rbp on this path)
+ dec $len
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lctr_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lctr_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lctr_enc_epilogue:
+ ret
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
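+ # The "ctr32" in the name is literal: only the low 32 bits of the
+ # counter are incremented, big-endian.  .LSWPUP byte-swaps the counter
+ # dword so plain paddd with .LADD1...LADD8 can do the arithmetic, and
+ # .LSWPUPM0SR swaps it back while applying the .LM0SR load permute.
+ # One-block C sketch of the counter arithmetic:
+ #
+ #	c = ((uint32_t)iv[12] << 24) | ((uint32_t)iv[13] << 16) |
+ #	    ((uint32_t)iv[14] << 8)  |  (uint32_t)iv[15];
+ #	c += n;                   /* wraps mod 2^32; iv[0..11] untouched */
+ #	iv[12] = c >> 24; iv[13] = c >> 16; iv[14] = c >> 8; iv[15] = c;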
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
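+# The tweak for block i+1 is tweak i multiplied by x in GF(2^128) with
+# reduction polynomial x^128 + x^7 + x^2 + x + 1; that is what the
+# pshufd/pand/paddq/pxor sequences against .Lxts_magic compute below
+# (paddq doubles each 64-bit half; the pshufd 0x13 mask both carries
+# bit 63 into the high half and folds bit 127 back in as 0x87).
+# Scalar C sketch of one doubling:
+#
+#	static void xts_tweak_double(uint64_t t[2])    /* t[0] = low half */
+#	{
+#		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
+#		t[1] = (t[1] << 1) | (t[0] >> 63);
+#		t[0] = (t[0] << 1) ^ carry;            /* .Lxts_magic */
+#	}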
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_encrypt:
+ mov %rsp, %rax
+.Lxts_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6, %xmm7 # fix up last round key
+ movdqa %xmm7, (%rax) # save last round key
+
+ and \$-16, $len
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ add \$0x80, $len
+ jz .Lxts_enc_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_enc_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_encrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_enc_done:
+ and \$15, %ebx
+ jz .Lxts_enc_ret
+ mov $out, %rdx
+
+.Lxts_enc_steal:
+ movzb ($inp), %eax
+ movzb -16(%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, -16(%rdx)
+ mov %cl, 0(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ movdqu @XMM[7], -16($out)
+
+.Lxts_enc_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_enc_epilogue:
+ ret
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
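+ # The .Lxts_enc_steal loop above is XTS ciphertext stealing: the
+ # plaintext tail displaces the leading bytes of the last full ciphertext
+ # block, the displaced bytes become the output tail, and the merged
+ # block is re-encrypted with the next tweak.  C sketch (names ours;
+ # last = last full output block, tail = input length mod 16):
+ #
+ #	for (i = 0; i < tail; i++) {
+ #		out_tail[i] = last[i];     /* stolen ciphertext   */
+ #		last[i]     = in_tail[i];  /* plaintext remainder */
+ #	}
+ #	xts_encrypt_block(last, next_tweak);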
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_decrypt:
+ mov %rsp, %rax
+.Lxts_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp), %xmm7 # fix up round 0 key
+ movdqa %xmm6, (%rax) # save last round key
+ movdqa %xmm7, (%rsp)
+
+ xor %eax, %eax # if ($len%16) len-=16;
+ and \$-16, $len
+ test \$15, %ebx
+ setnz %al
+ shl \$4, %rax
+ sub %rax, $len
+
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ add \$0x80, $len
+ jz .Lxts_dec_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_dec_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_decrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_dec_done:
+ and \$15, %ebx
+ jz .Lxts_dec_ret
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ movdqa @XMM[7], @XMM[6]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ movdqu ($inp), @XMM[0]
+ pxor $twres, @XMM[7]
+
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ mov $out, %rdx
+ movdqu @XMM[7], ($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp), %eax
+ movzb (%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, (%rdx)
+ mov %cl, 16(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu ($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[6], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[6]
+ movdqu @XMM[6], ($out)
+
+.Lxts_dec_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_dec_epilogue:
+ ret
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type _bsaes_const,\@object
+.align 64
+_bsaes_const:
+.LM0ISR: # InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0: # bit-slice constants
+ .quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+ .quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR: # shiftrows constants
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP: # byte-swap upper dword
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1: # counter increment constants
+ .quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+ .quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+ .quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+ .quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+ .quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+ .quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+ .quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+ .quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+ .long 0x87,0,1,0
+.Lmasks:
+ .quad 0x0101010101010101, 0x0101010101010101
+ .quad 0x0202020202020202, 0x0202020202020202
+ .quad 0x0404040404040404, 0x0404040404040404
+ .quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+ .quad 0x6363636363636363, 0x6363636363636363
+.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align 64
+.size _bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
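+# se_handler lets Windows unwind through these functions: if the fault
+# lies between the recorded prologue and epilogue labels, it recovers the
+# frame anchor the function stashed in %rbp, copies the ten saved xmm
+# registers back into the CONTEXT record, reloads the non-volatile GPRs
+# and points Rsp past the frame.  The magic offsets (120 = Rax,
+# 152 = Rsp, 160 = Rbp, 248 = Rip, 512 = Xmm6) are the Win64 CONTEXT
+# layout.  Rough C shape of the in-body case (a sketch, not SDK types):
+#
+#	if (prologue <= ctx->Rip && ctx->Rip < epilogue) {
+#		char *frame = (char *)ctx->Rbp;
+#		memcpy(&ctx->Xmm6, frame + 0x40, 10 * 16);
+#		/* reload rbx/rbp/r12..r15 from frame + 0xa0 + 0x48.. */
+#		ctx->Rsp = (ULONG64)(frame + 0xa0 + 0x78);
+#	}
+#	return ExceptionContinueSearch;    /* after RtlVirtualUnwind */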
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov 160($context),%rax # pull context->Rbp
+
+ lea 0x40(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0(%rax),%rax # adjust stack pointer
+
+ mov 0x70(%rax),%rbp
+ mov 0x68(%rax),%rbx
+ mov 0x60(%rax),%r12
+ mov 0x58(%rax),%r13
+ mov 0x50(%rax),%r14
+ mov 0x48(%rax),%r15
+ lea 0x78(%rax),%rax # adjust stack pointer
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($ecb);
+ .rva .Lecb_enc_prologue
+ .rva .Lecb_enc_epilogue
+ .rva .Lecb_enc_info
+
+ .rva .Lecb_dec_prologue
+ .rva .Lecb_dec_epilogue
+ .rva .Lecb_dec_info
+___
+$code.=<<___;
+ .rva .Lcbc_dec_prologue
+ .rva .Lcbc_dec_epilogue
+ .rva .Lcbc_dec_info
+
+ .rva .Lctr_enc_prologue
+ .rva .Lctr_enc_epilogue
+ .rva .Lctr_enc_info
+
+ .rva .Lxts_enc_prologue
+ .rva .Lxts_enc_epilogue
+ .rva .Lxts_enc_info
+
+ .rva .Lxts_dec_prologue
+ .rva .Lxts_dec_epilogue
+ .rva .Lxts_dec_info
+
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+.Lecb_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+.Lctr_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+.Lxts_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.Lxts_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/aes/asm/vpaes-x86.S b/crypto/aes/asm/vpaes-x86.S
new file mode 100644
index 0000000..c53a507
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86.S
@@ -0,0 +1,661 @@
+.file "vpaes-x86.s"
+.text
+.align 64
+.L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 64
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ addl (%esp),%ebp
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%edx),%xmm5
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa 16(%ebp),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal 192(%ebp),%ebx
+ jmp .L000enc_entry
+.align 16
+.L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebp),%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%ebx,%ecx,1),%xmm1
+ movdqa 80(%ebp),%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addl $16,%edx
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm3,%xmm0
+ subl $1,%eax
+.L000enc_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+ movdqu (%edx),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .L001enc_loop
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movl 240(%edx),%eax
+ leal 608(%ebp),%ebx
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal -352(%ebx,%ecx,1),%ecx
+ jmp .L002dec_entry
+.align 16
+.L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addl $16,%edx
+.byte 102,15,56,0,197
+ movdqa (%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subl $1,%eax
+.byte 102,15,56,0,197
+ movdqa 32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,197
+ movdqa 64(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,58,15,237,12
+.L002dec_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%edx),%xmm0
+ jnz .L003dec_loop
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+ addl (%esp),%ebp
+ movdqu (%esi),%xmm0
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx
+ movdqa %xmm2,4(%esp)
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz .L004schedule_am_decrypting
+ movdqu %xmm0,(%edx)
+ jmp .L005schedule_go
+.L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%edx)
+ xorl $48,%ecx
+.L005schedule_go:
+ cmpl $192,%eax
+ ja .L006schedule_256
+ je .L007schedule_192
+.L008schedule_128:
+ movl $10,%eax
+.L009loop_schedule_128:
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+ movdqu 8(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%eax
+.L011loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+ movdqu 16(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%eax
+.L012loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,20(%esp)
+ movdqa %xmm6,%xmm7
+ call .L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+ leal 384(%ebp),%ebx
+ testl %edi,%edi
+ jnz .L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193
+ leal 352(%ebp),%ebx
+ addl $32,%edx
+.L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15
+.byte 102,15,58,15,210,15
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+ movdqa %xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz .L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+ jmp .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+.L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ addl $-16,%ecx
+ andl $48,%ecx
+ movdqu %xmm3,(%edx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ movl $48,%ecx
+ movl $0,%edi
+ leal .L_vpaes_consts+0x30-.L016pic_point,%ebp
+ call _vpaes_schedule_core
+.L016pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal .L_vpaes_consts+0x30-.L017pic_point,%ebp
+ call _vpaes_schedule_core
+.L017pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L018pic_point,%ebp
+ call _vpaes_preheat
+.L018pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L019pic_point,%ebp
+ call _vpaes_preheat
+.L019pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L020cbc_abort
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp
+ andl $-16,%ebx
+ movl 40(%esp),%ecx
+ xchgl %esp,%ebx
+ movdqu (%ebp),%xmm1
+ subl %esi,%edi
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
+ call _vpaes_preheat
+.L021pic_point:
+ cmpl $0,%ecx
+ je .L022cbc_dec_loop
+ jmp .L023cbc_enc_loop
+.align 16
+.L023cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0
+ call _vpaes_encrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L023cbc_enc_loop
+ jmp .L024cbc_done
+.align 16
+.L022cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm0,32(%esp)
+ call _vpaes_decrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L022cbc_dec_loop
+.L024cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp
+ movdqu %xmm1,(%ebx)
+.L020cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 0000000..1533e2c
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,903 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Port of vpaes-x86_64.pl as a 32-bit "almost" drop-in replacement for
+# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (it doesn't have to if called from
+# EVP only). "Drop-in" implies that this module neither shares the key
+# schedule structure with the original nor makes assumptions
+# about its alignment...
+#
+# Performance summary. The aes-586.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with a 128-bit key; the vpaes-x86.pl column lists
+# [also large-block CBC] encrypt/decrypt results.
+#
+# aes-586.pl vpaes-x86.pl
+#
+# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
+# Nehalem 27.9/40.4/18.1 10.3/12.0
+# Atom 102./119./60.1 64.5/85.3(***)
+#
+# (*)	"Hyper-threading" in this context refers to cache shared among
+#	multiple cores rather than specifically to Intel HTT. As the
+#	vast majority of contemporary cores share cache, the slower code
+#	path is commonplace. In other words, "with-hyper-threading-off"
+#	results are presented mostly for reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	The less impressive improvement on Core 2 and Atom is due to
+#	their slow pshufb; it is nevertheless a respectable +32%/65%
+#	improvement on Core 2 and +58%/40% on Atom (as implied, over the
+#	"hyper-threading-safe" code path).
+#
+#
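+# For scale, a back-of-envelope conversion of the table above (a
+# sketch, assuming a 3GHz Nehalem clock and the 10.3 cycles/byte
+# encrypt figure; not part of the original benchmarks):
+#
+#	3e9 cycles/s / 10.3 cycles/byte ~= 291 MB/s per core
+#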
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$PREFIX="vpaes";
+
+my ($round, $base, $magic, $key, $const, $inp, $out)=
+ ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
+
+&static_label("_vpaes_consts");
+&static_label("_vpaes_schedule_low_round");
+
+&set_label("_vpaes_consts",64);
+$k_inv=-0x30; # inv, inva
+ &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
+ &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
+
+$k_s0F=-0x10; # s0F
+ &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
+
+$k_ipt=0x00; # input transform (lo, hi)
+ &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
+ &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
+
+$k_sb1=0x20; # sb1u, sb1t
+ &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
+ &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
+$k_sb2=0x40; # sb2u, sb2t
+ &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
+ &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
+$k_sbo=0x60; # sbou, sbot
+ &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
+ &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
+
+$k_mc_forward=0x80; # mc_forward
+ &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
+ &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
+ &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
+ &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
+
+$k_mc_backward=0xc0; # mc_backward
+ &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
+ &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
+ &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
+ &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
+
+$k_sr=0x100; # sr
+ &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
+ &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
+ &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
+ &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
+
+$k_rcon=0x140; # rcon
+ &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
+
+$k_s63=0x150; # s63: all equal to 0x63 transformed
+ &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
+
+$k_opt=0x160; # output transform
+ &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
+ &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
+
+$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
+ &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
+ &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
+##
+## Decryption stuff
+## Key schedule constants
+##
+$k_dksd=0x1a0; # decryption key schedule: invskew x*D
+ &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
+ &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
+$k_dksb=0x1c0; # decryption key schedule: invskew x*B
+ &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
+ &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
+$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
+ &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
+ &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
+$k_dks9=0x200; # decryption key schedule: invskew x*9
+ &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
+ &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
+
+##
+## Decryption stuff
+## Round function constants
+##
+$k_dipt=0x220; # decryption input transform
+ &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
+ &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
+
+$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
+ &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
+ &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
+$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
+ &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
+ &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
+$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
+ &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
+ &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
+$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
+ &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
+ &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
+$k_dsbo=0x2c0; # decryption sbox final output
+ &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
+ &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
+&align (64);
+
+&function_begin_B("_vpaes_preheat");
+ &add ($const,&DWP(0,"esp"));
+ &movdqa ("xmm7",&QWP($k_inv,$const));
+ &movdqa ("xmm6",&QWP($k_s0F,$const));
+ &ret ();
+&function_end_B("_vpaes_preheat");
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm6-%xmm7 as in _vpaes_preheat
+## (%edx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
+##
+##
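+## Schematically, the code below computes (a sketch; ipt/sbox/mix/sbo
+## name the pshufb table stages rather than real subroutines, and the
+## round keys are the pre-mangled ones produced by the schedule):
+##
+##	state = ipt(input) ^ key[0]
+##	state = mix(sbox(state) ^ key[i])	for i = 1 .. rounds
+##	output = shiftrows(sbo(state) ^ key[rounds+1])
+##
+## where "rounds" is the value at 240(key): 9, 11 or 13.
+##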
+&function_begin_B("_vpaes_encrypt_core");
+ &mov ($magic,16);
+ &mov ($round,&DWP(240,$key));
+	&movdqa	("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_ipt,$const));
+ &pandn ("xmm1","xmm0");
+ &movdqu ("xmm5",&QWP(0,$key));
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_ipt+16,$const));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm2","xmm5");
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($base,&DWP($k_mc_backward,$const));
+ &jmp (&label("enc_entry"));
+
+
+&set_label("enc_loop",16);
+ # middle of middle round
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
+ &pshufb ("xmm5","xmm2"); # 4 = sb2u
+ &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+ &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
+ &pxor ("xmm2","xmm5"); # 2 = 2A
+ &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
+ &movdqa ("xmm3","xmm0"); # 3 = A
+ &pshufb ("xmm0","xmm1"); # 0 = B
+ &add ($key,16); # next key
+ &pxor ("xmm0","xmm2"); # 0 = 2A+B
+ &pshufb ("xmm3","xmm4"); # 3 = D
+ &add ($magic,16); # next mc
+ &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
+ &pshufb ("xmm0","xmm1"); # 0 = 2B+C
+ &and ($magic,0x30); # ... mod 4
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
+ &sub ($round,1); # nr--
+
+&set_label("enc_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm5","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &movdqu ("xmm5",&QWP(0,$key));
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &jnz (&label("enc_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
+ &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm1");
+ &ret ();
+&function_end_B("_vpaes_encrypt_core");
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+&function_begin_B("_vpaes_decrypt_core");
+ &mov ($round,&DWP(240,$key));
+ &lea ($base,&DWP($k_dsbd,$const));
+ &movdqa ("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
+ &pandn ("xmm1","xmm0");
+ &mov ($magic,$round);
+	&psrld	("xmm1",4);
+ &movdqu ("xmm5",&QWP(0,$key));
+ &shl ($magic,4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
+ &xor ($magic,0x30);
+ &pshufb ("xmm0","xmm1");
+ &and ($magic,0x30);
+ &pxor ("xmm2","xmm5");
+ &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
+ &jmp (&label("dec_entry"));
+
+&set_label("dec_loop",16);
+##
+## Inverse mix columns
+##
+ &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
+ &pshufb ("xmm4","xmm2"); # 4 = sb9u
+ &pxor ("xmm4","xmm0");
+ &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
+ &pshufb ("xmm0","xmm3"); # 0 = sb9t
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &add ($key,16); # next round key
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
+ &pshufb ("xmm0","xmm3"); # 0 = sbdt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &sub ($round,1); # nr--
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
+ &pshufb ("xmm4","xmm2"); # 4 = sbbu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
+ &pshufb ("xmm0","xmm3"); # 0 = sbbt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
+ &pshufb ("xmm4","xmm2"); # 4 = sbeu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
+ &pshufb ("xmm0","xmm3"); # 0 = sbet
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &palignr("xmm5","xmm5",12);
+
+&set_label("dec_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqu ("xmm0",&QWP(0,$key));
+ &jnz (&label("dec_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm0"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
+ &movdqa ("xmm2",&QWP(0,$magic));
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_decrypt_core");
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+&function_begin_B("_vpaes_schedule_core");
+ &add ($const,&DWP(0,"esp"));
+ &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
+ &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
+
+ # input transform
+ &movdqa ("xmm3","xmm0");
+ &lea ($base,&DWP($k_ipt,$const));
+ &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
+ &call ("_vpaes_schedule_transform");
+ &movdqa ("xmm7","xmm0");
+
+ &test ($out,$out);
+ &jnz (&label("schedule_am_decrypting"));
+
+ # encrypting, output zeroth round key after transform
+ &movdqu (&QWP(0,$key),"xmm0");
+ &jmp (&label("schedule_go"));
+
+&set_label("schedule_am_decrypting");
+ # decrypting, output zeroth round key after shiftrows
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &movdqu (&QWP(0,$key),"xmm3");
+ &xor ($magic,0x30);
+
+&set_label("schedule_go");
+ &cmp ($round,192);
+ &ja (&label("schedule_256"));
+ &je (&label("schedule_192"));
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+&set_label("schedule_128");
+ &mov ($round,10);
+
+&set_label("loop_schedule_128");
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # write output
+ &jmp (&label("loop_schedule_128"));
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
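+## A quick key count as a sanity check (a sketch): AES-192 uses 13
+## round keys. One is stored before .schedule_go, each of the first
+## three trips through the loop stores three, the fourth stores two
+## before branching out, and .schedule_mangle_last stores the final
+## one:
+##
+##	1 + 3*3 + 2 + 1 == 13
+##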
+&set_label("schedule_192",16);
+ &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &movdqa ("xmm6","xmm0"); # save short part
+ &pxor ("xmm4","xmm4"); # clear 4
+ &movhlps("xmm6","xmm4"); # clobber low side with zeros
+ &mov ($round,4);
+
+&set_label("loop_schedule_192");
+ &call ("_vpaes_schedule_round");
+ &palignr("xmm0","xmm6",8);
+ &call ("_vpaes_schedule_mangle"); # save key n
+ &call ("_vpaes_schedule_192_smear");
+ &call ("_vpaes_schedule_mangle"); # save key n+1
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # save key n+2
+ &call ("_vpaes_schedule_192_smear");
+ &jmp (&label("loop_schedule_192"));
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
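+## Key count check (a sketch): AES-256 uses 15 round keys: one stored
+## before .schedule_go, two per trip for the first six trips through
+## the loop, one on the seventh trip before branching out, and the
+## final one in .schedule_mangle_last:
+##
+##	1 + 6*2 + 1 + 1 == 15
+##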
+&set_label("schedule_256",16);
+ &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &mov ($round,7);
+
+&set_label("loop_schedule_256");
+ &call ("_vpaes_schedule_mangle"); # output low result
+ &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
+
+ # high round
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle");
+
+ # low round. swap xmm7 and xmm6
+ &pshufd ("xmm0","xmm0",0xFF);
+ &movdqa (&QWP(20,"esp"),"xmm7");
+ &movdqa ("xmm7","xmm6");
+ &call ("_vpaes_schedule_low_round");
+ &movdqa ("xmm7",&QWP(20,"esp"));
+
+ &jmp (&label("loop_schedule_256"));
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+&set_label("schedule_mangle_last",16);
+ # schedule last round key from xmm0
+ &lea ($base,&DWP($k_deskew,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_last_dec"));
+
+ # encrypting
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm0","xmm1"); # output permute
+ &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
+ &add ($key,32);
+
+&set_label("schedule_mangle_last_dec");
+ &add ($key,-16);
+ &pxor ("xmm0",&QWP($k_s63,$const));
+ &call ("_vpaes_schedule_transform"); # output transform
+ &movdqu (&QWP(0,$key),"xmm0"); # save last key
+
+ # cleanup
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+ &ret ();
+&function_end_B("_vpaes_schedule_core");
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+##	a zeroed scratch register (%xmm1 here; there is no %xmm13 on x86)
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+&function_begin_B("_vpaes_schedule_192_smear");
+ &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
+ &pxor ("xmm6","xmm0"); # -> c+d c 0 0
+ &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
+ &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
+ &movdqa ("xmm0","xmm6");
+ &pxor ("xmm1","xmm1");
+ &movhlps("xmm6","xmm1"); # clobber low side with zeros
+ &ret ();
+&function_end_B("_vpaes_schedule_192_smear");
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm5.
+##
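+## The "smear" is a dword prefix-xor, obtained below with two
+## pslldq/pxor pairs. In scalar form (a sketch; unpack_dwords is
+## illustrative only, lane 0 being the lowest dword):
+##
+##	my @w = unpack_dwords($xmm7);		# (w0, w1, w2, w3)
+##	$w[$_] ^= $w[$_ - 1] for 1 .. 3;	# (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3)
+##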
+&function_begin_B("_vpaes_schedule_round");
+ # extract rcon from xmm8
+ &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
+ &pxor ("xmm1","xmm1");
+ &palignr("xmm1","xmm2",15);
+ &palignr("xmm2","xmm2",15);
+ &pxor ("xmm7","xmm1");
+
+ # rotate
+ &pshufd ("xmm0","xmm0",0xFF);
+ &palignr("xmm0","xmm0",1);
+
+ # fall through...
+ &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
+
+ # low round: same as high round, but no rotation and no rcon.
+&set_label("_vpaes_schedule_low_round");
+ # smear xmm7
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",4);
+ &pxor ("xmm7","xmm1");
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",8);
+ &pxor ("xmm7","xmm1");
+ &pxor ("xmm7",&QWP($k_s63,$const));
+
+ # subbyte
+ &movdqa ("xmm4",&QWP($k_s0F,$const));
+ &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
+ &movdqa ("xmm1","xmm4");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm4"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm5"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm5"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm5"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm5"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = sbox output
+
+ # add in smeared stuff
+ &pxor ("xmm0","xmm7");
+ &movdqa ("xmm7","xmm0");
+ &ret ();
+&function_end_B("_vpaes_schedule_round");
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%ebx)
+##
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
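+## Concretely, per byte (a sketch):
+##
+##	$out[$i] = $lo[ $in[$i] & 0x0F ] ^ $hi[ $in[$i] >> 4 ];
+##
+## with the $lo table at (%ebx) and $hi at 16(%ebx), both applied via
+## pshufb.
+##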
+&function_begin_B("_vpaes_schedule_transform");
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm2");
+ &movdqa ("xmm2",&QWP(0,$base));
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP(16,$base));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_schedule_transform");
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%edx), and increments or decrements it
+## Keeps track of round number mod 4 in %ecx
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
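+## The 0,1,1,1 circulant multiply on the encrypt path is realized as
+## three successive byte rotations (pshufb by .Lk_mc_forward)
+## accumulated with xor (a sketch):
+##
+##	$t = rot($x);  $acc  = $t;
+##	$t = rot($t);  $acc ^= $t;
+##	$t = rot($t);  $acc ^= $t;	# acc = rot1(x)^rot2(x)^rot3(x)
+##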
+&function_begin_B("_vpaes_schedule_mangle");
+ &movdqa ("xmm4","xmm0"); # save xmm0 for later
+ &movdqa ("xmm5",&QWP($k_mc_forward,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_dec"));
+
+ # encrypting
+ &add ($key,16);
+ &pxor ("xmm4",&QWP($k_s63,$const));
+ &pshufb ("xmm4","xmm5");
+ &movdqa ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+
+ &jmp (&label("schedule_mangle_both"));
+
+&set_label("schedule_mangle_dec",16);
+ # inverse mix columns
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &lea ($inp,&DWP($k_dksd,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm4");
+ &psrld ("xmm1",4); # 1 = hi
+ &pand ("xmm4","xmm2"); # 4 = lo
+
+ &movdqa ("xmm2",&QWP(0,$inp));
+ &pshufb ("xmm2","xmm4");
+ &movdqa ("xmm3",&QWP(0x10,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x20,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x30,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x40,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x50,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x60,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x70,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+
+ &add ($key,-16);
+
+&set_label("schedule_mangle_both");
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &add ($magic,-16);
+ &and ($magic,0x30);
+ &movdqu (&QWP(0,$key),"xmm3");
+ &ret ();
+&function_end_B("_vpaes_schedule_mangle");
+
+#
+# Interface to OpenSSL
+#
+&function_begin("${PREFIX}_set_encrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &mov ($magic,0x30);
+ &mov ($out,0);
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_encrypt_key");
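+
+# For the three key sizes, the rounds value stored above works out as
+# follows (a sketch; by the cores' loop structure this is the number
+# of rounds before the final one, which is handled separately):
+#
+#	(128 >> 5) + 5 ==  9
+#	(192 >> 5) + 5 == 11
+#	(256 >> 5) + 5 == 13
+#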
+
+&function_begin("${PREFIX}_set_decrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &shl ($base,4);
+ &lea ($key,&DWP(16,$key,$base));
+
+ &mov ($out,1);
+ &mov ($magic,$round);
+ &shr ($magic,1);
+ &and ($magic,32);
+	&xor	($magic,32);		# nbits==192?0:32;
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_decrypt_key");
+
+&function_begin("${PREFIX}_encrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_encrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_encrypt");
+
+&function_begin("${PREFIX}_decrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_decrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_decrypt");
+
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0)); # inp
+ &mov ($out,&wparam(1)); # out
+ &mov ($round,&wparam(2)); # len
+ &mov ($key,&wparam(3)); # key
+ &sub ($round,16);
+ &jc (&label("cbc_abort"));
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($const,&wparam(4)); # ivp
+ &and ($base,-16);
+ &mov ($magic,&wparam(5)); # enc
+ &xchg ($base,"esp"); # alloca
+ &movdqu ("xmm1",&QWP(0,$const)); # load IV
+ &sub ($out,$inp);
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov (&DWP(0,"esp"),$out); # save out
+	&mov	(&DWP(4,"esp"),$key);		# save key
+ &mov (&DWP(8,"esp"),$const); # save ivp
+ &mov ($out,$round); # $out works as $len
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &cmp ($magic,0);
+ &je (&label("cbc_dec_loop"));
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &pxor ("xmm0","xmm1"); # inp^=iv
+ &call ("_vpaes_encrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &movdqa ("xmm1","xmm0");
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_enc_loop"));
+ &jmp (&label("cbc_done"));
+
+&set_label("cbc_dec_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
+ &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
+ &call ("_vpaes_decrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
+ &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_dec_loop"));
+
+&set_label("cbc_done");
+ &mov ($base,&DWP(8,"esp")); # restore ivp
+ &mov ("esp",&DWP(48,"esp"));
+ &movdqu (&QWP(0,$base),"xmm1"); # write IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+&asm_finish();
diff --git a/crypto/aes/asm/vpaes-x86_64.S b/crypto/aes/asm/vpaes-x86_64.S
new file mode 100644
index 0000000..2b68e61
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86_64.S
@@ -0,0 +1,828 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa %xmm15,%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%r11,%r10,1),%xmm1
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm3,%xmm0
+ subq $1,%rax
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+ movdqu (%r9),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_dipt+16(%rip),%xmm0
+ xorq $48,%r11
+ leaq .Lk_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm5,%xmm2
+ movdqa .Lk_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+
+.byte 102,15,56,0,197
+ movdqa 0(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subq $1,%rax
+
+.byte 102,15,56,0,197
+ movdqa 32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,56,0,197
+ movdqa 64(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,58,15,237,12
+
+.Ldec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%r9),%xmm0
+ jnz .Ldec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ .byte 0xf3,0xc3
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa .Lk_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq .Lk_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq .Lk_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz .Lschedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $48,%r8
+
+.Lschedule_go:
+ cmpl $192,%esi
+ ja .Lschedule_256
+ je .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+ movl $10,%esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_mangle_last:
+
+ leaq .Lk_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq .Lk_opt(%rip),%r11
+ addq $32,%rdx
+
+.Lschedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor .Lk_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor .Lk_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor .Lk_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+
+ leaq .Lk_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $48,%r8
+ movdqu %xmm3,(%rdx)
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $48,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_decrypt,.-vpaes_decrypt
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,(%r8)
+.Lcbc_abort:
+ .byte 0xf3,0xc3
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+
+
+
+
+
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ leaq .Lk_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ .byte 0xf3,0xc3
+.size _vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type _vpaes_consts,@object
+.align 64
+_vpaes_consts:
+.Lk_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 0000000..41f2e46
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1207 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original, nor does it make assumptions
+# about its alignment...
+#
+# Performance summary. aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86_64.pl column -
+# [also large-block CBC] encrypt/decrypt.
+#
+# aes-x86_64.pl vpaes-x86_64.pl
+#
+# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
+# Nehalem 30.5/42.2/14.6 9.8/11.8
+# Atom 63.9/79.0/32.1 64.0/84.8(***)
+#
+# (*) "Hyper-threading" in the context refers rather to cache shared
+# among multiple cores, than to specifically Intel HTT. As vast
+# majority of contemporary cores share cache, slower code path
+# is common place. In other words "with-hyper-threading-off"
+# results are presented mostly for reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***) The less impressive improvement on Core 2 and Atom is due to
+#	slow pshufb, yet it is a respectable +40%/78% improvement on
+#	Core 2 (as implied, over the "hyper-threading-safe" code path).
+#
+#
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type _vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+ mov %rdx, %r9
+ mov \$16, %r11
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor %xmm5, %xmm2
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ lea .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm5 # 4 : sb2u
+ pshufb %xmm2, %xmm5 # 4 = sb2u
+ movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm5, %xmm2 # 2 = 2A
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ add \$16, %r9 # next key
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb %xmm4, %xmm3 # 3 = D
+ add \$16, %r11 # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ and \$0x30, %r11 # ... mod 4
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ sub \$1,%rax # nr--
+
+.Lenc_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm5 # 2 : a/k
+ pshufb %xmm0, %xmm5 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ movdqu (%r9), %xmm5
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Lenc_loop
+
+ # middle of last round
+ movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm1, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_decrypt_core:
+ mov %rdx, %r9 # load key
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ mov %rax, %r11
+ psrld \$4, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ shl \$4, %r11
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
+ xor \$0x30, %r11
+ lea .Lk_dsbd(%rip),%r10
+ pshufb %xmm1, %xmm0
+ and \$0x30, %r11
+ pxor %xmm5, %xmm2
+ movdqa .Lk_mc_forward+48(%rip), %xmm5
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ add %r10, %r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+## Inverse mix columns
+##
+ movdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor %xmm0, %xmm4
+ movdqa -0x10(%r10),%xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ pxor %xmm4, %xmm0 # 0 = ch
+ add \$16, %r9 # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x10(%r10),%xmm0 # 0 : sbdt
+ pshufb %xmm3, %xmm0 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x30(%r10),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x50(%r10),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ palignr \$12, %xmm5, %xmm5
+
+.Ldec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqu (%r9), %xmm0
+ jnz .Ldec_loop
+
+ # middle of last round
+ movdqa 0x60(%r10), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm0, %xmm4 # 4 = sb1u + k
+ movdqa 0x70(%r10), %xmm0 # 0 : sbot
+ movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm2, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_schedule_core,\@abi-omnipotent
+.align 16
+_vpaes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+
+ call _vpaes_preheat # load the tables
+ movdqa .Lk_rcon(%rip), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqa %xmm0, %xmm3
+ lea .Lk_ipt(%rip), %r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0, %xmm7
+
+ lea .Lk_sr(%rip),%r10
+ test %rcx, %rcx
+ jnz .Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqu %xmm0, (%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm3
+ movdqu %xmm3, (%rdx)
+ xor \$0x30, %r8
+
+.Lschedule_go:
+ cmp \$192, %esi
+ ja .Lschedule_256
+ je .Lschedule_192
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov \$10, %esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # write output
+ jmp .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call _vpaes_schedule_transform # input transform
+ movdqa %xmm0, %xmm6 # save short part
+ pxor %xmm4, %xmm4 # clear 4
+ movhlps %xmm4, %xmm6 # clobber low side with zeros
+ mov \$4, %esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+ palignr \$8,%xmm6,%xmm0
+ call _vpaes_schedule_mangle # save key n
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle # save key n+1
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # save key n+2
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call _vpaes_schedule_transform # input transform
+ mov \$7, %esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd \$0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Loop_schedule_256
+
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 16
+.Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_last_dec
+
+ # encrypting
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm0 # output permute
+ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add \$32, %rdx
+
+.Lschedule_mangle_last_dec:
+ add \$-16, %rdx
+ pxor .Lk_s63(%rip), %xmm0
+ call _vpaes_schedule_transform # output transform
+ movdqu %xmm0, (%rdx) # save last key
+
+ # cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,\@abi-omnipotent
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm0, %xmm6 # -> b+c+d b+c b a
+ movdqa %xmm6, %xmm0
+ pxor %xmm1, %xmm1
+ movhlps %xmm1, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,\@abi-omnipotent
+.align 16
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr \$15, %xmm8, %xmm1
+ palignr \$15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd \$0xFF, %xmm0, %xmm0
+ palignr \$1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%rip), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa %xmm13, %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa %xmm12, %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,\@abi-omnipotent
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,\@abi-omnipotent
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_dec
+
+ # encrypting
+ add \$16, %rdx
+ pxor .Lk_s63(%rip),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+ # inverse mix columns
+ lea .Lk_dksd(%rip),%r11
+ movdqa %xmm9, %xmm1
+ pandn %xmm4, %xmm1
+ psrld \$4, %xmm1 # 1 = hi
+ pand %xmm9, %xmm4 # 4 = lo
+
+ movdqa 0x00(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa 0x10(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x20(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x30(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x40(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x50(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x60(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x70(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+
+ add \$-16, %rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1,%xmm3
+ add \$-16, %r8
+ and \$0x30, %r8
+ movdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+#
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@function,3
+.align 16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
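+					# (nbits/32+5 = 9/11/13 for
+					# 128/192/256-bit keys; the cores run
+					# that many middle rounds and handle
+					# the last round separately)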
+
+ mov \$0,%ecx
+ mov \$0x30,%r8d
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@function,3
+.align 16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ shl \$4,%eax
+ lea 16(%rdx,%rax),%rdx
+
+ mov \$1,%ecx
+ mov %esi,%r8d
+ shr \$1,%r8d
+ and \$32,%r8d
+ xor \$32,%r8d # nbits==192?0:32
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@function,3
+.align 16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@function,3
+.align 16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ xchg $key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+ sub \$16,$len
+ jc .Lcbc_abort
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+ movdqu ($ivp),%xmm6 # load IV
+ sub $inp,$out
+ call _vpaes_preheat
+ cmp \$0,${enc}d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu ($inp),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu ($inp),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,($ivp) # save IV
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
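+# For reference, .Lcbc_enc_loop/.Lcbc_dec_loop above implement plain CBC
+# chaining on whole 16-byte blocks only. A C sketch of the same logic,
+# with hypothetical xor_16()/encrypt_block()/decrypt_block() helpers
+# standing in for the vpaes cores:
+#
+#	for (; len >= 16; in += 16, out += 16, len -= 16) {
+#		if (enc) {
+#			xor_16(iv, in);		/* iv ^= P        */
+#			encrypt_block(iv, key);	/* iv  = E(P^IV)  */
+#			memcpy(out, iv, 16);	/* emit C, chain  */
+#		} else {
+#			memcpy(tmp, in, 16);
+#			decrypt_block(tmp, key);
+#			xor_16(tmp, iv);	/* P = D(C) ^ IV  */
+#			memcpy(iv, in, 16);	/* chain on C     */
+#			memcpy(out, tmp, 16);
+#		}
+#	}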
+$code.=<<___;
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_preheat,\@abi-omnipotent
+.align 16
+_vpaes_preheat:
+ lea .Lk_s0F(%rip), %r10
+ movdqa -0x20(%r10), %xmm10 # .Lk_inv
+ movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
+ movdqa 0x00(%r10), %xmm9 # .Lk_s0F
+ movdqa 0x30(%r10), %xmm13 # .Lk_sb1
+ movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
+ movdqa 0x50(%r10), %xmm15 # .Lk_sb2
+ movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+.type _vpaes_consts,\@object
+.align 64
+_vpaes_consts:
+.Lk_inv: # inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F: # s0F
+ .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt: # input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1: # sb1u, sb1t
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2: # sb2u, sb2t
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo: # sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward: # mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr: # sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon: # rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63: # s63: all equal to 0x63 transformed
+ .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt: # output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+.Lk_dksd: # decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: # decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: # decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+## Decryption stuff
+## Round function constants
+##
+.Lk_dipt: # decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo: # decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xb8(%rax),%rax # adjust stack pointer
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_${PREFIX}_set_encrypt_key
+ .rva .LSEH_info_${PREFIX}_set_encrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_${PREFIX}_set_decrypt_key
+ .rva .LSEH_info_${PREFIX}_set_decrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_encrypt
+ .rva .LSEH_end_${PREFIX}_encrypt
+ .rva .LSEH_info_${PREFIX}_encrypt
+
+ .rva .LSEH_begin_${PREFIX}_decrypt
+ .rva .LSEH_end_${PREFIX}_decrypt
+ .rva .LSEH_info_${PREFIX}_decrypt
+
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_${PREFIX}_cbc_encrypt
+
+.section .xdata
+.align 8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
new file mode 100644
index 0000000..5a83107
--- /dev/null
+++ b/crypto/arm_arch.h
@@ -0,0 +1,51 @@
+#ifndef __ARM_ARCH_H__
+#define __ARM_ARCH_H__
+
+#if !defined(__ARM_ARCH__)
+# if defined(__CC_ARM)
+# define __ARM_ARCH__ __TARGET_ARCH_ARM
+# if defined(__BIG_ENDIAN)
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
+# elif defined(__GNUC__)
+ /*
+ * Why doesn't gcc define __ARM_ARCH__? Instead it defines a
+ * bunch of the macros below. See the all_architectures[] table in
+ * gcc/config/arm/arm.c. On a side note, it defines
+ * __ARMEL__/__ARMEB__ for little-/big-endian.
+ */
+# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
+ defined(__ARM_ARCH_7EM__)
+# define __ARM_ARCH__ 7
+# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
+ defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
+ defined(__ARM_ARCH_6T2__)
+# define __ARM_ARCH__ 6
+# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
+ defined(__ARM_ARCH_5TEJ__)
+# define __ARM_ARCH__ 5
+# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+# define __ARM_ARCH__ 4
+# else
+# error "unsupported ARM architecture"
+# endif
+# endif
+#endif
+
+#ifdef OPENSSL_FIPSCANISTER
+#include <openssl/fipssyms.h>
+#endif
+
+#if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+
+#define ARMV7_NEON (1<<0)
+#define ARMV7_TICK (1<<1)
+#endif
+
+#endif
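+
+/* Usage sketch (an illustration only, with neon_path()/generic_path()
+ * as hypothetical dispatch targets): code can branch on the capability
+ * bits at run time, e.g.
+ *
+ *	if (OPENSSL_armcap_P & ARMV7_NEON)
+ *		neon_path();
+ *	else
+ *		generic_path();
+ */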
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
new file mode 100644
index 0000000..2d618de
--- /dev/null
+++ b/crypto/armv4cpuid.S
@@ -0,0 +1,154 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ .word 0xf26ee1fe @ vorr q15,q15,q15
+ .word 0xe12fff1e @ bx lr
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+ mrc p15,0,r0,c9,c13,0
+ .word 0xe12fff1e @ bx lr
+.size _armv7_tick,.-_armv7_tick
+
+.global OPENSSL_atomic_add
+.type OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd: ldrex r2,[r0]
+ add r3,r2,r1
+ strex r2,r3,[r0]
+ cmp r2,#0
+ bne .Ladd
+ mov r0,r3
+ .word 0xe12fff1e @ bx lr
+#else
+ stmdb sp!,{r4-r6,lr}
+ ldr r2,.Lspinlock
+ adr r3,.Lspinlock
+ mov r4,r0
+ mov r5,r1
+ add r6,r3,r2 @ &spinlock
+ b .+8
+.Lspin: bl sched_yield
+ mov r0,#-1
+ swp r0,r0,[r6]
+ cmp r0,#0
+ bne .Lspin
+
+ ldr r2,[r4]
+ add r2,r2,r5
+ str r2,[r4]
+ str r0,[r6] @ release spinlock
+ ldmia sp!,{r4-r6,lr}
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
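+
+@ The ARMv6+ path above is the standard load-exclusive/store-exclusive
+@ retry loop, while the pre-ARMv6 path falls back to a SWP-based
+@ spinlock. As a C sketch, the function behaves like the GCC builtin:
+@
+@	int OPENSSL_atomic_add(int *addr, int amount)
+@	{ return __sync_add_and_fetch(addr, amount); }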
+
+.global OPENSSL_cleanse
+.type OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+ eor ip,ip,ip
+ cmp r1,#7
+ subhs r1,r1,#4
+ bhs .Lot
+ cmp r1,#0
+ beq .Lcleanse_done
+.Little:
+ strb ip,[r0],#1
+ subs r1,r1,#1
+ bhi .Little
+ b .Lcleanse_done
+
+.Lot: tst r0,#3
+ beq .Laligned
+ strb ip,[r0],#1
+ sub r1,r1,#1
+ b .Lot
+.Laligned:
+ str ip,[r0],#4
+ subs r1,r1,#4
+ bhs .Laligned
+ adds r1,r1,#4
+ bne .Little
+.Lcleanse_done:
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
+
+.global OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+ ldr r0,.LOPENSSL_armcap
+ adr r1,.LOPENSSL_armcap
+ ldr r0,[r1,r0]
+ eor r2,r2,r2
+ eor r3,r3,r3
+ eor ip,ip,ip
+ tst r0,#1
+ beq .Lwipe_done
+ .word 0xf3000150 @ veor q0, q0, q0
+ .word 0xf3022152 @ veor q1, q1, q1
+ .word 0xf3044154 @ veor q2, q2, q2
+ .word 0xf3066156 @ veor q3, q3, q3
+ .word 0xf34001f0 @ veor q8, q8, q8
+ .word 0xf34221f2 @ veor q9, q9, q9
+ .word 0xf34441f4 @ veor q10, q10, q10
+ .word 0xf34661f6 @ veor q11, q11, q11
+ .word 0xf34881f8 @ veor q12, q12, q12
+ .word 0xf34aa1fa @ veor q13, q13, q13
+ .word 0xf34cc1fc @ veor q14, q14, q14
+ .word 0xf34ee1fe @ veor q15, q15, q15
+.Lwipe_done:
+ mov r0,sp
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LOPENSSL_armcap
+#if __ARM_ARCH__>=6
+.align 5
+#else
+.Lspinlock:
+.word atomic_add_spinlock-.Lspinlock
+.align 5
+
+.data
+.align 2
+atomic_add_spinlock:
+.word 0
+#endif
+
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
diff --git a/crypto/asn1/a_d2i_fp.c b/crypto/asn1/a_d2i_fp.c
index ece40bc..52b2ebd 100644
--- a/crypto/asn1/a_d2i_fp.c
+++ b/crypto/asn1/a_d2i_fp.c
@@ -57,6 +57,7 @@
*/
#include <stdio.h>
+#include <limits.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include <openssl/asn1_mac.h>
@@ -143,17 +144,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
BUF_MEM *b;
unsigned char *p;
int i;
- int ret=-1;
ASN1_const_CTX c;
- int want=HEADER_SIZE;
+ size_t want=HEADER_SIZE;
int eos=0;
-#if defined(__GNUC__) && defined(__ia64)
- /* pathetic compiler bug in all known versions as of Nov. 2002 */
- long off=0;
-#else
- int off=0;
-#endif
- int len=0;
+ size_t off=0;
+ size_t len=0;
b=BUF_MEM_new();
if (b == NULL)
@@ -169,7 +164,7 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
want-=(len-off);
- if (!BUF_MEM_grow_clean(b,len+want))
+ if (len + want < len || !BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
goto err;
@@ -181,7 +176,14 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
goto err;
}
if (i > 0)
+ {
+ if (len+i < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
len+=i;
+ }
}
/* else data already loaded */
@@ -206,6 +208,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
/* no data body so go round again */
eos++;
+ if (eos < 0)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_HEADER_TOO_LONG);
+ goto err;
+ }
want=HEADER_SIZE;
}
else if (eos && (c.slen == 0) && (c.tag == V_ASN1_EOC))
@@ -220,10 +227,16 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
else
{
/* suck in c.slen bytes of data */
- want=(int)c.slen;
+ want=c.slen;
if (want > (len-off))
{
want-=(len-off);
+ if (want > INT_MAX /* BIO_read takes an int length */ ||
+ len+want < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
if (!BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
@@ -238,11 +251,18 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
ASN1_R_NOT_ENOUGH_DATA);
goto err;
}
+ /* This can't overflow because
+ * |len+want| didn't overflow. */
len+=i;
- want -= i;
+ want-=i;
}
}
- off+=(int)c.slen;
+ if (off + c.slen < off)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+ off+=c.slen;
if (eos <= 0)
{
break;
@@ -252,9 +272,15 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
}
}
+ if (off > INT_MAX)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+
*pb = b;
return off;
err:
if (b != NULL) BUF_MEM_free(b);
- return(ret);
+ return -1;
}
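
The overflow checks added above share one idiom: with unsigned types,
a + b wrapped around exactly when the sum compares smaller than an
operand. A minimal sketch of the guard, assuming size_t operands as in
the patched function:

	size_t len, want;	/* as in asn1_d2i_read_bio() */
	if (len + want < len)	/* sum wrapped past SIZE_MAX */
		return -1;	/* reject rather than under-allocate */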
diff --git a/crypto/asn1/a_digest.c b/crypto/asn1/a_digest.c
index d00d9e2..cbdeea6 100644
--- a/crypto/asn1/a_digest.c
+++ b/crypto/asn1/a_digest.c
@@ -87,7 +87,8 @@ int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data,
p=str;
i2d(data,&p);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
@@ -104,7 +105,8 @@ int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn,
i=ASN1_item_i2d(asn,&str, it);
if (!str) return(0);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
diff --git a/crypto/asn1/a_int.c b/crypto/asn1/a_int.c
index 3348b87..ad0d250 100644
--- a/crypto/asn1/a_int.c
+++ b/crypto/asn1/a_int.c
@@ -386,8 +386,8 @@ long ASN1_INTEGER_get(const ASN1_INTEGER *a)
if (a->length > (int)sizeof(long))
{
- /* hmm... a bit ugly */
- return(0xffffffffL);
+ /* hmm... a bit ugly, return all ones */
+ return -1;
}
if (a->data == NULL)
return 0;
diff --git a/crypto/asn1/a_sign.c b/crypto/asn1/a_sign.c
index ff63bfc..7b4a193 100644
--- a/crypto/asn1/a_sign.c
+++ b/crypto/asn1/a_sign.c
@@ -184,9 +184,9 @@ int ASN1_sign(i2d_of_void *i2d, X509_ALGOR *algor1, X509_ALGOR *algor2,
p=buf_in;
i2d(data,&p);
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
+ if (!EVP_SignInit_ex(&ctx,type, NULL)
+ || !EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl)
+ || !EVP_SignFinal(&ctx,(unsigned char *)buf_out,
(unsigned int *)&outl,pkey))
{
outl=0;
@@ -218,65 +218,100 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
const EVP_MD *type)
{
EVP_MD_CTX ctx;
+ EVP_MD_CTX_init(&ctx);
+ if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey))
+ {
+ EVP_MD_CTX_cleanup(&ctx);
+ return 0;
+ }
+ return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx);
+ }
+
+
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+ X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx)
+ {
+ const EVP_MD *type;
+ EVP_PKEY *pkey;
unsigned char *buf_in=NULL,*buf_out=NULL;
- int inl=0,outl=0,outll=0;
+ size_t inl=0,outl=0,outll=0;
int signid, paramtype;
+ int rv;
+
+ type = EVP_MD_CTX_md(ctx);
+ pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx);
- if (type == NULL)
+ if (!type || !pkey)
{
- int def_nid;
- if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
- type = EVP_get_digestbynid(def_nid);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED);
+ return 0;
}
- if (type == NULL)
+ if (pkey->ameth->item_sign)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN, ASN1_R_NO_DEFAULT_DIGEST);
- return 0;
+ rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2,
+ signature);
+ if (rv == 1)
+ outl = signature->length;
+ /* Return value meanings:
+ * <=0: error.
+ * 1: method does everything.
+ * 2: carry on as normal.
+ * 3: ASN1 method sets algorithm identifiers: just sign.
+ */
+ if (rv <= 0)
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+ if (rv <= 1)
+ goto err;
}
+ else
+ rv = 2;
- if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
+ if (rv == 2)
{
- if (!pkey->ameth ||
- !OBJ_find_sigid_by_algs(&signid, EVP_MD_nid(type),
- pkey->ameth->pkey_id))
+ if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,
- ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
- return 0;
+ if (!pkey->ameth ||
+ !OBJ_find_sigid_by_algs(&signid,
+ EVP_MD_nid(type),
+ pkey->ameth->pkey_id))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,
+ ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+ return 0;
+ }
}
- }
- else
- signid = type->pkey_type;
+ else
+ signid = type->pkey_type;
- if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
- paramtype = V_ASN1_NULL;
- else
- paramtype = V_ASN1_UNDEF;
+ if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
+ paramtype = V_ASN1_NULL;
+ else
+ paramtype = V_ASN1_UNDEF;
- if (algor1)
- X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
- if (algor2)
- X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor1)
+ X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor2)
+ X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+
+ }
- EVP_MD_CTX_init(&ctx);
inl=ASN1_item_i2d(asn,&buf_in, it);
outll=outl=EVP_PKEY_size(pkey);
- buf_out=(unsigned char *)OPENSSL_malloc((unsigned int)outl);
+ buf_out=OPENSSL_malloc((unsigned int)outl);
if ((buf_in == NULL) || (buf_out == NULL))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_MALLOC_FAILURE);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_MALLOC_FAILURE);
goto err;
}
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
- (unsigned int *)&outl,pkey))
+ if (!EVP_DigestSignUpdate(ctx, buf_in, inl)
+ || !EVP_DigestSignFinal(ctx, buf_out, &outl))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_EVP_LIB);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_EVP_LIB);
goto err;
}
if (signature->data != NULL) OPENSSL_free(signature->data);
@@ -289,7 +324,7 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
signature->flags&= ~(ASN1_STRING_FLAG_BITS_LEFT|0x07);
signature->flags|=ASN1_STRING_FLAG_BITS_LEFT;
err:
- EVP_MD_CTX_cleanup(&ctx);
+ EVP_MD_CTX_cleanup(ctx);
if (buf_in != NULL)
{ OPENSSL_cleanse((char *)buf_in,(unsigned int)inl); OPENSSL_free(buf_in); }
if (buf_out != NULL)
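
The new ASN1_item_sign_ctx() entry point takes a caller-configured
digest context instead of a bare EVP_MD, and ASN1_item_sign() above is
now a thin wrapper around it. A usage sketch (error handling elided;
assumes pkey is an initialised EVP_PKEY and EVP_sha256() is just an
example digest):

	EVP_MD_CTX ctx;
	EVP_MD_CTX_init(&ctx);
	if (EVP_DigestSignInit(&ctx, NULL, EVP_sha256(), NULL, pkey)
	    && ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx))
		; /* signature now holds the result */
	/* ctx is cleaned up inside ASN1_item_sign_ctx() */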
diff --git a/crypto/asn1/a_strex.c b/crypto/asn1/a_strex.c
index 264ebf2..ead37ac 100644
--- a/crypto/asn1/a_strex.c
+++ b/crypto/asn1/a_strex.c
@@ -567,6 +567,7 @@ int ASN1_STRING_to_UTF8(unsigned char **out, ASN1_STRING *in)
if(mbflag == -1) return -1;
mbflag |= MBSTRING_FLAG;
stmp.data = NULL;
+ stmp.length = 0;
ret = ASN1_mbstring_copy(&str, in->data, in->length, mbflag, B_ASN1_UTF8STRING);
if(ret < 0) return ret;
*out = stmp.data;
diff --git a/crypto/asn1/a_verify.c b/crypto/asn1/a_verify.c
index cecdb13..fc84cd3 100644
--- a/crypto/asn1/a_verify.c
+++ b/crypto/asn1/a_verify.c
@@ -101,8 +101,13 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
p=buf_in;
i2d(data,&p);
- EVP_VerifyInit_ex(&ctx,type, NULL);
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_VerifyInit_ex(&ctx,type, NULL)
+ || !EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
@@ -126,16 +131,21 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
#endif
-int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signature,
- void *asn, EVP_PKEY *pkey)
+int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a,
+ ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey)
{
EVP_MD_CTX ctx;
- const EVP_MD *type = NULL;
unsigned char *buf_in=NULL;
int ret= -1,inl;
int mdnid, pknid;
+ if (!pkey)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_PASSED_NULL_PARAMETER);
+ return -1;
+ }
+
EVP_MD_CTX_init(&ctx);
/* Convert signature OID into digest and public key OIDs */
@@ -144,25 +154,47 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
goto err;
}
- type=EVP_get_digestbynid(mdnid);
- if (type == NULL)
+ if (mdnid == NID_undef)
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
- goto err;
+ if (!pkey->ameth || !pkey->ameth->item_verify)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+ goto err;
+ }
+ ret = pkey->ameth->item_verify(&ctx, it, asn, a,
+ signature, pkey);
+		/* Return value of 2 means carry on; anything else means we
+		 * exit straight away: either a fatal error occurred or the
+		 * underlying verification routine handles all verification.
+ */
+ if (ret != 2)
+ goto err;
+ ret = -1;
}
-
- /* Check public key OID matches public key type */
- if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ else
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
- goto err;
- }
+ const EVP_MD *type;
+ type=EVP_get_digestbynid(mdnid);
+ if (type == NULL)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+ goto err;
+ }
+
+ /* Check public key OID matches public key type */
+ if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
+ goto err;
+ }
+
+ if (!EVP_DigestVerifyInit(&ctx, NULL, type, NULL, pkey))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
- if (!EVP_VerifyInit_ex(&ctx,type, NULL))
- {
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
- ret=0;
- goto err;
}
inl = ASN1_item_i2d(asn, &buf_in, it);
@@ -173,13 +205,18 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
goto err;
}
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_DigestVerifyUpdate(&ctx,buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
- if (EVP_VerifyFinal(&ctx,(unsigned char *)signature->data,
- (unsigned int)signature->length,pkey) <= 0)
+ if (EVP_DigestVerifyFinal(&ctx,signature->data,
+ (size_t)signature->length) <= 0)
{
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
ret=0;
diff --git a/crypto/asn1/ameth_lib.c b/crypto/asn1/ameth_lib.c
index 5a581b9..a19e058 100644
--- a/crypto/asn1/ameth_lib.c
+++ b/crypto/asn1/ameth_lib.c
@@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
/* Keep this sorted in type order !! */
static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
@@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
#ifndef OPENSSL_NO_EC
&eckey_asn1_meth,
#endif
- &hmac_asn1_meth
+ &hmac_asn1_meth,
+ &cmac_asn1_meth
};
typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
@@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
if (!ameth)
return NULL;
+ memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD));
+
ameth->pkey_id = id;
ameth->pkey_base_id = id;
ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
@@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
ameth->old_priv_encode = 0;
ameth->old_priv_decode = 0;
+ ameth->item_verify = 0;
+ ameth->item_sign = 0;
+
ameth->pkey_size = 0;
ameth->pkey_bits = 0;
@@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst,
dst->pkey_free = src->pkey_free;
dst->pkey_ctrl = src->pkey_ctrl;
+ dst->item_sign = src->item_sign;
+ dst->item_verify = src->item_verify;
+
}
void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
diff --git a/crypto/asn1/asn1.h b/crypto/asn1/asn1.h
index 59540e4..220a0c8 100644
--- a/crypto/asn1/asn1.h
+++ b/crypto/asn1/asn1.h
@@ -235,7 +235,7 @@ typedef struct asn1_object_st
*/
#define ASN1_STRING_FLAG_MSTRING 0x040
/* This is the base type that holds just about everything :-) */
-typedef struct asn1_string_st
+struct asn1_string_st
{
int length;
int type;
@@ -245,7 +245,7 @@ typedef struct asn1_string_st
* input data has a non-zero 'unused bits' value, it will be
* handled correctly */
long flags;
- } ASN1_STRING;
+ };
/* ASN1_ENCODING structure: this is used to save the received
* encoding of an ASN1 type. This is useful to get round
@@ -293,7 +293,6 @@ DECLARE_STACK_OF(ASN1_STRING_TABLE)
* see asn1t.h
*/
typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE;
-typedef struct ASN1_ITEM_st ASN1_ITEM;
typedef struct ASN1_TLC_st ASN1_TLC;
/* This is just an opaque pointer */
typedef struct ASN1_VALUE_st ASN1_VALUE;
@@ -1194,6 +1193,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_ASN1_ITEM_I2D_FP 193
#define ASN1_F_ASN1_ITEM_PACK 198
#define ASN1_F_ASN1_ITEM_SIGN 195
+#define ASN1_F_ASN1_ITEM_SIGN_CTX 220
#define ASN1_F_ASN1_ITEM_UNPACK 199
#define ASN1_F_ASN1_ITEM_VERIFY 197
#define ASN1_F_ASN1_MBSTRING_NCOPY 122
@@ -1266,6 +1266,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_PKCS5_PBE2_SET_IV 167
#define ASN1_F_PKCS5_PBE_SET 202
#define ASN1_F_PKCS5_PBE_SET0_ALGOR 215
+#define ASN1_F_PKCS5_PBKDF2_SET 219
#define ASN1_F_SMIME_READ_ASN1 212
#define ASN1_F_SMIME_TEXT 213
#define ASN1_F_X509_CINF_NEW 168
@@ -1291,6 +1292,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_R_BOOLEAN_IS_WRONG_LENGTH 106
#define ASN1_R_BUFFER_TOO_SMALL 107
#define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER 108
+#define ASN1_R_CONTEXT_NOT_INITIALISED 217
#define ASN1_R_DATA_IS_WRONG 109
#define ASN1_R_DECODE_ERROR 110
#define ASN1_R_DECODING_ERROR 111
diff --git a/crypto/asn1/asn1_err.c b/crypto/asn1/asn1_err.c
index 6e04d08..1a30bf1 100644
--- a/crypto/asn1/asn1_err.c
+++ b/crypto/asn1/asn1_err.c
@@ -1,6 +1,6 @@
/* crypto/asn1/asn1_err.c */
/* ====================================================================
- * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -107,6 +107,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP), "ASN1_item_i2d_fp"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_PACK), "ASN1_item_pack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN), "ASN1_item_sign"},
+{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX), "ASN1_item_sign_ctx"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK), "ASN1_item_unpack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY), "ASN1_item_verify"},
{ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY), "ASN1_mbstring_ncopy"},
@@ -179,6 +180,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV), "PKCS5_pbe2_set_iv"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET), "PKCS5_pbe_set"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR), "PKCS5_pbe_set0_algor"},
+{ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET), "PKCS5_pbkdf2_set"},
{ERR_FUNC(ASN1_F_SMIME_READ_ASN1), "SMIME_read_ASN1"},
{ERR_FUNC(ASN1_F_SMIME_TEXT), "SMIME_text"},
{ERR_FUNC(ASN1_F_X509_CINF_NEW), "X509_CINF_NEW"},
@@ -207,6 +209,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]=
{ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"},
{ERR_REASON(ASN1_R_BUFFER_TOO_SMALL) ,"buffer too small"},
{ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),"cipher has no object identifier"},
+{ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED),"context not initialised"},
{ERR_REASON(ASN1_R_DATA_IS_WRONG) ,"data is wrong"},
{ERR_REASON(ASN1_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(ASN1_R_DECODING_ERROR) ,"decoding error"},
diff --git a/crypto/asn1/asn1_locl.h b/crypto/asn1/asn1_locl.h
index 5aa65e2..9fcf0d9 100644
--- a/crypto/asn1/asn1_locl.h
+++ b/crypto/asn1/asn1_locl.h
@@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st
int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
ASN1_PCTX *pctx);
+ int (*sig_print)(BIO *out,
+ const X509_ALGOR *sigalg, const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx);
+
void (*pkey_free)(EVP_PKEY *pkey);
int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
@@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st
int (*old_priv_decode)(EVP_PKEY *pkey,
const unsigned char **pder, int derlen);
int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+ /* Custom ASN1 signature verification */
+ int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *a, ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey);
+ int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *alg1, X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig);
} /* EVP_PKEY_ASN1_METHOD */;
diff --git a/crypto/asn1/asn_mime.c b/crypto/asn1/asn_mime.c
index c1d1b12..54a704a 100644
--- a/crypto/asn1/asn_mime.c
+++ b/crypto/asn1/asn_mime.c
@@ -377,8 +377,12 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
BIO *tmpbio;
const ASN1_AUX *aux = it->funcs;
ASN1_STREAM_ARG sarg;
+ int rv = 1;
- if (!(flags & SMIME_DETACHED))
+	/* If the data is not detached, or we are resigning, then the output BIO is
+ * already set up to finalise when it is written through.
+ */
+ if (!(flags & SMIME_DETACHED) || (flags & PKCS7_REUSE_DIGEST))
{
SMIME_crlf_copy(data, out, flags);
return 1;
@@ -405,7 +409,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
/* Finalize structure */
if (aux->asn1_cb(ASN1_OP_DETACHED_POST, &val, it, &sarg) <= 0)
- return 0;
+ rv = 0;
/* Now remove any digests prepended to the BIO */
@@ -416,7 +420,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
sarg.ndef_bio = tmpbio;
}
- return 1;
+ return rv;
}
@@ -486,9 +490,9 @@ ASN1_VALUE *SMIME_read_ASN1(BIO *bio, BIO **bcont, const ASN1_ITEM *it)
if(strcmp(hdr->value, "application/x-pkcs7-signature") &&
strcmp(hdr->value, "application/pkcs7-signature")) {
- sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
ASN1err(ASN1_F_SMIME_READ_ASN1,ASN1_R_SIG_INVALID_MIME_TYPE);
ERR_add_error_data(2, "type: ", hdr->value);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
sk_BIO_pop_free(parts, BIO_vfree);
return NULL;
}
@@ -801,7 +805,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(name) {
if(!(tmpname = BUF_strdup(name))) return NULL;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -811,7 +815,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(value) {
if(!(tmpval = BUF_strdup(value))) return NULL;
for(p = tmpval ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -835,7 +839,7 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
tmpname = BUF_strdup(name);
if(!tmpname) return 0;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -858,12 +862,17 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
static int mime_hdr_cmp(const MIME_HEADER * const *a,
const MIME_HEADER * const *b)
{
+ if (!(*a)->name || !(*b)->name)
+ return !!(*a)->name - !!(*b)->name;
+
return(strcmp((*a)->name, (*b)->name));
}
static int mime_param_cmp(const MIME_PARAM * const *a,
const MIME_PARAM * const *b)
{
+ if (!(*a)->param_name || !(*b)->param_name)
+ return !!(*a)->param_name - !!(*b)->param_name;
return(strcmp((*a)->param_name, (*b)->param_name));
}
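The guards added to mime_hdr_cmp and mime_param_cmp use the `!!a - !!b` idiom: `!!p` normalises a pointer to 0 or 1, so entries with a NULL name sort before named ones and strcmp is never handed a NULL pointer. A standalone sketch of the idiom (not OpenSSL code):

    #include <stdio.h>
    #include <string.h>

    /* Order NULL names first, otherwise compare lexically. */
    static int name_cmp(const char *a, const char *b)
    {
        if (a == NULL || b == NULL)
            return !!a - !!b;   /* 0-0=0, 0-1=-1, 1-0=1 */
        return strcmp(a, b);
    }

    int main(void)
    {
        printf("%d\n", name_cmp(NULL, "ct"));  /* negative: NULL sorts first */
        printf("%d\n", name_cmp(NULL, NULL));  /* zero */
        printf("%d\n", name_cmp("a", "b"));    /* negative */
        return 0;
    }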
diff --git a/crypto/asn1/n_pkey.c b/crypto/asn1/n_pkey.c
index e7d0439..e251739 100644
--- a/crypto/asn1/n_pkey.c
+++ b/crypto/asn1/n_pkey.c
@@ -129,6 +129,7 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
unsigned char buf[256],*zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
if (a == NULL) return(0);
@@ -206,24 +207,28 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
i = strlen((char *)buf);
/* If the key is used for SGC the algorithm is modified a little. */
if(sgckey) {
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
/* Encrypt private key in place */
zz = enckey->enckey->digest->data;
- EVP_CIPHER_CTX_init(&ctx);
- EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL);
- EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen);
- EVP_EncryptFinal_ex(&ctx,zz + i,&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL))
+ goto err;
+ if (!EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen))
+ goto err;
+ if (!EVP_EncryptFinal_ex(&ctx,zz + i,&j))
+ goto err;
ret = i2d_NETSCAPE_ENCRYPTED_PKEY(enckey, pp);
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_ENCRYPTED_PKEY_free(enckey);
NETSCAPE_PKEY_free(pkey);
return(ret);
@@ -288,6 +293,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
const unsigned char *zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
i=cb((char *)buf,256,"Enter Private Key password:",0);
if (i != 0)
@@ -298,19 +304,22 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
i = strlen((char *)buf);
if(sgckey){
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
- EVP_CIPHER_CTX_init(&ctx);
- EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL);
- EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length);
- EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL))
+ goto err;
+ if (!EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j))
+ goto err;
os->length=i+j;
zz=os->data;
@@ -328,6 +337,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
goto err;
}
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_PKEY_free(pkey);
return(ret);
}
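The i2d_RSA_NET and d2i_RSA_NET_2 changes all follow one shape: initialise the EVP_CIPHER_CTX before the first goto err can fire, check the return value of every EVP call, and perform the cleanup once in the error path so it runs on success and failure alike. A condensed sketch of that shape using the 1.0.x stack-allocated API; the function itself is illustrative:

    #include <openssl/evp.h>

    /* RC4-encrypt buf in place with a 16-byte key; returns output length or 0. */
    static int encrypt_in_place(unsigned char *buf, int len,
                                const unsigned char key[16])
    {
        EVP_CIPHER_CTX ctx;
        int outl = 0, tmpl = 0, ret = 0;

        EVP_CIPHER_CTX_init(&ctx);      /* from here on, cleanup is always safe */
        if (!EVP_EncryptInit_ex(&ctx, EVP_rc4(), NULL, key, NULL))
            goto err;
        if (!EVP_EncryptUpdate(&ctx, buf, &outl, buf, len))
            goto err;
        if (!EVP_EncryptFinal_ex(&ctx, buf + outl, &tmpl))
            goto err;
        ret = outl + tmpl;
    err:
        EVP_CIPHER_CTX_cleanup(&ctx);   /* runs on every path */
        return ret;
    }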
diff --git a/crypto/asn1/p5_pbev2.c b/crypto/asn1/p5_pbev2.c
index cb49b66..4ea6830 100644
--- a/crypto/asn1/p5_pbev2.c
+++ b/crypto/asn1/p5_pbev2.c
@@ -91,12 +91,10 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
unsigned char *aiv, int prf_nid)
{
X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL;
- int alg_nid;
+ int alg_nid, keylen;
EVP_CIPHER_CTX ctx;
unsigned char iv[EVP_MAX_IV_LENGTH];
- PBKDF2PARAM *kdf = NULL;
PBE2PARAM *pbe2 = NULL;
- ASN1_OCTET_STRING *osalt = NULL;
ASN1_OBJECT *obj;
alg_nid = EVP_CIPHER_type(cipher);
@@ -127,7 +125,8 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
EVP_CIPHER_CTX_init(&ctx);
/* Dummy cipherinit to just setup the IV, and PRF */
- EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0);
+ if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0))
+ goto err;
if(EVP_CIPHER_param_to_asn1(&ctx, scheme->parameter) < 0) {
ASN1err(ASN1_F_PKCS5_PBE2_SET_IV,
ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
@@ -145,55 +144,21 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
}
EVP_CIPHER_CTX_cleanup(&ctx);
- if(!(kdf = PBKDF2PARAM_new())) goto merr;
- if(!(osalt = M_ASN1_OCTET_STRING_new())) goto merr;
-
- if (!saltlen) saltlen = PKCS5_SALT_LEN;
- if (!(osalt->data = OPENSSL_malloc (saltlen))) goto merr;
- osalt->length = saltlen;
- if (salt) memcpy (osalt->data, salt, saltlen);
- else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) goto merr;
-
- if(iter <= 0) iter = PKCS5_DEFAULT_ITER;
- if(!ASN1_INTEGER_set(kdf->iter, iter)) goto merr;
-
- /* Now include salt in kdf structure */
- kdf->salt->value.octet_string = osalt;
- kdf->salt->type = V_ASN1_OCTET_STRING;
- osalt = NULL;
-
/* If its RC2 then we'd better setup the key length */
- if(alg_nid == NID_rc2_cbc) {
- if(!(kdf->keylength = M_ASN1_INTEGER_new())) goto merr;
- if(!ASN1_INTEGER_set (kdf->keylength,
- EVP_CIPHER_key_length(cipher))) goto merr;
- }
-
- /* prf can stay NULL if we are using hmacWithSHA1 */
- if (prf_nid != NID_hmacWithSHA1)
- {
- kdf->prf = X509_ALGOR_new();
- if (!kdf->prf)
- goto merr;
- X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
- V_ASN1_NULL, NULL);
- }
-
- /* Now setup the PBE2PARAM keyfunc structure */
+ if(alg_nid == NID_rc2_cbc)
+ keylen = EVP_CIPHER_key_length(cipher);
+ else
+ keylen = -1;
- pbe2->keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+ /* Setup keyfunc */
- /* Encode PBKDF2PARAM into parameter of pbe2 */
+ X509_ALGOR_free(pbe2->keyfunc);
- if(!(pbe2->keyfunc->parameter = ASN1_TYPE_new())) goto merr;
+ pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
- if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
- &pbe2->keyfunc->parameter->value.sequence)) goto merr;
- pbe2->keyfunc->parameter->type = V_ASN1_SEQUENCE;
-
- PBKDF2PARAM_free(kdf);
- kdf = NULL;
+ if (!pbe2->keyfunc)
+ goto merr;
/* Now set up top level AlgorithmIdentifier */
@@ -219,8 +184,6 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
err:
PBE2PARAM_free(pbe2);
/* Note 'scheme' is freed as part of pbe2 */
- M_ASN1_OCTET_STRING_free(osalt);
- PBKDF2PARAM_free(kdf);
X509_ALGOR_free(kalg);
X509_ALGOR_free(ret);
@@ -233,3 +196,85 @@ X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
{
return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
}
+
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+ int prf_nid, int keylen)
+ {
+ X509_ALGOR *keyfunc = NULL;
+ PBKDF2PARAM *kdf = NULL;
+ ASN1_OCTET_STRING *osalt = NULL;
+
+ if(!(kdf = PBKDF2PARAM_new()))
+ goto merr;
+ if(!(osalt = M_ASN1_OCTET_STRING_new()))
+ goto merr;
+
+ kdf->salt->value.octet_string = osalt;
+ kdf->salt->type = V_ASN1_OCTET_STRING;
+
+ if (!saltlen)
+ saltlen = PKCS5_SALT_LEN;
+ if (!(osalt->data = OPENSSL_malloc (saltlen)))
+ goto merr;
+
+ osalt->length = saltlen;
+
+ if (salt)
+ memcpy (osalt->data, salt, saltlen);
+ else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0)
+ goto merr;
+
+ if(iter <= 0)
+ iter = PKCS5_DEFAULT_ITER;
+
+ if(!ASN1_INTEGER_set(kdf->iter, iter))
+ goto merr;
+
+ /* If we have a key length, set it up */
+
+ if(keylen > 0)
+ {
+ if(!(kdf->keylength = M_ASN1_INTEGER_new()))
+ goto merr;
+ if(!ASN1_INTEGER_set (kdf->keylength, keylen))
+ goto merr;
+ }
+
+ /* prf can stay NULL if we are using hmacWithSHA1 */
+ if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1)
+ {
+ kdf->prf = X509_ALGOR_new();
+ if (!kdf->prf)
+ goto merr;
+ X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
+ V_ASN1_NULL, NULL);
+ }
+
+ /* Finally setup the keyfunc structure */
+
+ keyfunc = X509_ALGOR_new();
+ if (!keyfunc)
+ goto merr;
+
+ keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+
+ /* Encode PBKDF2PARAM into parameter of pbe2 */
+
+ if(!(keyfunc->parameter = ASN1_TYPE_new()))
+ goto merr;
+
+ if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
+ &keyfunc->parameter->value.sequence))
+ goto merr;
+ keyfunc->parameter->type = V_ASN1_SEQUENCE;
+
+ PBKDF2PARAM_free(kdf);
+ return keyfunc;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_PBKDF2_SET,ERR_R_MALLOC_FAILURE);
+ PBKDF2PARAM_free(kdf);
+ X509_ALGOR_free(keyfunc);
+ return NULL;
+ }
+
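A short usage sketch for the PKCS5_pbkdf2_set() helper introduced above, matching its parameter conventions: salt == NULL requests random salt bytes, saltlen == 0 selects PKCS5_SALT_LEN, prf_nid <= 0 keeps the hmacWithSHA1 default, and keylen <= 0 omits the optional keyLength field.

    #include <openssl/x509.h>

    /* PBKDF2 AlgorithmIdentifier: random default-length salt, 2048
     * iterations, default PRF, no explicit key length. */
    static X509_ALGOR *default_pbkdf2(void)
    {
        return PKCS5_pbkdf2_set(2048, NULL, 0, -1, -1);
    }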
diff --git a/crypto/asn1/t_crl.c b/crypto/asn1/t_crl.c
index ee5a687..c611692 100644
--- a/crypto/asn1/t_crl.c
+++ b/crypto/asn1/t_crl.c
@@ -94,8 +94,7 @@ int X509_CRL_print(BIO *out, X509_CRL *x)
l = X509_CRL_get_version(x);
BIO_printf(out, "%8sVersion %lu (0x%lx)\n", "", l+1, l);
i = OBJ_obj2nid(x->sig_alg->algorithm);
- BIO_printf(out, "%8sSignature Algorithm: %s\n", "",
- (i == NID_undef) ? "NONE" : OBJ_nid2ln(i));
+ X509_signature_print(out, x->sig_alg, NULL);
p=X509_NAME_oneline(X509_CRL_get_issuer(x),NULL,0);
BIO_printf(out,"%8sIssuer: %s\n","",p);
OPENSSL_free(p);
diff --git a/crypto/asn1/t_x509.c b/crypto/asn1/t_x509.c
index e061f2f..edbb39a 100644
--- a/crypto/asn1/t_x509.c
+++ b/crypto/asn1/t_x509.c
@@ -72,6 +72,7 @@
#include <openssl/objects.h>
#include <openssl/x509.h>
#include <openssl/x509v3.h>
+#include "asn1_locl.h"
#ifndef OPENSSL_NO_FP_API
int X509_print_fp(FILE *fp, X509 *x)
@@ -137,10 +138,10 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if (BIO_write(bp," Serial Number:",22) <= 0) goto err;
bs=X509_get_serialNumber(x);
- if (bs->length <= 4)
+ if (bs->length <= (int)sizeof(long))
{
l=ASN1_INTEGER_get(bs);
- if (l < 0)
+ if (bs->type == V_ASN1_NEG_INTEGER)
{
l= -l;
neg="-";
@@ -167,12 +168,16 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if(!(cflag & X509_FLAG_NO_SIGNAME))
{
+ if(X509_signature_print(bp, x->sig_alg, NULL) <= 0)
+ goto err;
+#if 0
if (BIO_printf(bp,"%8sSignature Algorithm: ","") <= 0)
goto err;
if (i2a_ASN1_OBJECT(bp, ci->signature->algorithm) <= 0)
goto err;
if (BIO_puts(bp, "\n") <= 0)
goto err;
+#endif
}
if(!(cflag & X509_FLAG_NO_ISSUER))
@@ -255,7 +260,8 @@ int X509_ocspid_print (BIO *bp, X509 *x)
goto err;
i2d_X509_NAME(x->cert_info->subject, &dertmp);
- EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) goto err;
@@ -268,8 +274,10 @@ int X509_ocspid_print (BIO *bp, X509 *x)
if (BIO_printf(bp,"\n Public key OCSP hash: ") <= 0)
goto err;
- EVP_Digest(x->cert_info->key->public_key->data,
- x->cert_info->key->public_key->length, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(x->cert_info->key->public_key->data,
+ x->cert_info->key->public_key->length,
+ SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0)
@@ -283,23 +291,50 @@ int X509_ocspid_print (BIO *bp, X509 *x)
return(0);
}
-int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+int X509_signature_dump(BIO *bp, const ASN1_STRING *sig, int indent)
{
- unsigned char *s;
+ const unsigned char *s;
int i, n;
- if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
- if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
n=sig->length;
s=sig->data;
+ for (i=0; i<n; i++)
+ {
+ if ((i%18) == 0)
+ {
+ if (BIO_write(bp,"\n",1) <= 0) return 0;
+ if (BIO_indent(bp, indent, indent) <= 0) return 0;
+ }
+ if (BIO_printf(bp,"%02x%s",s[i],
+ ((i+1) == n)?"":":") <= 0) return 0;
+ }
+ if (BIO_write(bp,"\n",1) != 1) return 0;
+
+ return 1;
+ }
+
+int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+ {
+ int sig_nid;
+ if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
+ if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
+
+ sig_nid = OBJ_obj2nid(sigalg->algorithm);
+ if (sig_nid != NID_undef)
+ {
+ int pkey_nid, dig_nid;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ if (OBJ_find_sigid_algs(sig_nid, &dig_nid, &pkey_nid))
+ {
+ ameth = EVP_PKEY_asn1_find(NULL, pkey_nid);
+ if (ameth && ameth->sig_print)
+ return ameth->sig_print(bp, sigalg, sig, 9, 0);
+ }
+ }
+ if (sig)
+ return X509_signature_dump(bp, sig, 9);
+ else if (BIO_puts(bp, "\n") <= 0)
+ return 0;
return 1;
}
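With the rework above, X509_signature_print() first looks for an algorithm-specific sig_print handler and only then falls back to X509_signature_dump(); passing sig == NULL prints just the algorithm line. A small caller sketch against the 1.0.1 structure layout used elsewhere in this patch:

    #include <openssl/bio.h>
    #include <openssl/x509.h>

    /* Print a certificate's signature algorithm and value to out. */
    static int print_cert_signature(BIO *out, X509 *x)
    {
        return X509_signature_print(out, x->sig_alg, x->signature);
    }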
diff --git a/crypto/asn1/tasn_prn.c b/crypto/asn1/tasn_prn.c
index 4536980..542a091 100644
--- a/crypto/asn1/tasn_prn.c
+++ b/crypto/asn1/tasn_prn.c
@@ -446,11 +446,11 @@ static int asn1_print_fsname(BIO *out, int indent,
return 1;
}
-static int asn1_print_boolean_ctx(BIO *out, const int bool,
+static int asn1_print_boolean_ctx(BIO *out, int boolval,
const ASN1_PCTX *pctx)
{
const char *str;
- switch (bool)
+ switch (boolval)
{
case -1:
str = "BOOL ABSENT";
@@ -574,10 +574,10 @@ static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld,
{
case V_ASN1_BOOLEAN:
{
- int bool = *(int *)fld;
- if (bool == -1)
- bool = it->size;
- ret = asn1_print_boolean_ctx(out, bool, pctx);
+ int boolval = *(int *)fld;
+ if (boolval == -1)
+ boolval = it->size;
+ ret = asn1_print_boolean_ctx(out, boolval, pctx);
}
break;
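The bool to boolval rename exists because bool is a keyword in C++ and a macro once <stdbool.h> is in effect, so the old parameter name broke builds that pull these sources into such translation units. A minimal illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* static int print_bool(int bool);  -- error: "bool" expands to _Bool */
    static int print_bool(int boolval)   /* renamed identifier always compiles */
    {
        return printf(boolval ? "TRUE\n" : "FALSE\n") > 0;
    }

    int main(void)
    {
        bool v = true;   /* the macro the old parameter name collided with */
        return !print_bool(v);
    }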
diff --git a/crypto/asn1/x_algor.c b/crypto/asn1/x_algor.c
index 99e5342..274e456 100644
--- a/crypto/asn1/x_algor.c
+++ b/crypto/asn1/x_algor.c
@@ -128,3 +128,17 @@ void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
}
}
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md)
+ {
+ int param_type;
+
+ if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+ param_type = V_ASN1_UNDEF;
+ else
+ param_type = V_ASN1_NULL;
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+ }
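A usage sketch for the new X509_ALGOR_set_md() helper: it fills in the digest OID and chooses between an absent parameter field and an explicit ASN.1 NULL based on the digest's EVP_MD_FLAG_DIGALGID_ABSENT flag.

    #include <openssl/evp.h>
    #include <openssl/x509.h>

    /* DigestAlgorithmIdentifier for SHA-256 (parameters = ASN.1 NULL). */
    static X509_ALGOR *sha256_algor(void)
    {
        X509_ALGOR *alg = X509_ALGOR_new();
        if (alg != NULL)
            X509_ALGOR_set_md(alg, EVP_sha256());
        return alg;
    }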
diff --git a/crypto/asn1/x_name.c b/crypto/asn1/x_name.c
index 49be08b..d7c2318 100644
--- a/crypto/asn1/x_name.c
+++ b/crypto/asn1/x_name.c
@@ -399,8 +399,7 @@ static int asn1_string_canon(ASN1_STRING *out, ASN1_STRING *in)
/* If type not in bitmask just copy string across */
if (!(ASN1_tag2bit(in->type) & ASN1_MASK_CANON))
{
- out->type = in->type;
- if (!ASN1_STRING_set(out, in->data, in->length))
+ if (!ASN1_STRING_copy(out, in))
return 0;
return 1;
}
diff --git a/crypto/asn1/x_pubkey.c b/crypto/asn1/x_pubkey.c
index d42b6a2..b649e1f 100644
--- a/crypto/asn1/x_pubkey.c
+++ b/crypto/asn1/x_pubkey.c
@@ -171,7 +171,19 @@ EVP_PKEY *X509_PUBKEY_get(X509_PUBKEY *key)
goto error;
}
- key->pkey = ret;
+ /* Check to see if another thread set key->pkey first */
+ CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY);
+ if (key->pkey)
+ {
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ EVP_PKEY_free(ret);
+ ret = key->pkey;
+ }
+ else
+ {
+ key->pkey = ret;
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ }
CRYPTO_add(&ret->references, 1, CRYPTO_LOCK_EVP_PKEY);
return ret;
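The X509_PUBKEY_get() change is a decode-then-recheck race fix: the expensive decode happens without the lock held, and the cache slot is re-tested under CRYPTO_LOCK_EVP_PKEY before publishing, discarding the duplicate if another thread won. A generic sketch of the pattern using pthreads rather than the OpenSSL locking API:

    #include <pthread.h>

    static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Publish mine into *slot unless another thread beat us to it. */
    static void *publish_or_adopt(void **slot, void *mine,
                                  void (*discard)(void *))
    {
        void *winner;

        pthread_mutex_lock(&slot_lock);
        if (*slot != NULL)
            {
            winner = *slot;                 /* lost the race */
            pthread_mutex_unlock(&slot_lock);
            discard(mine);                  /* free our duplicate */
            return winner;
            }
        *slot = mine;                       /* won: install our result */
        pthread_mutex_unlock(&slot_lock);
        return mine;
    }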
diff --git a/crypto/bf/asm/bf-586.S b/crypto/bf/asm/bf-586.S
new file mode 100644
index 0000000..aa718d4
--- /dev/null
+++ b/crypto/bf/asm/bf-586.S
@@ -0,0 +1,896 @@
+.file "bf-586.s"
+.text
+.globl BF_encrypt
+.type BF_encrypt,@function
+.align 16
+BF_encrypt:
+.L_BF_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl (%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
+ movl 4(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 8(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 12(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 16(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 20(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 24(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 28(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 32(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 36(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 40(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 44(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 48(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 52(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 56(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 60(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 64(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl 68(%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_encrypt,.-.L_BF_encrypt_begin
+.globl BF_decrypt
+.type BF_decrypt,@function
+.align 16
+BF_decrypt:
+.L_BF_decrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl 68(%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
+ movl 64(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 60(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 56(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 52(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 48(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 44(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 40(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 36(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 32(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 28(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 24(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 20(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 16(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 12(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 8(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 4(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl (%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_decrypt,.-.L_BF_decrypt_begin
+.globl BF_cbc_encrypt
+.type BF_cbc_encrypt,@function
+.align 16
+BF_cbc_encrypt:
+.L_BF_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ movl 48(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L000decrypt
+ andl $4294967288,%ebp
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ jz .L001encrypt_finish
+.L002encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L002encrypt_loop
+.L001encrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ call .L004PIC_point
+.L004PIC_point:
+ popl %edx
+ leal .L005cbc_enc_jmp_table-.L004PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L006ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L007ej6:
+ movb 5(%esi),%dh
+.L008ej5:
+ movb 4(%esi),%dl
+.L009ej4:
+ movl (%esi),%ecx
+ jmp .L010ejend
+.L011ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L012ej2:
+ movb 1(%esi),%ch
+.L013ej1:
+ movb (%esi),%cl
+.L010ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L003finish
+.L000decrypt:
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L014decrypt_finish
+.L015decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L015decrypt_loop
+.L014decrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L016dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L017dj6:
+ movb %dh,5(%edi)
+.L018dj5:
+ movb %dl,4(%edi)
+.L019dj4:
+ movl %ecx,(%edi)
+ jmp .L020djend
+.L021dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L022dj2:
+ movb %ch,1(%esi)
+.L023dj1:
+ movb %cl,(%esi)
+.L020djend:
+ jmp .L003finish
+.L003finish:
+ movl 60(%esp),%ecx
+ addl $24,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L005cbc_enc_jmp_table:
+.long 0
+.long .L013ej1-.L004PIC_point
+.long .L012ej2-.L004PIC_point
+.long .L011ej3-.L004PIC_point
+.long .L009ej4-.L004PIC_point
+.long .L008ej5-.L004PIC_point
+.long .L007ej6-.L004PIC_point
+.long .L006ej7-.L004PIC_point
+.align 64
+.size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin
diff --git a/crypto/bf/bf_skey.c b/crypto/bf/bf_skey.c
index 3673cde..3b0bca4 100644
--- a/crypto/bf/bf_skey.c
+++ b/crypto/bf/bf_skey.c
@@ -58,11 +58,19 @@
#include <stdio.h>
#include <string.h>
+#include <openssl/crypto.h>
#include <openssl/blowfish.h>
#include "bf_locl.h"
#include "bf_pi.h"
void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(BLOWFISH);
+ private_BF_set_key(key, len, data);
+ }
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#endif
{
int i;
BF_LONG *p,ri,in[2];
diff --git a/crypto/bf/blowfish.h b/crypto/bf/blowfish.h
index b97e76f..4b6c892 100644
--- a/crypto/bf/blowfish.h
+++ b/crypto/bf/blowfish.h
@@ -104,7 +104,9 @@ typedef struct bf_key_st
BF_LONG S[4*256];
} BF_KEY;
-
+#ifdef OPENSSL_FIPS
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data);
+#endif
void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
void BF_encrypt(BF_LONG *data,const BF_KEY *key);
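The bf_skey.c and blowfish.h changes use the FIPS wrapper idiom: under OPENSSL_FIPS the public BF_set_key() becomes a small stub that rejects non-approved use and forwards to private_BF_set_key(), which carries the real body; without OPENSSL_FIPS the public name compiles directly over the body. A self-contained sketch of the trick with toy names and a toy key schedule:

    typedef struct { unsigned char k[16]; } TOY_KEY;

    #define TOY_FIPS            /* build with the wrapper enabled */
    static int toy_fips_mode;   /* stand-in for the runtime FIPS switch */

    void private_toy_set_key(TOY_KEY *key, int len, const unsigned char *d);

    void toy_set_key(TOY_KEY *key, int len, const unsigned char *d)
    #ifdef TOY_FIPS
        {
        if (toy_fips_mode)
            return;             /* refuse non-approved cipher in FIPS mode */
        private_toy_set_key(key, len, d);
        }
    void private_toy_set_key(TOY_KEY *key, int len, const unsigned char *d)
    #endif
        {
        int i;
        for (i = 0; i < (int)sizeof(key->k); i++)
            key->k[i] = d[i % (len > 0 ? len : 1)];   /* toy schedule */
        }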
diff --git a/crypto/bio/b_sock.c b/crypto/bio/b_sock.c
index d47310d..470b1a0 100644
--- a/crypto/bio/b_sock.c
+++ b/crypto/bio/b_sock.c
@@ -629,7 +629,8 @@ int BIO_get_accept_socket(char *host, int bind_mode)
struct sockaddr_in6 sa_in6;
#endif
} server,client;
- int s=INVALID_SOCKET,cs,addrlen;
+ int s=INVALID_SOCKET,cs;
+ socklen_t addrlen;
unsigned char ip[4];
unsigned short port;
char *str=NULL,*e;
@@ -704,10 +705,10 @@ int BIO_get_accept_socket(char *host, int bind_mode)
if ((*p_getaddrinfo.f)(h,p,&hint,&res)) break;
- addrlen = res->ai_addrlen<=sizeof(server) ?
+ addrlen = res->ai_addrlen <= (socklen_t)sizeof(server) ?
res->ai_addrlen :
- sizeof(server);
- memcpy(&server, res->ai_addr, addrlen);
+ (socklen_t)sizeof(server);
+ memcpy(&server, res->ai_addr, (size_t)addrlen);
(*p_freeaddrinfo.f)(res);
goto again;
@@ -719,7 +720,7 @@ int BIO_get_accept_socket(char *host, int bind_mode)
memset((char *)&server,0,sizeof(server));
server.sa_in.sin_family=AF_INET;
server.sa_in.sin_port=htons(port);
- addrlen = sizeof(server.sa_in);
+ addrlen = (socklen_t)sizeof(server.sa_in);
if (h == NULL || strcmp(h,"*") == 0)
server.sa_in.sin_addr.s_addr=INADDR_ANY;
@@ -960,7 +961,6 @@ int BIO_set_tcp_ndelay(int s, int on)
#endif
return(ret == 0);
}
-#endif
int BIO_socket_nbio(int s, int mode)
{
@@ -973,3 +973,4 @@ int BIO_socket_nbio(int s, int mode)
#endif
return(ret == 0);
}
+#endif
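The switch from int to socklen_t in BIO_get_accept_socket() matters wherever the two types differ in size or signedness: POSIX accept() and getsockopt() take a socklen_t *, so passing an int * is not portable. A minimal sketch of the correct shape:

    #include <sys/socket.h>
    #include <netinet/in.h>

    /* Accept one connection, using the POSIX-mandated length type. */
    static int accept_one(int listen_fd)
    {
        struct sockaddr_in peer;
        socklen_t addrlen = (socklen_t)sizeof(peer);  /* not plain int */

        return accept(listen_fd, (struct sockaddr *)&peer, &addrlen);
    }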
diff --git a/crypto/bio/bf_buff.c b/crypto/bio/bf_buff.c
index c1fd75a..4b5a132 100644
--- a/crypto/bio/bf_buff.c
+++ b/crypto/bio/bf_buff.c
@@ -209,7 +209,7 @@ static int buffer_write(BIO *b, const char *in, int inl)
/* add to buffer and return */
if (i >= inl)
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,inl);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,inl);
ctx->obuf_len+=inl;
return(num+inl);
}
@@ -219,7 +219,7 @@ static int buffer_write(BIO *b, const char *in, int inl)
{
if (i > 0) /* lets fill it up if we can */
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,i);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,i);
in+=i;
inl-=i;
num+=i;
@@ -294,9 +294,9 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_C_GET_BUFF_NUM_LINES:
ret=0;
p1=ctx->ibuf;
- for (i=ctx->ibuf_off; i<ctx->ibuf_len; i++)
+ for (i=0; i<ctx->ibuf_len; i++)
{
- if (p1[i] == '\n') ret++;
+ if (p1[ctx->ibuf_off + i] == '\n') ret++;
}
break;
case BIO_CTRL_WPENDING:
@@ -399,17 +399,18 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
for (;;)
{
BIO_clear_retry_flags(b);
- if (ctx->obuf_len > ctx->obuf_off)
+ if (ctx->obuf_len > 0)
{
r=BIO_write(b->next_bio,
&(ctx->obuf[ctx->obuf_off]),
- ctx->obuf_len-ctx->obuf_off);
+ ctx->obuf_len);
#if 0
-fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len-ctx->obuf_off,r);
+fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len,r);
#endif
BIO_copy_next_retry(b);
if (r <= 0) return((long)r);
ctx->obuf_off+=r;
+ ctx->obuf_len-=r;
}
else
{
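The bf_buff.c fixes change what obuf_len means: it now counts only the bytes still pending rather than the end offset (the diagram added to bio.h further down in this patch spells this out), so appends must land at obuf_off+obuf_len and a flush of r bytes advances the offset and shrinks the length together. A standalone sketch of that bookkeeping:

    #include <string.h>

    struct outbuf { char buf[64]; int off; int len; };  /* off=consumed, len=pending */

    static int outbuf_append(struct outbuf *b, const char *in, int n)
    {
        if (b->off + b->len + n > (int)sizeof(b->buf))
            return 0;                                /* would overflow */
        memcpy(b->buf + b->off + b->len, in, n);     /* append after pending data */
        b->len += n;
        return n;
    }

    static void outbuf_flushed(struct outbuf *b, int r)
    {
        b->off += r;    /* r bytes were written downstream */
        b->len -= r;    /* that much less remains pending */
    }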
diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h
index 152802f..05699ab 100644
--- a/crypto/bio/bio.h
+++ b/crypto/bio/bio.h
@@ -68,6 +68,14 @@
#include <openssl/crypto.h>
+#ifndef OPENSSL_NO_SCTP
+# ifndef OPENSSL_SYS_VMS
+# include <stdint.h>
+# else
+# include <inttypes.h>
+# endif
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -95,6 +103,9 @@ extern "C" {
#define BIO_TYPE_BIO (19|0x0400) /* (half a) BIO pair */
#define BIO_TYPE_LINEBUFFER (20|0x0200) /* filter */
#define BIO_TYPE_DGRAM (21|0x0400|0x0100)
+#ifndef OPENSSL_NO_SCTP
+#define BIO_TYPE_DGRAM_SCTP (24|0x0400|0x0100)
+#endif
#define BIO_TYPE_ASN1 (22|0x0200) /* filter */
#define BIO_TYPE_COMP (23|0x0200) /* filter */
@@ -146,6 +157,7 @@ extern "C" {
/* #endif */
#define BIO_CTRL_DGRAM_QUERY_MTU 40 /* as kernel for current MTU */
+#define BIO_CTRL_DGRAM_GET_FALLBACK_MTU 47
#define BIO_CTRL_DGRAM_GET_MTU 41 /* get cached value for MTU */
#define BIO_CTRL_DGRAM_SET_MTU 42 /* set cached value for
* MTU. want to use this
@@ -161,7 +173,22 @@ extern "C" {
#define BIO_CTRL_DGRAM_SET_PEER 44 /* Destination for the data */
#define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT 45 /* Next DTLS handshake timeout to
- * adjust socket timeouts */
+ * adjust socket timeouts */
+
+#ifndef OPENSSL_NO_SCTP
+/* SCTP stuff */
+#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE 50
+#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY 51
+#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY 52
+#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD 53
+#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO 60
+#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO 61
+#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO 62
+#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO 63
+#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO 64
+#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO 65
+#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN 70
+#endif
/* modifiers */
#define BIO_FP_READ 0x02
@@ -306,6 +333,15 @@ DECLARE_STACK_OF(BIO)
typedef struct bio_f_buffer_ctx_struct
{
+ /* Buffers are setup like this:
+ *
+ * <---------------------- size ----------------------->
+ * +---------------------------------------------------+
+ * | consumed | remaining | free space |
+ * +---------------------------------------------------+
+ * <-- off --><------- len ------->
+ */
+
/* BIO *bio; */ /* this is now in the BIO struct */
int ibuf_size; /* how big is the input buffer */
int obuf_size; /* how big is the output buffer */
@@ -322,6 +358,34 @@ typedef struct bio_f_buffer_ctx_struct
/* Prefix and suffix callback in ASN1 BIO */
typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+#ifndef OPENSSL_NO_SCTP
+/* SCTP parameter structs */
+struct bio_dgram_sctp_sndinfo
+ {
+ uint16_t snd_sid;
+ uint16_t snd_flags;
+ uint32_t snd_ppid;
+ uint32_t snd_context;
+ };
+
+struct bio_dgram_sctp_rcvinfo
+ {
+ uint16_t rcv_sid;
+ uint16_t rcv_ssn;
+ uint16_t rcv_flags;
+ uint32_t rcv_ppid;
+ uint32_t rcv_tsn;
+ uint32_t rcv_cumtsn;
+ uint32_t rcv_context;
+ };
+
+struct bio_dgram_sctp_prinfo
+ {
+ uint16_t pr_policy;
+ uint32_t pr_value;
+ };
+#endif
+
/* connect BIO stuff */
#define BIO_CONN_S_BEFORE 1
#define BIO_CONN_S_GET_IP 2
@@ -619,6 +683,9 @@ BIO_METHOD *BIO_f_linebuffer(void);
BIO_METHOD *BIO_f_nbio_test(void);
#ifndef OPENSSL_NO_DGRAM
BIO_METHOD *BIO_s_datagram(void);
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void);
+#endif
#endif
/* BIO_METHOD *BIO_f_ber(void); */
@@ -661,6 +728,15 @@ int BIO_set_tcp_ndelay(int sock,int turn_on);
BIO *BIO_new_socket(int sock, int close_flag);
BIO *BIO_new_dgram(int fd, int close_flag);
+#ifndef OPENSSL_NO_SCTP
+BIO *BIO_new_dgram_sctp(int fd, int close_flag);
+int BIO_dgram_is_sctp(BIO *bio);
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context);
+int BIO_dgram_sctp_wait_for_dry(BIO *b);
+int BIO_dgram_sctp_msg_waiting(BIO *b);
+#endif
BIO *BIO_new_fd(int fd, int close_flag);
BIO *BIO_new_connect(char *host_port);
BIO *BIO_new_accept(char *host_port);
@@ -725,6 +801,7 @@ void ERR_load_BIO_strings(void);
#define BIO_F_BUFFER_CTRL 114
#define BIO_F_CONN_CTRL 127
#define BIO_F_CONN_STATE 115
+#define BIO_F_DGRAM_SCTP_READ 132
#define BIO_F_FILE_CTRL 116
#define BIO_F_FILE_READ 130
#define BIO_F_LINEBUFFER_CTRL 129
diff --git a/crypto/bio/bio_err.c b/crypto/bio/bio_err.c
index a224edd..0dbfbd8 100644
--- a/crypto/bio/bio_err.c
+++ b/crypto/bio/bio_err.c
@@ -1,6 +1,6 @@
/* crypto/bio/bio_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -95,6 +95,7 @@ static ERR_STRING_DATA BIO_str_functs[]=
{ERR_FUNC(BIO_F_BUFFER_CTRL), "BUFFER_CTRL"},
{ERR_FUNC(BIO_F_CONN_CTRL), "CONN_CTRL"},
{ERR_FUNC(BIO_F_CONN_STATE), "CONN_STATE"},
+{ERR_FUNC(BIO_F_DGRAM_SCTP_READ), "DGRAM_SCTP_READ"},
{ERR_FUNC(BIO_F_FILE_CTRL), "FILE_CTRL"},
{ERR_FUNC(BIO_F_FILE_READ), "FILE_READ"},
{ERR_FUNC(BIO_F_LINEBUFFER_CTRL), "LINEBUFFER_CTRL"},
diff --git a/crypto/bio/bio_lib.c b/crypto/bio/bio_lib.c
index e12bc3a..9c9646a 100644
--- a/crypto/bio/bio_lib.c
+++ b/crypto/bio/bio_lib.c
@@ -521,40 +521,40 @@ void BIO_free_all(BIO *bio)
BIO *BIO_dup_chain(BIO *in)
{
- BIO *ret=NULL,*eoc=NULL,*bio,*new;
+ BIO *ret=NULL,*eoc=NULL,*bio,*new_bio;
for (bio=in; bio != NULL; bio=bio->next_bio)
{
- if ((new=BIO_new(bio->method)) == NULL) goto err;
- new->callback=bio->callback;
- new->cb_arg=bio->cb_arg;
- new->init=bio->init;
- new->shutdown=bio->shutdown;
- new->flags=bio->flags;
+ if ((new_bio=BIO_new(bio->method)) == NULL) goto err;
+ new_bio->callback=bio->callback;
+ new_bio->cb_arg=bio->cb_arg;
+ new_bio->init=bio->init;
+ new_bio->shutdown=bio->shutdown;
+ new_bio->flags=bio->flags;
/* This will let SSL_s_sock() work with stdin/stdout */
- new->num=bio->num;
+ new_bio->num=bio->num;
- if (!BIO_dup_state(bio,(char *)new))
+ if (!BIO_dup_state(bio,(char *)new_bio))
{
- BIO_free(new);
+ BIO_free(new_bio);
goto err;
}
/* copy app data */
- if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new->ex_data,
+ if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new_bio->ex_data,
&bio->ex_data))
goto err;
if (ret == NULL)
{
- eoc=new;
+ eoc=new_bio;
ret=eoc;
}
else
{
- BIO_push(eoc,new);
- eoc=new;
+ BIO_push(eoc,new_bio);
+ eoc=new_bio;
}
}
return(ret);
diff --git a/crypto/bio/bss_bio.c b/crypto/bio/bss_bio.c
index 76bd48e..52ef0eb 100644
--- a/crypto/bio/bss_bio.c
+++ b/crypto/bio/bss_bio.c
@@ -277,10 +277,10 @@ static int bio_read(BIO *bio, char *buf, int size_)
*/
/* WARNING: The non-copying interface is largely untested as of yet
* and may contain bugs. */
-static ssize_t bio_nread0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nread0(BIO *bio, char **buf)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num;
+ ossl_ssize_t num;
BIO_clear_retry_flags(bio);
@@ -315,15 +315,15 @@ static ssize_t bio_nread0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num, available;
+ ossl_ssize_t num, available;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
available = bio_nread0(bio, buf);
if (num > available)
@@ -428,7 +428,7 @@ static int bio_write(BIO *bio, const char *buf, int num_)
* (example usage: bio_nwrite0(), write to buffer, bio_nwrite()
* or just bio_nwrite(), write to buffer)
*/
-static ssize_t bio_nwrite0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf)
{
struct bio_bio_st *b;
size_t num;
@@ -476,15 +476,15 @@ static ssize_t bio_nwrite0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b;
- ssize_t num, space;
+ ossl_ssize_t num, space;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
space = bio_nwrite0(bio, buf);
if (num > space)
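The ossl_ssize_t changes keep the clamp-before-cast pattern visible in bio_nread() and bio_nwrite(): an unsigned request is capped at SSIZE_MAX first, so the conversion to the signed type can never overflow. A sketch, with long standing in for the platform's signed size type:

    #include <limits.h>
    #include <stddef.h>

    #ifndef SSIZE_MAX
    #define SSIZE_MAX LONG_MAX   /* illustrative fallback only */
    #endif

    /* Cap an unsigned count so the signed cast is always in range. */
    static long clamp_request(size_t n)
    {
        return n > (size_t)SSIZE_MAX ? (long)SSIZE_MAX : (long)n;
    }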
diff --git a/crypto/bio/bss_dgram.c b/crypto/bio/bss_dgram.c
index 71ebe98..8990909 100644
--- a/crypto/bio/bss_dgram.c
+++ b/crypto/bio/bss_dgram.c
@@ -70,10 +70,27 @@
#include <sys/timeb.h>
#endif
-#ifdef OPENSSL_SYS_LINUX
+#ifndef OPENSSL_NO_SCTP
+#include
+#include
+#define OPENSSL_SCTP_DATA_CHUNK_TYPE 0x00
+#define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0
+#endif
+
+#if defined(OPENSSL_SYS_LINUX) && !defined(IP_MTU)
#define IP_MTU 14 /* linux is lame */
#endif
+#if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
+/* Standard definition causes type-punning problems. */
+#undef IN6_IS_ADDR_V4MAPPED
+#define s6_addr32 __u6_addr.__u6_addr32
+#define IN6_IS_ADDR_V4MAPPED(a) \
+ (((a)->s6_addr32[0] == 0) && \
+ ((a)->s6_addr32[1] == 0) && \
+ ((a)->s6_addr32[2] == htonl(0x0000ffff)))
+#endif
+
#ifdef WATT32
#define sock_write SockWrite /* Watt-32 uses same names */
#define sock_read SockRead
@@ -88,6 +105,18 @@ static int dgram_new(BIO *h);
static int dgram_free(BIO *data);
static int dgram_clear(BIO *bio);
+#ifndef OPENSSL_NO_SCTP
+static int dgram_sctp_write(BIO *h, const char *buf, int num);
+static int dgram_sctp_read(BIO *h, char *buf, int size);
+static int dgram_sctp_puts(BIO *h, const char *str);
+static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_sctp_new(BIO *h);
+static int dgram_sctp_free(BIO *data);
+#ifdef SCTP_AUTHENTICATION_EVENT
+static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp);
+#endif
+#endif
+
static int BIO_dgram_should_retry(int s);
static void get_current_time(struct timeval *t);
@@ -106,6 +135,22 @@ static BIO_METHOD methods_dgramp=
NULL,
};
+#ifndef OPENSSL_NO_SCTP
+static BIO_METHOD methods_dgramp_sctp=
+ {
+ BIO_TYPE_DGRAM_SCTP,
+ "datagram sctp socket",
+ dgram_sctp_write,
+ dgram_sctp_read,
+ dgram_sctp_puts,
+ NULL, /* dgram_gets, */
+ dgram_sctp_ctrl,
+ dgram_sctp_new,
+ dgram_sctp_free,
+ NULL,
+ };
+#endif
+
typedef struct bio_dgram_data_st
{
union {
@@ -122,6 +167,40 @@ typedef struct bio_dgram_data_st
struct timeval socket_timeout;
} bio_dgram_data;
+#ifndef OPENSSL_NO_SCTP
+typedef struct bio_dgram_sctp_save_message_st
+ {
+ BIO *bio;
+ char *data;
+ int length;
+ } bio_dgram_sctp_save_message;
+
+typedef struct bio_dgram_sctp_data_st
+ {
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sa_in;
+#if OPENSSL_USE_IPV6
+ struct sockaddr_in6 sa_in6;
+#endif
+ } peer;
+ unsigned int connected;
+ unsigned int _errno;
+ unsigned int mtu;
+ struct bio_dgram_sctp_sndinfo sndinfo;
+ struct bio_dgram_sctp_rcvinfo rcvinfo;
+ struct bio_dgram_sctp_prinfo prinfo;
+ void (*handle_notifications)(BIO *bio, void *context, void *buf);
+ void* notification_context;
+ int in_handshake;
+ int ccs_rcvd;
+ int ccs_sent;
+ int save_shutdown;
+ int peer_auth_tested;
+ bio_dgram_sctp_save_message saved_message;
+ } bio_dgram_sctp_data;
+#endif
+
BIO_METHOD *BIO_s_datagram(void)
{
return(&methods_dgramp);
@@ -186,7 +265,7 @@ static void dgram_adjust_rcv_timeout(BIO *b)
{
#if defined(SO_RCVTIMEO)
bio_dgram_data *data = (bio_dgram_data *)b->ptr;
- int sz = sizeof(int);
+ union { size_t s; int i; } sz = {0};
/* Is a timer active? */
if (data->next_timeout.tv_sec > 0 || data->next_timeout.tv_usec > 0)
@@ -196,8 +275,10 @@ static void dgram_adjust_rcv_timeout(BIO *b)
/* Read current socket timeout */
#ifdef OPENSSL_SYS_WINDOWS
int timeout;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); }
else
{
@@ -205,9 +286,12 @@ static void dgram_adjust_rcv_timeout(BIO *b)
data->socket_timeout.tv_usec = (timeout % 1000) * 1000;
}
#else
+ sz.i = sizeof(data->socket_timeout);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
&(data->socket_timeout), (void *)&sz) < 0)
{ perror("getsockopt"); }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ OPENSSL_assert(sz.s<=sizeof(data->socket_timeout));
#endif
/* Get current time */
@@ -376,11 +460,10 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
int *ip;
struct sockaddr *to = NULL;
bio_dgram_data *data = NULL;
-#if defined(IP_MTU_DISCOVER) || defined(IP_MTU)
- long sockopt_val = 0;
- unsigned int sockopt_len = 0;
-#endif
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
+ int sockopt_val = 0;
+ socklen_t sockopt_len; /* assume that a system supporting IP_MTU is
+ * modern enough to define socklen_t */
socklen_t addr_len;
union {
struct sockaddr sa;
@@ -462,7 +545,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
/* (Linux)kernel sets DF bit on outgoing IP packets */
case BIO_CTRL_DGRAM_MTU_DISCOVER:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DO)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -470,7 +553,6 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
break;
}
- sockopt_len = sizeof(sockopt_val);
switch (addr.sa.sa_family)
{
case AF_INET:
@@ -479,7 +561,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
&sockopt_val, sizeof(sockopt_val))) < 0)
perror("setsockopt");
break;
-#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER)
+#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DO)
case AF_INET6:
sockopt_val = IPV6_PMTUDISC_DO;
if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
@@ -496,7 +578,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
#endif
case BIO_CTRL_DGRAM_QUERY_MTU:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -547,6 +629,27 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
#endif
break;
+ case BIO_CTRL_DGRAM_GET_FALLBACK_MTU:
+ switch (data->peer.sa.sa_family)
+ {
+ case AF_INET:
+ ret = 576 - 20 - 8;
+ break;
+#if OPENSSL_USE_IPV6
+ case AF_INET6:
+#ifdef IN6_IS_ADDR_V4MAPPED
+ if (IN6_IS_ADDR_V4MAPPED(&data->peer.sa_in6.sin6_addr))
+ ret = 576 - 20 - 8;
+ else
+#endif
+ ret = 1280 - 40 - 8;
+ break;
+#endif
+ default:
+ ret = 576 - 20 - 8;
+ break;
+ }
+ break;
case BIO_CTRL_DGRAM_GET_MTU:
return data->mtu;
break;
@@ -637,12 +740,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_RECV_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -650,12 +756,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
#if defined(SO_SNDTIMEO)
@@ -675,12 +789,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_SEND_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -688,12 +805,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
case BIO_CTRL_DGRAM_GET_SEND_TIMER_EXP:
@@ -738,6 +863,910 @@ static int dgram_puts(BIO *bp, const char *str)
return(ret);
}
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void)
+ {
+ return(&methods_dgramp_sctp);
+ }
+
+BIO *BIO_new_dgram_sctp(int fd, int close_flag)
+ {
+ BIO *bio;
+ int ret, optval = 20000;
+ int auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunk auth;
+ struct sctp_authchunks *authchunks;
+ socklen_t sockopt_len;
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+#endif
+#endif
+
+ bio=BIO_new(BIO_s_datagram_sctp());
+ if (bio == NULL) return(NULL);
+ BIO_set_fd(bio,fd,close_flag);
+
+ /* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */
+ auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+ auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+
+ /* Test if activation was successful. When using accept(),
+ * SCTP-AUTH has to be activated for the listening socket
+ * already, otherwise the connected socket won't use it. */
+ sockopt_len = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(sockopt_len);
+ memset(authchunks, 0, sockopt_len);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+ p < (unsigned char*) authchunks + sockopt_len;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ OPENSSL_assert(auth_data);
+ OPENSSL_assert(auth_forward);
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_AUTHENTICATION_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(ret >= 0);
+#else
+ sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ event.sctp_authentication_event = 1;
+
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(ret >= 0);
+#endif
+#endif
+
+ /* Disable partial delivery by setting the min size
+ * larger than the max record size of 2^14 + 2048 + 13
+ */
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval, sizeof(optval));
+ OPENSSL_assert(ret >= 0);
+
+ return(bio);
+ }
+
+int BIO_dgram_is_sctp(BIO *bio)
+ {
+ return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP);
+ }
+
+static int dgram_sctp_new(BIO *bi)
+ {
+ bio_dgram_sctp_data *data = NULL;
+
+ bi->init=0;
+ bi->num=0;
+ data = OPENSSL_malloc(sizeof(bio_dgram_sctp_data));
+ if (data == NULL)
+ return 0;
+ memset(data, 0x00, sizeof(bio_dgram_sctp_data));
+#ifdef SCTP_PR_SCTP_NONE
+ data->prinfo.pr_policy = SCTP_PR_SCTP_NONE;
+#endif
+ bi->ptr = data;
+
+ bi->flags=0;
+ return(1);
+ }
+
+static int dgram_sctp_free(BIO *a)
+ {
+ bio_dgram_sctp_data *data;
+
+ if (a == NULL) return(0);
+ if ( ! dgram_clear(a))
+ return 0;
+
+ data = (bio_dgram_sctp_data *)a->ptr;
+ if(data != NULL) OPENSSL_free(data);
+
+ return(1);
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp)
+ {
+ int ret;
+ struct sctp_authkey_event* authkeyevent = &snp->sn_auth_event;
+
+ if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY)
+ {
+ struct sctp_authkeyid authkeyid;
+
+ /* delete key */
+ authkeyid.scact_keynumber = authkeyevent->auth_keynumber;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ }
+ }
+#endif
+
+static int dgram_sctp_read(BIO *b, char *out, int outl)
+ {
+ int ret = 0, n = 0, i, optval;
+ socklen_t optlen;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ union sctp_notification *snp;
+ struct msghdr msg;
+ struct iovec iov;
+ struct cmsghdr *cmsg;
+ char cmsgbuf[512];
+
+ if (out != NULL)
+ {
+ clear_socket_error();
+
+ do
+ {
+ memset(&data->rcvinfo, 0x00, sizeof(struct bio_dgram_sctp_rcvinfo));
+ iov.iov_base = out;
+ iov.iov_len = outl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = 512;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (msg.msg_controllen > 0)
+ {
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+ {
+ if (cmsg->cmsg_level != IPPROTO_SCTP)
+ continue;
+#ifdef SCTP_RCVINFO
+ if (cmsg->cmsg_type == SCTP_RCVINFO)
+ {
+ struct sctp_rcvinfo *rcvinfo;
+
+ rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = rcvinfo->rcv_sid;
+ data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn;
+ data->rcvinfo.rcv_flags = rcvinfo->rcv_flags;
+ data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid;
+ data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn;
+ data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn;
+ data->rcvinfo.rcv_context = rcvinfo->rcv_context;
+ }
+#endif
+#ifdef SCTP_SNDRCV
+ if (cmsg->cmsg_type == SCTP_SNDRCV)
+ {
+ struct sctp_sndrcvinfo *sndrcvinfo;
+
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream;
+ data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn;
+ data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags;
+ data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid;
+ data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn;
+ data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn;
+ data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context;
+ }
+#endif
+ }
+ }
+
+ if (n <= 0)
+ {
+ if (n < 0)
+ ret = n;
+ break;
+ }
+
+ if (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ snp = (union sctp_notification*) out;
+ if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ /* If a message has been delayed until the socket
+ * is dry, it can be sent now.
+ */
+ if (data->saved_message.length > 0)
+ {
+ dgram_sctp_write(data->saved_message.bio, data->saved_message.data,
+ data->saved_message.length);
+ OPENSSL_free(data->saved_message.data);
+ data->saved_message.length = 0;
+ }
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(i >= 0);
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ OPENSSL_assert(i >= 0);
+
+ event.sctp_sender_dry_event = 0;
+
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(i >= 0);
+#endif
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) out);
+
+ memset(out, 0, outl);
+ }
+ else
+ ret += n;
+ }
+ while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR) && (ret < outl));
+
+ if (ret > 0 && !(msg.msg_flags & MSG_EOR))
+ {
+ /* Partial message read, this should never happen! */
+
+ /* The buffer was too small, this means the peer sent
+ * a message that was larger than allowed. */
+ if (ret == outl)
+ return -1;
+
+ /* Test if socket buffer can handle max record
+ * size (2^14 + 2048 + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+		/* Test that the SCTP partial delivery point is not
+		 * below the max record size (2^14 + 2048 + 13)
+		 */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+ &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+ /* Partially delivered notification??? Probably a bug.... */
+ OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Everything seems ok till now, so it's most likely
+ * a message dropped by PR-SCTP.
+ */
+ memset(out, 0, outl);
+ BIO_set_retry_read(b);
+ return -1;
+ }
+
+ BIO_clear_retry_flags(b);
+ if (ret < 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_read(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+
+ /* Test if peer uses SCTP-AUTH before continuing */
+ if (!data->peer_auth_tested)
+ {
+ int ii, auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunks *authchunks;
+
+ optlen = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(optlen);
+		memset(authchunks, 0, optlen);
+ ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS, authchunks, &optlen);
+ OPENSSL_assert(ii >= 0);
+
+ for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+ p < (unsigned char*) authchunks + optlen;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ if (!auth_data || !auth_forward)
+ {
+ BIOerr(BIO_F_DGRAM_SCTP_READ,BIO_R_CONNECT_ERROR);
+ return -1;
+ }
+
+ data->peer_auth_tested = 1;
+ }
+ }
+ return(ret);
+ }
+
+static int dgram_sctp_write(BIO *b, const char *in, int inl)
+ {
+ int ret;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo);
+ struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo);
+ struct bio_dgram_sctp_sndinfo handshake_sinfo;
+ struct iovec iov[1];
+ struct msghdr msg;
+ struct cmsghdr *cmsg;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) + CMSG_SPACE(sizeof(struct sctp_prinfo))];
+ struct sctp_sndinfo *sndinfo;
+ struct sctp_prinfo *prinfo;
+#else
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct sctp_sndrcvinfo *sndrcvinfo;
+#endif
+
+ clear_socket_error();
+
+	/* If we're sending anything other than application data
+	 * (TLS content type 23), disable all user parameters and flags.
+	 */
+ if (in[0] != 23) {
+ memset(&handshake_sinfo, 0x00, sizeof(struct bio_dgram_sctp_sndinfo));
+#ifdef SCTP_SACK_IMMEDIATELY
+ handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY;
+#endif
+ sinfo = &handshake_sinfo;
+ }
+
+ /* If we have to send a shutdown alert message and the
+ * socket is not dry yet, we have to save it and send it
+ * as soon as the socket gets dry.
+ */
+ if (data->save_shutdown && !BIO_dgram_sctp_wait_for_dry(b))
+ {
+ data->saved_message.bio = b;
+ data->saved_message.length = inl;
+ data->saved_message.data = OPENSSL_malloc(inl);
+ memcpy(data->saved_message.data, in, inl);
+ return inl;
+ }
+
+ iov[0].iov_base = (char *)in;
+ iov[0].iov_len = inl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = (caddr_t)cmsgbuf;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+ sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+ memset(sndinfo, 0, sizeof(struct sctp_sndinfo));
+ sndinfo->snd_sid = sinfo->snd_sid;
+ sndinfo->snd_flags = sinfo->snd_flags;
+ sndinfo->snd_ppid = sinfo->snd_ppid;
+ sndinfo->snd_context = sinfo->snd_context;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo));
+
+ cmsg = (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_PRINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo));
+ prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg);
+ memset(prinfo, 0, sizeof(struct sctp_prinfo));
+ prinfo->pr_policy = pinfo->pr_policy;
+ prinfo->pr_value = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo));
+#else
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo->sinfo_stream = sinfo->snd_sid;
+ sndrcvinfo->sinfo_flags = sinfo->snd_flags;
+#ifdef __FreeBSD__
+ sndrcvinfo->sinfo_flags |= pinfo->pr_policy;
+#endif
+ sndrcvinfo->sinfo_ppid = sinfo->snd_ppid;
+ sndrcvinfo->sinfo_context = sinfo->snd_context;
+ sndrcvinfo->sinfo_timetolive = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo));
+#endif
+
+ ret = sendmsg(b->num, &msg, 0);
+
+ BIO_clear_retry_flags(b);
+ if (ret <= 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_write(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+ return(ret);
+ }
+
+static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr)
+ {
+ long ret=1;
+ bio_dgram_sctp_data *data = NULL;
+ socklen_t sockopt_len = 0;
+ struct sctp_authkeyid authkeyid;
+ struct sctp_authkey *authkey;
+
+ data = (bio_dgram_sctp_data *)b->ptr;
+
+ switch (cmd)
+ {
+ case BIO_CTRL_DGRAM_QUERY_MTU:
+		/* Set to the maximum (2^14) and ignore user input
+		 * to enable transport protocol fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_MTU:
+		/* Set to the maximum (2^14) and ignore input
+		 * to enable transport protocol fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_CONNECTED:
+ case BIO_CTRL_DGRAM_CONNECT:
+		/* Always returns -1. */
+ ret = -1;
+ break;
+ case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+		/* SCTP doesn't need the DTLS timer.
+		 * Always returns 1.
+		 */
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+ if (num > 0)
+ data->in_handshake = 1;
+ else
+ data->in_handshake = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY, &data->in_handshake, sizeof(int));
+ break;
+ case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY:
+ /* New shared key for SCTP AUTH.
+ * Returns 0 on success, -1 otherwise.
+ */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Add new key */
+ sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t);
+ authkey = OPENSSL_malloc(sockopt_len);
+ memset(authkey, 0x00, sockopt_len);
+ authkey->sca_keynumber = authkeyid.scact_keynumber + 1;
+#ifndef __FreeBSD__
+ /* This field is missing in FreeBSD 8.2 and earlier,
+ * and FreeBSD 8.3 and higher work without it.
+ */
+ authkey->sca_keylength = 64;
+#endif
+ memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t));
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len);
+ if (ret < 0) break;
+
+ /* Reset active key */
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ break;
+ case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Set active key */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ /* CCS has been sent, so remember that and fall through
+ * to check if we need to deactivate an old key
+ */
+ data->ccs_sent = 1;
+
+ case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Has this command really been called or is this just a fall-through? */
+ if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD)
+ data->ccs_rcvd = 1;
+
+		/* CCS has been both received and sent, so deactivate an old key */
+ if (data->ccs_rcvd == 1 && data->ccs_sent == 1)
+ {
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Deactivate key or delete second last key if
+ * SCTP_AUTHENTICATION_EVENT is not available.
+ */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+#ifdef SCTP_AUTH_DEACTIVATE_KEY
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
+ &authkeyid, sockopt_len);
+ if (ret < 0) break;
+#endif
+#ifndef SCTP_AUTHENTICATION_EVENT
+ if (authkeyid.scact_keynumber > 0)
+ {
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+ }
+#endif
+
+ data->ccs_rcvd = 0;
+ data->ccs_sent = 0;
+ }
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(ptr, &(data->sndinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(&(data->sndinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(ptr, &data->rcvinfo, num);
+
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(&(data->rcvinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(ptr, &(data->prinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(&(data->prinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN:
+		/* Always returns 1. */
+ if (num > 0)
+ data->save_shutdown = 1;
+ else
+ data->save_shutdown = 0;
+ break;
+
+ default:
+		/* Pass to the default ctrl function to
+		 * process commands that are not SCTP-specific.
+		 */
+ ret=dgram_ctrl(b, cmd, num, ptr);
+ break;
+ }
+ return(ret);
+ }
+
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context)
+ {
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+ if (handle_notifications != NULL)
+ {
+ data->handle_notifications = handle_notifications;
+ data->notification_context = context;
+ }
+ else
+ return -1;
+
+ return 0;
+ }
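+
+/* Usage sketch (illustrative only): register an application-level
+ * handler that is invoked for every SCTP notification the BIO reads.
+ * "my_handle_notification" and "my_ctx" are hypothetical caller names.
+ *
+ *	static void my_handle_notification(BIO *bio, void *ctx, void *buf)
+ *		{
+ *		union sctp_notification *snp = buf;
+ *		// inspect snp->sn_header.sn_type here
+ *		}
+ *
+ *	BIO_dgram_sctp_notification_cb(bio, my_handle_notification, my_ctx);
+ */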
+
+int BIO_dgram_sctp_wait_for_dry(BIO *b)
+{
+ int is_dry = 0;
+ int n, sockflags, ret;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* set sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 1;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+
+ /* peek for notification */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return 0;
+ }
+
+ /* if we find a notification, process it and try again if necessary */
+ while (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, 0);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+
+ if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+ is_dry = 1;
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+
+ /* found notification, peek again */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+		/* if we have already seen the dry event, don't wait */
+ if (is_dry)
+ {
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ }
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+
+ if (is_dry)
+ {
+ fcntl(b->num, F_SETFL, sockflags);
+ }
+
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+ }
+
+ /* read anything else */
+ return is_dry;
+}
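+
+/* Usage sketch (illustrative only): poll until the association is dry,
+ * e.g. before deleting an old SCTP-AUTH key.  On a non-blocking socket
+ * a return of 0 simply means "not dry yet"; -1 indicates a socket error.
+ *
+ *	int dry;
+ *	while ((dry = BIO_dgram_sctp_wait_for_dry(bio)) == 0)
+ *		; // retry, possibly after select()/poll() on the socket
+ *	if (dry < 0)
+ *		// handle error
+ */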
+
+int BIO_dgram_sctp_msg_waiting(BIO *b)
+ {
+ int n, sockflags;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* Check if there are any messages waiting to be read */
+ do
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ fcntl(b->num, F_SETFL, sockflags);
+
+ /* if notification, process and try again */
+ if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION))
+ {
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+ }
+
+ } while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Return 1 if there is a message to be read, return 0 otherwise. */
+ if (n > 0)
+ return 1;
+ else
+ return 0;
+ }
+
+static int dgram_sctp_puts(BIO *bp, const char *str)
+ {
+ int n,ret;
+
+ n=strlen(str);
+ ret=dgram_sctp_write(bp,str,n);
+ return(ret);
+ }
+#endif
+
static int BIO_dgram_should_retry(int i)
{
int err;
diff --git a/crypto/bn/asm/armv4-gf2m.S b/crypto/bn/asm/armv4-gf2m.S
new file mode 100644
index 0000000..038f086
--- /dev/null
+++ b/crypto/bn/asm/armv4-gf2m.S
@@ -0,0 +1,213 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	d2,d16,#8		@ q1-q3 are shifted copies of a
+ vmull.p8 q0,d16,d17 @ a·bb
+ vshl.u64 d4,d16,#16
+ vmull.p8 q1,d2,d17 @ a<<8·bb
+ vshl.u64 d6,d16,#24
+ vmull.p8 q2,d4,d17 @ a<<16·bb
+ vshr.u64 d2,#8
+ vmull.p8 q3,d6,d17 @ a<<24·bb
+ vshl.u64 d3,#24
+ veor d0,d2
+ vshr.u64 d4,#16
+ veor d0,d3
+ vshl.u64 d5,#16
+ veor d0,d4
+ vshr.u64 d6,#24
+ veor d0,d5
+ vshl.u64 d7,#8
+ veor d0,d6
+ veor d0,d7
+ .word 0xe12fff1e
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
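+@ mul_1x1_ialu: carry-less 32x32->64-bit multiply (lo in r5, hi in r4).
+@ An 8-entry table of small multiples of a (0, a1, a2, a1^a2, a4, ...)
+@ is built on the stack and indexed by successive 3-bit windows of b;
+@ the top two bits of a, masked off so that the shifted table entries
+@ fit in one word, are folded back in by the conditional "eorne" pairs.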
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov r4,#0
+ bic r5,r1,#3<<30 @ a1=a&0x3fffffff
+ str r4,[sp,#0] @ tab[0]=0
+ add r6,r5,r5 @ a2=a1<<1
+ str r5,[sp,#4] @ tab[1]=a1
+ eor r7,r5,r6 @ a1^a2
+ str r6,[sp,#8] @ tab[2]=a2
+ mov r8,r5,lsl#2 @ a4=a1<<2
+ str r7,[sp,#12] @ tab[3]=a1^a2
+ eor r9,r5,r8 @ a1^a4
+ str r8,[sp,#16] @ tab[4]=a4
+ eor r4,r6,r8 @ a2^a4
+ str r9,[sp,#20] @ tab[5]=a1^a4
+ eor r7,r7,r8 @ a1^a2^a4
+ str r4,[sp,#24] @ tab[6]=a2^a4
+ and r8,r12,r0,lsl#2
+ str r7,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and r9,r12,r0,lsr#1
+ ldr r5,[sp,r8] @ tab[b & 0x7]
+ and r8,r12,r0,lsr#4
+ ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
+ and r9,r12,r0,lsr#7
+ ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
+ eor r5,r5,r7,lsl#3 @ stall
+ mov r4,r7,lsr#29
+ ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
+
+ and r8,r12,r0,lsr#10
+ eor r5,r5,r6,lsl#6
+ eor r4,r4,r6,lsr#26
+ ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
+
+ and r9,r12,r0,lsr#13
+ eor r5,r5,r7,lsl#9
+ eor r4,r4,r7,lsr#23
+ ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
+
+ and r8,r12,r0,lsr#16
+ eor r5,r5,r6,lsl#12
+ eor r4,r4,r6,lsr#20
+ ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
+
+ and r9,r12,r0,lsr#19
+ eor r5,r5,r7,lsl#15
+ eor r4,r4,r7,lsr#17
+ ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
+
+ and r8,r12,r0,lsr#22
+ eor r5,r5,r6,lsl#18
+ eor r4,r4,r6,lsr#14
+ ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
+
+ and r9,r12,r0,lsr#25
+ eor r5,r5,r7,lsl#21
+ eor r4,r4,r7,lsr#11
+ ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
+
+ tst r1,#1<<30
+ and r8,r12,r0,lsr#28
+ eor r5,r5,r6,lsl#24
+ eor r4,r4,r6,lsr#8
+ ldr r6,[sp,r8] @ tab[b >> 30 ]
+
+ eorne r5,r5,r0,lsl#30
+ eorne r4,r4,r0,lsr#2
+ tst r1,#1<<31
+ eor r5,r5,r7,lsl#27
+ eor r4,r4,r7,lsr#5
+ eorne r5,r5,r0,lsl#31
+ eorne r4,r4,r0,lsr#1
+ eor r5,r5,r6,lsl#30
+ eor r4,r4,r6,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
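+@ void bn_GF2m_mul_2x2(BN_ULONG *r,
+@                      BN_ULONG a1, BN_ULONG a0,
+@                      BN_ULONG b1, BN_ULONG b0);  @ r[3..0]=a1a0·b1b0
+@ On entry: r0=r, r1=a1, r2=a0, r3=b1; b0 is passed on the stack.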
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor d18,d18
+ vmov.32 d19,r3,r3 @ two copies of b1
+ vmov.32 d18[0],r1 @ a1
+
+ veor d20,d20
+ vld1.32 d21[],[sp,:32] @ two copies of b0
+ vmov.32 d20[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,d18
+ vmov d17,d19
+ bl mul_1x1_neon @ a1·b1
+ vmov d22,d0
+
+ vmov d16,d20
+ vmov d17,d21
+ bl mul_1x1_neon @ a0·b0
+ vmov d23,d0
+
+ veor d16,d20,d18
+ veor d17,d21,d19
+ veor d20,d23,d22
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,d20 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor d23,d1
+ veor d22,d0
+ vst1.32 {d23[0]},[r0,:32]!
+ vst1.32 {d23[1]},[r0,:32]!
+ vst1.32 {d22[0]},[r0,:32]!
+ vst1.32 {d22[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+ stmdb sp!,{r4-r10,lr}
+ mov r10,r0 @ reassign 1st argument
+ mov r0,r3 @ r0=b1
+ ldr r3,[sp,#32] @ load b0
+ mov r12,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str r5,[r10,#8]
+ str r4,[r10,#12]
+
+ eor r0,r0,r3 @ flip b0 and b1
+ eor r1,r1,r2 @ flip a0 and a1
+ eor r3,r3,r0
+ eor r2,r2,r1
+ eor r0,r0,r3
+ eor r1,r1,r2
+ bl mul_1x1_ialu @ a0·b0
+ str r5,[r10]
+ str r4,[r10,#4]
+
+ eor r1,r1,r2
+ eor r0,r0,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+ ldmia r10,{r6-r9}
+ eor r5,r5,r4
+ eor r4,r4,r7
+ eor r5,r5,r6
+ eor r4,r4,r8
+ eor r5,r5,r9
+ eor r4,r4,r9
+ str r4,[r10,#8]
+ eor r5,r5,r4
+ add sp,sp,#32 @ destroy tab[8]
+ str r5,[r10,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by "
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 0000000..22ad1f8
--- /dev/null
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,278 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... Except that it has two code paths: pure
+# integer code suitable for any ARMv4 and later CPU and NEON code
+# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
+# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
+# faster than compiler-generated code. For ECDH and ECDSA verify (but
+# not for ECDSA sign) it means 25%-45% improvement depending on key
+# length, more for longer keys. Even though NEON 1x1 multiplication
+# runs in even fewer cycles, ~30, the improvement is measurable only on
+# longer keys. One has to optimize code elsewhere to get NEON glow...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
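+# e.g. Dlo("q1") is "d2" and Dhi("q1") is "d3": each NEON quad register
+# qN aliases the doubleword pair d(2N) (low half) and d(2N+1) (high half).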
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are shifted copies of $a
+ vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
+ vshl.u64 `&Dlo("q2")`,d16,#16
+ vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
+ vshl.u64 `&Dlo("q3")`,d16,#24
+ vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
+ vshr.u64 `&Dlo("q1")`,#8
+ vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
+ vshl.u64 `&Dhi("q1")`,#24
+ veor d0,`&Dlo("q1")`
+ vshr.u64 `&Dlo("q2")`,#16
+ veor d0,`&Dhi("q1")`
+ vshl.u64 `&Dhi("q2")`,#16
+ veor d0,`&Dlo("q2")`
+ vshr.u64 `&Dlo("q3")`,#24
+ veor d0,`&Dhi("q2")`
+ vshl.u64 `&Dhi("q3")`,#8
+ veor d0,`&Dlo("q3")`
+ veor d0,`&Dhi("q3")`
+ bx lr
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
+
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov $a0,#0
+ bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
+ str $a0,[sp,#0] @ tab[0]=0
+ add $a2,$a1,$a1 @ a2=a1<<1
+ str $a1,[sp,#4] @ tab[1]=a1
+ eor $a12,$a1,$a2 @ a1^a2
+ str $a2,[sp,#8] @ tab[2]=a2
+ mov $a4,$a1,lsl#2 @ a4=a1<<2
+ str $a12,[sp,#12] @ tab[3]=a1^a2
+ eor $a14,$a1,$a4 @ a1^a4
+ str $a4,[sp,#16] @ tab[4]=a4
+ eor $a0,$a2,$a4 @ a2^a4
+ str $a14,[sp,#20] @ tab[5]=a1^a4
+ eor $a12,$a12,$a4 @ a1^a2^a4
+ str $a0,[sp,#24] @ tab[6]=a2^a4
+ and $i0,$mask,$b,lsl#2
+ str $a12,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and $i1,$mask,$b,lsr#1
+ ldr $lo,[sp,$i0] @ tab[b & 0x7]
+ and $i0,$mask,$b,lsr#4
+ ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
+ and $i1,$mask,$b,lsr#7
+ ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
+ eor $lo,$lo,$t1,lsl#3 @ stall
+ mov $hi,$t1,lsr#29
+ ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
+
+ and $i0,$mask,$b,lsr#10
+ eor $lo,$lo,$t0,lsl#6
+ eor $hi,$hi,$t0,lsr#26
+ ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
+
+ and $i1,$mask,$b,lsr#13
+ eor $lo,$lo,$t1,lsl#9
+ eor $hi,$hi,$t1,lsr#23
+ ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
+
+ and $i0,$mask,$b,lsr#16
+ eor $lo,$lo,$t0,lsl#12
+ eor $hi,$hi,$t0,lsr#20
+ ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
+
+ and $i1,$mask,$b,lsr#19
+ eor $lo,$lo,$t1,lsl#15
+ eor $hi,$hi,$t1,lsr#17
+ ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
+
+ and $i0,$mask,$b,lsr#22
+ eor $lo,$lo,$t0,lsl#18
+ eor $hi,$hi,$t0,lsr#14
+ ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
+
+ and $i1,$mask,$b,lsr#25
+ eor $lo,$lo,$t1,lsl#21
+ eor $hi,$hi,$t1,lsr#11
+ ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
+
+ tst $a,#1<<30
+ and $i0,$mask,$b,lsr#28
+ eor $lo,$lo,$t0,lsl#24
+ eor $hi,$hi,$t0,lsr#8
+ ldr $t0,[sp,$i0] @ tab[b >> 30 ]
+
+ eorne $lo,$lo,$b,lsl#30
+ eorne $hi,$hi,$b,lsr#2
+ tst $a,#1<<31
+ eor $lo,$lo,$t1,lsl#27
+ eor $hi,$hi,$t1,lsr#5
+ eorne $lo,$lo,$b,lsl#31
+ eorne $hi,$hi,$b,lsr#1
+ eor $lo,$lo,$t0,lsl#30
+ eor $hi,$hi,$t0,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void bn_GF2m_mul_2x2(BN_ULONG *r,
+# BN_ULONG a1,BN_ULONG a0,
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+
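+# The code below performs one level of Karatsuba over GF(2)[x]:
+#
+#	(a1·x^32 + a0)·(b1·x^32 + b0) =
+#	      a1·b1·x^64
+#	    + ((a0+a1)·(b0+b1) - a0·b0 - a1·b1)·x^32
+#	    + a0·b0
+#
+# Over GF(2), + and - are both XOR and carry-less, so only three 1x1
+# multiplications are needed.
+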
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+
+$code.=<<___;
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor $A1,$A1
+ vmov.32 $B1,r3,r3 @ two copies of b1
+ vmov.32 ${A1}[0],r1 @ a1
+
+ veor $A0,$A0
+ vld1.32 ${B0}[],[sp,:32] @ two copies of b0
+ vmov.32 ${A0}[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,$A1
+ vmov d17,$B1
+ bl mul_1x1_neon @ a1·b1
+ vmov $A1B1,d0
+
+ vmov d16,$A0
+ vmov d17,$B0
+ bl mul_1x1_neon @ a0·b0
+ vmov $A0B0,d0
+
+ veor d16,$A0,$A1
+ veor d17,$B0,$B1
+ veor $A0,$A0B0,$A1B1
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor $A0B0,d1
+ veor $A1B1,d0
+ vst1.32 {${A0B0}[0]},[r0,:32]!
+ vst1.32 {${A0B0}[1]},[r0,:32]!
+ vst1.32 {${A1B1}[0]},[r0,:32]!
+ vst1.32 {${A1B1}[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+___
+$ret="r10"; # reassigned 1st argument
+$code.=<<___;
+ stmdb sp!,{r4-r10,lr}
+ mov $ret,r0 @ reassign 1st argument
+ mov $b,r3 @ $b=b1
+ ldr r3,[sp,#32] @ load b0
+ mov $mask,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str $lo,[$ret,#8]
+ str $hi,[$ret,#12]
+
+ eor $b,$b,r3 @ flip b0 and b1
+ eor $a,$a,r2 @ flip a0 and a1
+ eor r3,r3,$b
+ eor r2,r2,$a
+ eor $b,$b,r3
+ eor $a,$a,r2
+ bl mul_1x1_ialu @ a0·b0
+ str $lo,[$ret]
+ str $hi,[$ret,#4]
+
+ eor $a,$a,r2
+ eor $b,$b,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+ ldmia $ret,{@r[0]-@r[3]}
+ eor $lo,$lo,$hi
+ eor $hi,$hi,@r[1]
+ eor $lo,$lo,@r[0]
+ eor $hi,$hi,@r[2]
+ eor $lo,$lo,@r[3]
+ eor $hi,$hi,@r[3]
+ str $hi,[$ret,#8]
+ eor $lo,$lo,$hi
+ add sp,sp,#32 @ destroy tab[8]
+ str $lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by "
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/crypto/bn/asm/armv4-mont.s b/crypto/bn/asm/armv4-mont.S
similarity index 100%
rename from crypto/bn/asm/armv4-mont.s
rename to crypto/bn/asm/armv4-mont.S
index 0488455..64c220b 100644
--- a/crypto/bn/asm/armv4-mont.s
+++ b/crypto/bn/asm/armv4-mont.S
@@ -38,9 +38,9 @@ bn_mul_mont:
.L1st:
ldr r5,[r1],#4 @ ap[j],ap++
mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[0]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adds r12,r12,r10
@@ -50,21 +50,21 @@ bn_mul_mont:
bne .L1st
adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
.Louter:
sub r7,r0,sp @ "original" r0-1 value
sub r1,r1,r7 @ "rewind" ap to &ap[1]
- sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r5,[r1,#-4] @ ap[0]
- ldr r6,[r3,#-4] @ np[0]
ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
ldr r7,[sp,#4] @ tp[1]
mov r11,#0
@@ -78,13 +78,13 @@ bn_mul_mont:
.Linner:
ldr r5,[r1],#4 @ ap[j],ap++
adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[i]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
- ldr r7,[r4,#8] @ tp[j+1]
adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
@@ -93,13 +93,13 @@ bn_mul_mont:
adds r12,r12,r11
mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adds r12,r12,r7
- adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
cmp r4,r7
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index 14e0d2d..f78a8b5 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -23,6 +23,9 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
@@ -89,9 +92,9 @@
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
@@ -101,21 +104,21 @@
bne .L1st
adds $nlo,$nlo,$ahi
+ ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
- sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $bi,[$tp,#4]! @ *(++bp)
+ sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
- ldr $nj,[$np,#-4] @ np[0]
ldr $alo,[sp] @ tp[0]
+ ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
@@ -129,13 +132,13 @@
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
- ldr $tj,[$tp,#8] @ tp[j+1]
adc $ahi,$ahi,#0
+ ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
@@ -144,13 +147,13 @@
adds $nlo,$nlo,$ahi
mov $nhi,#0
+ ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
- adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
ldr $tj,[$_bpend] @ restore &bp[num]
+ adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
diff --git a/crypto/bn/asm/bn-586.S b/crypto/bn/asm/bn-586.S
new file mode 100644
index 0000000..fe873ce
--- /dev/null
+++ b/crypto/bn/asm/bn-586.S
@@ -0,0 +1,1384 @@
+.file "crypto/bn/asm/bn-586.s"
+.text
+.globl bn_mul_add_words
+.type bn_mul_add_words,@function
+.align 16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 28(%esp),%ecx
+ movl 24(%esp),%ebx
+ andl $4294967288,%ecx
+ movl 32(%esp),%ebp
+ pushl %ecx
+ jz .L000maw_finish
+.align 16
+.L001maw_loop:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 28(%edi),%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ subl $8,%ecx
+ leal 32(%ebx),%ebx
+ leal 32(%edi),%edi
+ jnz .L001maw_loop
+.L000maw_finish:
+ movl 32(%esp),%ecx
+ andl $7,%ecx
+ jnz .L002maw_finish2
+ jmp .L003maw_end
+.L002maw_finish2:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L003maw_end:
+ movl %esi,%eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl bn_mul_words
+.type bn_mul_words,@function
+.align 16
+bn_mul_words:
+.L_bn_mul_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ebp
+ movl 32(%esp),%ecx
+ andl $4294967288,%ebp
+ jz .L004mw_finish
+.L005mw_loop:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ addl $32,%ebx
+ addl $32,%edi
+ subl $8,%ebp
+ jz .L004mw_finish
+ jmp .L005mw_loop
+.L004mw_finish:
+ movl 28(%esp),%ebp
+ andl $7,%ebp
+ jnz .L006mw_finish2
+ jmp .L007mw_end
+.L006mw_finish2:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L007mw_end:
+ movl %esi,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_words,.-.L_bn_mul_words_begin
+.globl bn_sqr_words
+.type bn_sqr_words,@function
+.align 16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%ebx
+ andl $4294967288,%ebx
+ jz .L008sw_finish
+.L009sw_loop:
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ movl %edx,12(%esi)
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ movl %edx,20(%esi)
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ movl %edx,28(%esi)
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ movl %edx,36(%esi)
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ movl %edx,44(%esi)
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+
+ movl 28(%edi),%eax
+ mull %eax
+ movl %eax,56(%esi)
+ movl %edx,60(%esi)
+
+ addl $32,%edi
+ addl $64,%esi
+ subl $8,%ebx
+ jnz .L009sw_loop
+.L008sw_finish:
+ movl 28(%esp),%ebx
+ andl $7,%ebx
+ jz .L010sw_end
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ decl %ebx
+ movl %edx,4(%esi)
+ jz .L010sw_end
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ decl %ebx
+ movl %edx,12(%esi)
+ jz .L010sw_end
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ decl %ebx
+ movl %edx,20(%esi)
+ jz .L010sw_end
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ decl %ebx
+ movl %edx,28(%esi)
+ jz .L010sw_end
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ decl %ebx
+ movl %edx,36(%esi)
+ jz .L010sw_end
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ decl %ebx
+ movl %edx,44(%esi)
+ jz .L010sw_end
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+.L010sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl bn_div_words
+.type bn_div_words,@function
+.align 16
+bn_div_words:
+.L_bn_div_words_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ movl 12(%esp),%ecx
+ divl %ecx
+ ret
+.size bn_div_words,.-.L_bn_div_words_begin
+.globl bn_add_words
+.type bn_add_words,@function
+.align 16
+bn_add_words:
+.L_bn_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L011aw_finish
+.L012aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L012aw_loop
+.L011aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L013aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L013aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L013aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L013aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L013aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L013aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L013aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L013aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_add_words,.-.L_bn_add_words_begin
+.globl bn_sub_words
+.type bn_sub_words,@function
+.align 16
+bn_sub_words:
+.L_bn_sub_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L014aw_finish
+.L015aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L015aw_loop
+.L014aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L016aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L016aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L016aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L016aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L016aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L016aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L016aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L016aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_words,.-.L_bn_sub_words_begin
+.globl bn_sub_part_words
+.type bn_sub_part_words,@function
+.align 16
+bn_sub_part_words:
+.L_bn_sub_part_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L017aw_finish
+.L018aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L018aw_loop
+.L017aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+.L019aw_end:
+ cmpl $0,36(%esp)
+ je .L020pw_end
+ movl 36(%esp),%ebp
+ cmpl $0,%ebp
+ je .L020pw_end
+ jge .L021pw_pos
+
+ movl $0,%edx
+ subl %ebp,%edx
+ movl %edx,%ebp
+ andl $4294967288,%ebp
+ jz .L022pw_neg_finish
+.L023pw_neg_loop:
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl $0,%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L023pw_neg_loop
+.L022pw_neg_finish:
+ movl 36(%esp),%edx
+ movl $0,%ebp
+ subl %edx,%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+ jmp .L020pw_end
+.L021pw_pos:
+ andl $4294967288,%ebp
+ jz .L024pw_pos_finish
+.L025pw_pos_loop:
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L026pw_nc0
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L027pw_nc1
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L028pw_nc2
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L029pw_nc3
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L030pw_nc4
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L031pw_nc5
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L032pw_nc6
+
+ movl 28(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,28(%ebx)
+ jnc .L033pw_nc7
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L025pw_pos_loop
+.L024pw_pos_finish:
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L034pw_tail_nc0
+ decl %ebp
+ jz .L020pw_end
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L035pw_tail_nc1
+ decl %ebp
+ jz .L020pw_end
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L036pw_tail_nc2
+ decl %ebp
+ jz .L020pw_end
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L037pw_tail_nc3
+ decl %ebp
+ jz .L020pw_end
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L038pw_tail_nc4
+ decl %ebp
+ jz .L020pw_end
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L039pw_tail_nc5
+ decl %ebp
+ jz .L020pw_end
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L040pw_tail_nc6
+ movl $1,%eax
+ jmp .L020pw_end
+.L041pw_nc_loop:
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L026pw_nc0:
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L027pw_nc1:
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L028pw_nc2:
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L029pw_nc3:
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L030pw_nc4:
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L031pw_nc5:
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L032pw_nc6:
+ movl 28(%esi),%ecx
+ movl %ecx,28(%ebx)
+.L033pw_nc7:
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L041pw_nc_loop
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L042pw_nc_end
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L034pw_tail_nc0:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L035pw_tail_nc1:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L036pw_tail_nc2:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L037pw_tail_nc3:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L038pw_tail_nc4:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L039pw_tail_nc5:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L040pw_tail_nc6:
+.L042pw_nc_end:
+ movl $0,%eax
+.L020pw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_part_words,.-.L_bn_sub_part_words_begin
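The `.size` directive above closes `bn_sub_part_words`, whose unrolled tail this hunk shows: one path computes `0 - borrow - b[i]` when the subtrahend is the longer input (the `movl $0,%ecx; subl %eax,%ecx; ...; subl %edx,%ecx` pattern), the other copies `a[i] - borrow` with `jnc` exits into the no-carry (`pw_nc*`) block once the borrow dies. As an aid for tracing the flags, here is a minimal C sketch of the arithmetic, assuming the usual OpenSSL semantics for this routine (r = a - b with cl common limbs and a length difference of dl); the names and the smoke test are illustrative, not the library's code:

```c
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BN_ULONG;              /* 32-bit limbs, as in the x86 code */

/* Sketch of the assumed semantics: subtract b from a over cl common limbs,
 * then run the borrow through the |dl| extra limbs of the longer input;
 * returns the final borrow. */
static BN_ULONG sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                               const BN_ULONG *b, int cl, int dl)
{
    BN_ULONG borrow = 0;
    int i = 0;

    for (; i < cl; i++) {               /* common part: a[i] - b[i] - borrow */
        BN_ULONG t = a[i] - borrow;
        BN_ULONG nb = (a[i] < borrow);
        r[i] = t - b[i];
        borrow = nb | (t < b[i]);
    }
    for (; dl < 0; dl++, i++) {         /* b is longer: 0 - borrow - b[i] */
        r[i] = 0 - borrow - b[i];
        borrow = (borrow | b[i]) != 0;  /* underflows unless both are zero */
    }
    for (; dl > 0; dl--, i++) {         /* a is longer: a[i] - borrow */
        r[i] = a[i] - borrow;
        borrow = (a[i] < borrow);
    }
    return borrow;
}

int main(void)                           /* tiny smoke test: 2^64 - 1 */
{
    BN_ULONG a[3] = { 0, 0, 1 }, b[2] = { 1, 0 }, r[3];
    BN_ULONG bw = sub_part_words(r, a, b, 2, 1);
    printf("%u %u %u borrow=%u\n", r[0], r[1], r[2], bw);
    /* prints 4294967295 4294967295 0 borrow=0 */
    return 0;
}
```

The `jnc` fast exits pay off because once the borrow clears in the `a[i] - borrow` tail it can never reappear, so everything that remains is a plain copy — which is exactly what the `.L041pw_nc_loop` block and the `pw_tail_nc*` labels do.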
diff --git a/crypto/bn/asm/bn-mips.s b/crypto/bn/asm/bn-mips.S
similarity index 99%
rename from crypto/bn/asm/bn-mips.s
rename to crypto/bn/asm/bn-mips.S
index d1535b1..229c709 100644
--- a/crypto/bn/asm/bn-mips.s
+++ b/crypto/bn/asm/bn-mips.S
@@ -394,14 +394,14 @@ bn_add_words_internal:
sltu $2,$14,$10
sw $14,-2*4($4)
addu $2,$24
-
+
addu $11,$15
sltu $25,$11,$15
addu $15,$11,$2
sltu $2,$15,$11
sw $15,-4($4)
addu $2,$25
-
+
.set noreorder
bgtzl $1,.L_bn_add_words_loop
lw $12,0($5)
@@ -567,7 +567,7 @@ bn_div_3_words:
# so that we can save two arguments
# and return address in registers
# instead of stack:-)
-
+
lw $4,($7)
move $10,$5
bne $4,$6,bn_div_3_words_internal
@@ -582,7 +582,7 @@ bn_div_3_words:
bn_div_3_words_internal:
.set reorder
move $11,$31
- bal bn_div_words
+ bal bn_div_words_internal
move $31,$11
multu $10,$2
lw $14,-2*4($7)
diff --git a/crypto/bn/asm/co-586.S b/crypto/bn/asm/co-586.S
new file mode 100644
index 0000000..3cb8073
--- /dev/null
+++ b/crypto/bn/asm/co-586.S
@@ -0,0 +1,1254 @@
+.file "crypto/bn/asm/co-586.s"
+.text
+.globl bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 16(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 24(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,28(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,36(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,44(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,52(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%eax)
+
+
+ movl %ebx,60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+
+
+ movl %ecx,28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 12(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl (%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%eax
+ adcl $0,%ebx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,28(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 20(%esi),%eax
+ adcl $0,%ecx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 8(%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 28(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,36(%edi)
+ movl 12(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 20(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 28(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,44(%edi)
+ movl 20(%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,52(%edi)
+
+
+ xorl %ecx,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%edi)
+
+ movl %ebx,60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+
+ movl %ecx,28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
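All of co-586.S is Comba ("column-wise") multiplication and squaring: each result limb is one column, the sum of every partial product a[i]*b[j] with i+j fixed, kept in a three-limb accumulator that the code rotates through %ebx/%ecx/%ebp so no intermediate carry ever touches memory (the squaring variants fold the symmetric a[i]*a[j] terms by doubling, hence their `addl %eax,%eax; adcl %edx,%edx` pairs). A compact C sketch of the 4-limb multiply, with illustrative names and a 64-bit accumulator plus overflow counter standing in for the register triple:

```c
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BN_ULONG;

/* Comba multiplication, the shape of bn_mul_comba4 above:
 * r[0..7] = a[0..3] * b[0..3], one column per result limb. */
static void mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4])
{
    uint64_t acc = 0;                   /* low two limbs of the accumulator */
    BN_ULONG c2 = 0;                    /* third limb: counts 64-bit overflows */

    for (int k = 0; k < 7; k++) {
        for (int i = 0; i < 4; i++) {
            int j = k - i;
            if (j < 0 || j > 3)
                continue;
            uint64_t p = (uint64_t)a[i] * b[j];
            acc += p;
            if (acc < p)                /* carry out of the 64-bit part */
                c2++;
        }
        r[k] = (BN_ULONG)acc;           /* emit the finished column */
        acc = (acc >> 32) | ((uint64_t)c2 << 32);
        c2 = 0;
    }
    r[7] = (BN_ULONG)acc;               /* top column, no carry left */
}

int main(void)                           /* (2^64-1)^2 = 2^128 - 2^65 + 1 */
{
    BN_ULONG a[4] = { 0xffffffffu, 0xffffffffu, 0, 0 };
    BN_ULONG r[8];
    mul_comba4(r, a, a);
    for (int k = 7; k >= 0; k--)
        printf("%08x%s", r[k], k ? " " : "\n");
    return 0;
}
```

The assembly replaces this double loop with a fully unrolled straight line, which is the point of the technique: no loop control at all, and each column's `adcl $0` collects the overflow into the register that becomes the next column's third limb.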
diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 0000000..e258658
--- /dev/null
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+# stalls after ldf8, xma and getf.sig outside inner loop and
+# improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+# once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+# acute interest, because upcoming Tukwila's individual cores are
+# reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+# sign verify sign/s verify/s
+# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
+# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
+# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
+# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
+# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
+# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
+# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
+# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
+# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
+# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
+# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
+# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
+# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
+#
+# As can be seen, RSA sign performance improves by 130%-30% (the gain
+# shrinking for longer keys), while verify improves by 74%-13%.
+# DSA performance improves by 115%-30%.
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+// const BN_ULONG *bp,const BN_ULONG *np,
+// const BN_ULONG *n0p,int num);
+.align 64
+.global bn_mul_mont#
+.proc bn_mul_mont#
+bn_mul_mont:
+ .prologue
+ .body
+{ .mmi; cmp4.le p6,p7=2,r37;;
+(p6) cmp4.lt.unc p8,p9=8,r37
+ mov ret0=r0 };;
+{ .bbb;
+(p9) br.cond.dptk.many bn_mul_mont_8
+(p8) br.cond.dpnt.many bn_mul_mont_general
+(p7) br.ret.spnt.many b0 };;
+.endp bn_mul_mont#
+
+prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
+
+rptr=r8; aptr=r9; bptr=r14; nptr=r15;
+tptr=r16; // &tp[0]
+tp_1=r17; // &tp[-1]
+num=r18; len=r19; lc=r20;
+topbit=r21; // carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align 64
+.local bn_mul_mont_general#
+.proc bn_mul_mont_general#
+bn_mul_mont_general:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ $ADDP aptr=0,in1
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; .vframe prevsp
+ mov prevsp=sp
+ $ADDP bptr=0,in2
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotf alo[6],nlo[4],ahi[8],nhi[6]
+ .rotr a[3],n[3],t[2]
+
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 alo[4]=[aptr],16 // ap[0]
+ $ADDP r30=8,in1 };;
+{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
+ ldf8 alo[2]=[aptr],16 // ap[2]
+ $ADDP in4=0,in4 };;
+{ .mmi; ldf8 alo[1]=[r30] // ap[3]
+ ldf8 n0=[in4] // n0
+ $ADDP rptr=0,in0 }
+{ .mmi; $ADDP nptr=0,in3
+ mov r31=16
+ zxt4 num=in5 };;
+{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
+ shladd len=num,3,r0
+ shladd r31=num,3,r31 };;
+{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
+ add lc=-5,num
+ sub r31=sp,r31 };;
+{ .mfb; and sp=-16,r31 // alloca
+ xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
+ nop.b 0 }
+{ .mfb; nop.m 0
+ xmpy.lu alo[4]=alo[4],bi
+ brp.loop.imp .L1st_ctop,.L1st_cend-16
+ };;
+{ .mfi; nop.m 0
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
+ add tp_1=8,sp }
+{ .mfi; nop.m 0
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20001f<<16
+ // ------^----- (p40) at first (p23)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; nop.m 0
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
+ mov ar.lc=lc }
+{ .mfi; nop.m 0
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+.align 32
+.L1st_ctop:
+.pred.rel "mutex",p40,p42
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23) }
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p23) st8 [tp_1]=n[2],8
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mmb; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ br.ctop.sptk .L1st_ctop };;
+.L1st_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ add num=-1,num };; // num--
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ sub aptr=aptr,len };; // rewind
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+ sub nptr=nptr,len };;
+{ .mmi; .pred.rel "mutex",p39,p41
+(p39) add topbit=r0,r0
+(p41) add topbit=r0,r0,1
+ nop.i 0 }
+{ .mmi; st8 [tp_1]=n[0]
+ add tptr=16,sp
+ add tp_1=8,sp };;
+
+.Louter:
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 ahi[3]=[tptr] // tp[0]
+ add r30=8,aptr };;
+{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
+ ldf8 alo[3]=[r30],16 // ap[1]
+ add r31=8,nptr };;
+{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
+ xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
+ brp.loop.imp .Linner_ctop,.Linner_cend-16
+ }
+{ .mfb; ldf8 alo[1]=[r30] // ap[3]
+ xma.lu alo[4]=alo[4],bi,ahi[3]
+ clrrrb.pr };;
+{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
+ nop.i 0 }
+{ .mfi; ldf8 nlo[1]=[r31] // np[1]
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20101f<<16
+ // ------^----- (p40) at first (p23)
+ // --------^--- (p30) at first (p22)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
+ mov ar.lc=lc }
+{ .mfi;
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of
+// 7 in the latter case accounts for a two-tick pipeline stall, which
+// means its performance would be ~20% lower than optimal. No attempt
+// was made to address this, because the original Itanium is hardly
+// represented out in the wild...
+.align 32
+.Linner_ctop:
+.pred.rel "mutex",p40,p42
+.pred.rel "mutex",p30,p32
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23)
+{ .mfi; (p16) nop.m 0
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p16) nop.f 0
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p21) ld8 t[0]=[tptr],8
+ (p16) nop.f 0
+ (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p30) add a[1]=a[1],t[1] } // (p22)
+{ .mfi; (p16) nop.m 0
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p32) add a[1]=a[1],t[1],1 };; // (p22)
+{ .mmi; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
+{ .mmb; (p23) st8 [tp_1]=n[2],8
+ (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
+ br.ctop.sptk .Linner_ctop };;
+.Linner_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ nop.i 0 };;
+
+{ .mmi; .pred.rel "mutex",p31,p33
+(p31) add a[0]=a[0],topbit
+(p33) add a[0]=a[0],topbit,1
+ mov topbit=r0 };;
+{ .mfi; .pred.rel "mutex",p31,p33
+(p31) cmp.ltu p32,p30=a[0],topbit
+(p33) cmp.leu p32,p30=a[0],topbit
+ }
+{ .mfi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ };;
+{ .mmi; .pred.rel "mutex",p44,p46
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+(p32) add topbit=r0,r0,1 }
+
+{ .mmi; st8 [tp_1]=n[0],8
+ cmp4.ne p6,p0=1,num
+ sub aptr=aptr,len };; // rewind
+{ .mmi; sub nptr=nptr,len
+(p41) add topbit=r0,r0,1
+ add tptr=16,sp }
+{ .mmb; add tp_1=8,sp
+ add num=-1,num // num--
+(p6) br.cond.sptk.many .Louter };;
+
+{ .mbb; add lc=4,lc
+ brp.loop.imp .Lsub_ctop,.Lsub_cend-16
+ clrrrb.pr };;
+{ .mii; nop.m 0
+ mov pr.rot=0x10001<<16
+ // ------^---- (p33) at first (p17)
+ mov ar.lc=lc }
+{ .mii; nop.m 0
+ mov ar.ec=3
+ nop.i 0 };;
+
+.Lsub_ctop:
+.pred.rel "mutex",p33,p35
+{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
+ (p16) nop.f 0
+ (p33) sub n[1]=t[1],n[1] } // (p17)
+{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
+ (p16) nop.f 0
+ (p35) sub n[1]=t[1],n[1],1 };; // (p17)
+{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
+ (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
+ (p18) nop.b 0 }
+{ .mib; (p18) nop.m 0
+ (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
+ br.ctop.sptk .Lsub_ctop };;
+.Lsub_cend:
+
+{ .mmb; .pred.rel "mutex",p34,p36
+(p34) sub topbit=topbit,r0 // (p19)
+(p36) sub topbit=topbit,r0,1
+ brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
+ }
+{ .mmb; sub rptr=rptr,len // rewind
+ sub tptr=tptr,len
+ clrrrb.pr };;
+{ .mmi; and aptr=tptr,topbit
+ andcm bptr=rptr,topbit
+ mov pr.rot=1<<16 };;
+{ .mii; or nptr=aptr,bptr
+ mov ar.lc=lc
+ mov ar.ec=3 };;
+
+.Lcopy_ctop:
+{ .mmb; (p16) ld8 n[0]=[nptr],8
+ (p18) st8 [tptr]=r0,8
+ (p16) nop.b 0 }
+{ .mmb; (p16) nop.m 0
+ (p18) st8 [rptr]=n[2],8
+ br.ctop.sptk .Lcopy_ctop };;
+.Lcopy_cend:
+
+{ .mmi; mov ret0=1 // signal "handled"
+ rum 1<<5 // clear um.mfh
+ mov ar.lc=prevlc }
+{ .mib; .restore sp
+ mov sp=prevsp
+ mov pr=prevpr,0x1ffff
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_general#
+
+a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
+n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
+t0=r15;
+
+ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.align 64
+.skip 48 // aligns loop body
+.local bn_mul_mont_8#
+.proc bn_mul_mont_8#
+bn_mul_mont_8:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ .vframe prevsp
+ mov prevsp=sp
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; add r17=-6*16,sp
+ add sp=-7*16,sp
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+{ .mmi; .save.gf 0,0x10
+ stf.spill [sp]=f16,-16
+ .save.gf 0,0x20
+ stf.spill [r17]=f17,32
+ add r16=-5*16,prevsp};;
+{ .mmi; .save.gf 0,0x40
+ stf.spill [r16]=f18,32
+ .save.gf 0,0x80
+ stf.spill [r17]=f19,32
+ $ADDP aptr=0,in1 };;
+{ .mmi; .save.gf 0,0x100
+ stf.spill [r16]=f20,32
+ .save.gf 0,0x200
+ stf.spill [r17]=f21,32
+ $ADDP r29=8,in1 };;
+{ .mmi; .save.gf 0,0x400
+ stf.spill [r16]=f22
+ .save.gf 0,0x800
+ stf.spill [r17]=f23
+ $ADDP rptr=0,in0 };;
+
+ .body
+ .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+ .rotr t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
+ ldf8 ai1=[r29],16 // ap[1]
+ $ADDP bptr=0,in2 }
+{ .mmi; $ADDP r30=8,in2
+ $ADDP nptr=0,in3
+ $ADDP r31=8,in3 };;
+{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
+ ldf8 bj[6]=[r30],16 // bp[1]
+ cmp4.le p4,p5=3,in5 }
+{ .mmi; ldf8 ni0=[nptr],16 // np[0]
+ ldf8 ni1=[r31],16 // np[1]
+ cmp4.le p6,p7=4,in5 };;
+
+{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
+ (p5)fcvt.fxu ai2=f0
+ cmp4.le p8,p9=5,in5 }
+{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
+ (p7)fcvt.fxu ai3=f0
+ cmp4.le p10,p11=6,in5 }
+{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
+ (p5)fcvt.fxu bj[5]=f0
+ cmp4.le p12,p13=7,in5 }
+{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
+ (p7)fcvt.fxu bj[4]=f0
+ cmp4.le p14,p15=8,in5 }
+{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
+ (p5)fcvt.fxu ni2=f0
+ addp4 r28=-1,in5 }
+{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
+ (p7)fcvt.fxu ni3=f0
+ $ADDP in4=0,in4 };;
+
+{ .mfi; ldf8 n0=[in4]
+ fcvt.fxu tf[1]=f0
+ nop.i 0 }
+
+{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
+ (p9)fcvt.fxu ai4=f0
+ mov t[0]=r0 }
+{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
+ (p11)fcvt.fxu ai5=f0
+ mov t[1]=r0 }
+{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
+ (p9)fcvt.fxu bj[3]=f0
+ mov t[2]=r0 }
+{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
+ (p11)fcvt.fxu bj[2]=f0
+ mov t[3]=r0 }
+{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
+ (p9)fcvt.fxu ni4=f0
+ mov t[4]=r0 }
+{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
+ (p11)fcvt.fxu ni5=f0
+ mov t[5]=r0 };;
+
+{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
+ (p13)fcvt.fxu ai6=f0
+ mov t[6]=r0 }
+{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
+ (p15)fcvt.fxu ai7=f0
+ mov t[7]=r0 }
+{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
+ (p13)fcvt.fxu bj[1]=f0
+ mov ar.lc=r28 }
+{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
+ (p15)fcvt.fxu bj[0]=f0
+ mov ar.ec=1 }
+{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
+ (p13)fcvt.fxu ni6=f0
+ mov pr.rot=1<<16 }
+{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
+ (p15)fcvt.fxu ni7=f0
+ brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
+ };;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
+// to measure it with the help of the Interval Time Counter indicated
+// that the factor is a tad higher: 33 or 34, if not 35. Measuring it
+// exactly and addressing the issue is problematic, because I don't
+// have access to a platform-specific instruction-level profiler. On
+// Itanium it should run in 56*n ticks, because of higher xma latency...
+.Louter_8_ctop:
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 0:
+ (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
+ (p40) add a3=a3,n3 } // (p17) a3+=n3
+{ .mfi; (p42) add a3=a3,n3,1
+ (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 4:
+ (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
+ (p41) add a4=a4,n4 } // (p17) a4+=n4
+{ .mfi; (p43) add a4=a4,n4,1
+ (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
+ (p16) nop.i 0 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p16) nop.m 0 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 8:
+ (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
+ (p40) add a5=a5,n5 } // (p17) a5+=n5
+{ .mfi; (p42) add a5=a5,n5,1
+ (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a1=alo[1] // 9:
+ (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mfi; (p16) nop.m 0 // 10:
+ (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
+ (p40) cmp.ltu p43,p41=a5,n5 }
+{ .mfi; (p42) cmp.leu p43,p41=a5,n5
+ (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
+ (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
+ (p41) add a6=a6,n6 } // (p17) a6+=n6
+{ .mfi; (p43) add a6=a6,n6,1
+ (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a2=alo[2] // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mfi; (p16) nop.m 0 // 14:
+ (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
+ (p41) cmp.ltu p42,p40=a6,n6 }
+{ .mfi; (p43) cmp.leu p42,p40=a6,n6
+ (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
+ (p16) nop.i 0 };;
+{ .mii; (p16) nop.m 0 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 16:
+ (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
+ (p40) add a7=a7,n7 } // (p17) a7+=n7
+{ .mfi; (p42) add a7=a7,n7,1
+ (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a3=alo[3] // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mfi; (p16) nop.m 0 // 18:
+ (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
+ (p40) cmp.ltu p43,p41=a7,n7 }
+{ .mfi; (p42) cmp.leu p43,p41=a7,n7
+ (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n1=nlo[1] // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 20:
+ (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
+ (p41) add a8=a8,n8 } // (p17) a8+=n8
+{ .mfi; (p43) add a8=a8,n8,1
+ (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a4=alo[4] // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 };;
+{ .mfi; (p16) nop.m 0 // 22:
+ (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
+ (p41) cmp.ltu p42,p40=a8,n8 }
+{ .mfi; (p43) cmp.leu p42,p40=a8,n8
+ (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n2=nlo[2] // 23:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 };;
+{ .mfi; (p16) nop.m 0 // 24:
+ (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
+ (p16) add a1=a1,n1 } // (p16) a1+=n1
+{ .mfi; (p16) nop.m 0
+ (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
+ (p17) mov t[0]=r0 };;
+{ .mii; (p16) getf.sig a5=alo[5] // 25:
+ (p16) add t0=t[7],a1 // (p16) t[7]+=a1
+ (p42) add t[0]=t[0],r0,1 };;
+{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
+ (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
+ (p50) add t[0]=t[0],r0,1 }
+{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
+ (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n3=nlo[3] // 27:
+ (p16) cmp.ltu.unc p50,p48=t0,a1
+ (p16) nop.i 0 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 28:
+ (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
+ (p40) add a2=a2,n2 } // (p16) a2+=n2
+{ .mfi; (p42) add a2=a2,n2,1
+ (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a6=alo[6] // 29:
+ (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
+ (p50) add t[6]=t[6],a2,1 };;
+{ .mfi; (p16) nop.m 0 // 30:
+ (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
+ (p40) cmp.ltu p41,p39=a2,n2 }
+{ .mfi; (p42) cmp.leu p41,p39=a2,n2
+ (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
+ (p16) nop.i 0 };;
+{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
+ (p16) nop.f 0
+ (p48) cmp.ltu p49,p47=t[6],a2 }
+{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
+ (p16) nop.f 0
+ br.ctop.sptk.many .Louter_8_ctop };;
+.Louter_8_cend:
+
+// the loop above has to execute one more time, without (p16), which is
+// replaced with a merged move of np[8] to the GPR bank
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mmi; (p0) getf.sig n1=ni0 // 0:
+ (p40) add a3=a3,n3 // (p17) a3+=n3
+ (p42) add a3=a3,n3,1 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mmi; (p0) getf.sig n2=ni1 // 4:
+ (p41) add a4=a4,n4 // (p17) a4+=n4
+ (p43) add a4=a4,n4,1 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p0) nop.f 0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p0) getf.sig n3=ni2 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) getf.sig n4=ni3 // 8:
+ (p40) add a5=a5,n5 // (p17) a5+=n5
+ (p42) add a5=a5,n5,1 };;
+{ .mii; (p0) nop.m 0 // 9:
+ (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mii; (p0) nop.m 0 // 10:
+ (p40) cmp.ltu p43,p41=a5,n5
+ (p42) cmp.leu p43,p41=a5,n5 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p17) getf.sig n8=nhi[8] // 12:
+ (p41) add a6=a6,n6 // (p17) a6+=n6
+ (p43) add a6=a6,n6,1 };;
+{ .mii; (p0) getf.sig n5=ni4 // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mii; (p0) nop.m 0 // 14:
+ (p41) cmp.ltu p42,p40=a6,n6
+ (p43) cmp.leu p42,p40=a6,n6 };;
+{ .mii; (p0) getf.sig n6=ni5 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) nop.m 0 // 16:
+ (p40) add a7=a7,n7 // (p17) a7+=n7
+ (p42) add a7=a7,n7,1 };;
+{ .mii; (p0) nop.m 0 // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mii; (p0) nop.m 0 // 18:
+ (p40) cmp.ltu p43,p41=a7,n7
+ (p42) cmp.leu p43,p41=a7,n7 };;
+{ .mii; (p0) getf.sig n7=ni6 // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p0) nop.m 0 // 20:
+ (p41) add a8=a8,n8 // (p17) a8+=n8
+ (p43) add a8=a8,n8,1 };;
+{ .mmi; (p0) nop.m 0 // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 }
+{ .mmi; (p17) mov t[0]=r0
+ (p41) cmp.ltu p42,p40=a8,n8
+ (p43) cmp.leu p42,p40=a8,n8 };;
+{ .mmi; (p0) getf.sig n8=ni7 // 22:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 }
+{ .mmi; (p42) add t[0]=t[0],r0,1
+ (p0) add r16=-7*16,prevsp
+ (p0) add r17=-6*16,prevsp };;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
+{ .mmi; (p50)add t[0]=t[0],r0,1
+ add r18=-5*16,prevsp
+ sub n1=t0,n1 };;
+{ .mmi; cmp.gtu p34,p32=n1,t0;;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n2=t[7],n2
+ (p34)sub n2=t[7],n2,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
+ (p34)cmp.geu p35,p33=n2,t[7];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n3=t[6],n3 }
+{ .mmi; (p35)sub n3=t[6],n3,1;;
+ (p33)cmp.gtu p34,p32=n3,t[6]
+ (p35)cmp.geu p34,p32=n3,t[6] };;
+ .pred.rel "mutex",p32,p34
+{ .mii; (p32)sub n4=t[5],n4
+ (p34)sub n4=t[5],n4,1;;
+ (p32)cmp.gtu p35,p33=n4,t[5] }
+{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n5=t[4],n5
+ (p35)sub n5=t[4],n5,1 };;
+{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
+ (p35)cmp.geu p34,p32=n5,t[4];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n6=t[3],n6 }
+{ .mmi; (p34)sub n6=t[3],n6,1;;
+ (p32)cmp.gtu p35,p33=n6,t[3]
+ (p34)cmp.geu p35,p33=n6,t[3] };;
+ .pred.rel "mutex",p33,p35
+{ .mii; (p33)sub n7=t[2],n7
+ (p35)sub n7=t[2],n7,1;;
+ (p33)cmp.gtu p34,p32=n7,t[2] }
+{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n8=t[1],n8
+ (p34)sub n8=t[1],n8,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
+ (p34)cmp.geu p35,p33=n8,t[1];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub a8=t[0],r0 }
+{ .mmi; (p35)sub a8=t[0],r0,1;;
+ (p33)cmp.gtu p34,p32=a8,t[0]
+ (p35)cmp.geu p34,p32=a8,t[0] };;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+ .pred.rel "mutex",p32,p34
+{ .mmi; (p32)st8 [rptr]=n1,8
+ (p34)st8 [rptr]=t0,8
+ add r19=-4*16,prevsp};;
+{ .mmb; (p32)st8 [rptr]=n2,8
+ (p34)st8 [rptr]=t[7],8
+ (p5)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n3,8
+ (p34)st8 [rptr]=t[6],8
+ (p7)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n4,8
+ (p34)st8 [rptr]=t[5],8
+ (p9)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n5,8
+ (p34)st8 [rptr]=t[4],8
+ (p11)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n6,8
+ (p34)st8 [rptr]=t[3],8
+ (p13)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n7,8
+ (p34)st8 [rptr]=t[2],8
+ (p15)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n8,8
+ (p34)st8 [rptr]=t[1],8
+ nop.b 0 };;
+.Ldone: // epilogue
+{ .mmi; ldf.fill f16=[r16],64
+ ldf.fill f17=[r17],64
+ nop.i 0 }
+{ .mmi; ldf.fill f18=[r18],64
+ ldf.fill f19=[r19],64
+ mov pr=prevpr,0x1ffff };;
+{ .mmi; ldf.fill f20=[r16]
+ ldf.fill f21=[r17]
+ mov ar.lc=prevlc }
+{ .mmi; ldf.fill f22=[r18]
+ ldf.fill f23=[r19]
+ mov ret0=1 } // signal "handled"
+{ .mib; rum 1<<5
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_8#
+
+.type copyright#,\@object
+copyright:
+stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by "
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
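Behind all the rotation and predication, ia64-mont.pl computes exactly what the prototype quoted in the file says: rp = ap * bp * R^-1 mod np with R = 2^(64*num), where n0p points at -np[0]^-1 mod 2^64. As a reference for what the pipelined loops achieve, here is a word-serial C sketch — an assumed textbook CIOS-style formulation, not the module's IA-64 schedule; `unsigned __int128` is a GCC/Clang extension standing in for `xma`'s 64x64->128 multiply:

```c
#include <stdint.h>
#include <string.h>

typedef uint64_t BN_ULONG;              /* IA-64 limbs are 64 bits wide */
typedef unsigned __int128 u128;         /* assumption: GCC/Clang extension */

/* Word-serial sketch of rp = ap*bp*R^-1 mod np, R = 2^(64*num),
 * n0 = -np[0]^-1 mod 2^64. Requires num >= 1; returns 1 like the
 * "signal handled" paths above. Illustration only: the variable-length
 * array and data-dependent final selection are fine here, not in
 * production crypto. */
static int mul_mont_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                           const BN_ULONG *bp, const BN_ULONG *np,
                           const BN_ULONG *n0p, int num)
{
    BN_ULONG tp[num + 2];               /* tp[num] plays the "topbit" role */
    memset(tp, 0, sizeof(tp));

    for (int i = 0; i < num; i++) {
        u128 c = 0;
        for (int j = 0; j < num; j++) { /* tp += ap * bp[i] */
            c += (u128)ap[j] * bp[i] + tp[j];
            tp[j] = (BN_ULONG)c;
            c >>= 64;
        }
        c += tp[num];
        tp[num] = (BN_ULONG)c;
        tp[num + 1] = (BN_ULONG)(c >> 64);

        BN_ULONG m = tp[0] * n0p[0];    /* m = tp[0]*n0 mod 2^64 */
        c = ((u128)m * np[0] + tp[0]) >> 64;  /* limb 0 cancels by choice of m */
        for (int j = 1; j < num; j++) { /* tp = (tp + m*np) / 2^64 */
            c += (u128)m * np[j] + tp[j];
            tp[j - 1] = (BN_ULONG)c;
            c >>= 64;
        }
        c += tp[num];
        tp[num - 1] = (BN_ULONG)c;
        tp[num] = tp[num + 1] + (BN_ULONG)(c >> 64);
        tp[num + 1] = 0;
    }

    BN_ULONG borrow = 0;                /* rp = tp - np, tracking the borrow */
    for (int j = 0; j < num; j++) {
        u128 d = (u128)tp[j] - np[j] - borrow;
        rp[j] = (BN_ULONG)d;
        borrow = (BN_ULONG)(d >> 64) & 1;
    }
    /* keep tp - np only if tp >= np: the carry word is set, or no borrow
     * came out of the subtraction */
    if (!(tp[num] != 0 || borrow == 0))
        memcpy(rp, tp, num * sizeof(BN_ULONG));
    return 1;
}
```

Computing n0 and moving operands in and out of the Montgomery domain is the caller's job — in OpenSSL that state lives in a BN_MONT_CTX — and a real implementation would also make the final conditional subtraction constant-time, as the `and`/`andcm` copy loop at the end of bn_mul_mont_general does.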
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 951abc5..c0cee82 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -568,7 +568,7 @@ bn_sqr_comba8:
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/crypto/bn/asm/mips-mont.s b/crypto/bn/asm/mips-mont.S
similarity index 99%
rename from crypto/bn/asm/mips-mont.s
rename to crypto/bn/asm/mips-mont.S
index 867de6f..32ecee5 100644
--- a/crypto/bn/asm/mips-mont.s
+++ b/crypto/bn/asm/mips-mont.S
@@ -7,6 +7,8 @@
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
+ lw $8,16($29)
+ lw $9,20($29)
slt $1,$9,4
bnez $1,1f
li $2,0
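The two added `lw` instructions are the substance of this hunk: the MIPS o32 calling convention passes only the first four arguments in registers ($4-$7), so bn_mul_mont's fifth and sixth arguments arrive on the stack. A comment sketch of the mapping, assuming the same prototype the other bn_mul_mont implementations in this patch use:

```c
/* Assumed prototype, as quoted for the IA-64 bn_mul_mont above:
 *   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 *                   const BN_ULONG *np, const BN_ULONG *n0p, int num);
 *
 * MIPS o32 argument placement on entry:
 *   rp  -> $4 ($a0)    ap  -> $5 ($a1)    bp -> $6 ($a2)    np -> $7 ($a3)
 *   n0p -> 16($29)     num -> 20($29)           ($29 is the stack pointer)
 *
 * Hence the added loads: "lw $8,16($29)" and "lw $9,20($29)" pull n0p and
 * num into $8/$9 ($t0/$t1) before the "slt $1,$9,4" size check that
 * rejects num < 4. */
```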
diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
index f04b3b9..38b5164 100644
--- a/crypto/bn/asm/mips.pl
+++ b/crypto/bn/asm/mips.pl
@@ -583,14 +583,14 @@
sltu $v0,$t2,$ta2
$ST $t2,-2*$BNSZ($a0)
$ADDU $v0,$t8
-
+
$ADDU $ta3,$t3
sltu $t9,$ta3,$t3
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
$ADDU $v0,$t9
-
+
.set noreorder
bgtzl $at,.L_bn_add_words_loop
$LD $t0,0($a1)
@@ -790,7 +790,7 @@
# so that we can save two arguments
# and return address in registers
# instead of stack:-)
-
+
$LD $a0,($a3)
move $ta2,$a1
bne $a0,$a2,bn_div_3_words_internal
@@ -819,7 +819,7 @@
$code.=<<___;
.set reorder
move $ta3,$ra
- bal bn_div_words
+ bal bn_div_words_internal
move $ra,$ta3
$MULTU $ta2,$v0
$LD $t2,-2*$BNSZ($a3)
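In both MIPS files the substantive change is the same one-liner: bn_div_3_words reaches its division helper via `bal`, and the target moves from the public `bn_div_words` to the local `bn_div_words_internal`. The retained comment ("save two arguments and return address in registers instead of stack") describes the register-based protocol the internal entry points share; branching to the public symbol would go through a prologue written for the standard calling convention, so the internal label is presumably the intended target. The remaining `-`/`+` pairs in these hunks appear to differ only in trailing whitespace, which the re-import strips.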
diff --git a/crypto/bn/asm/modexp512-x86_64.S b/crypto/bn/asm/modexp512-x86_64.S
new file mode 100644
index 0000000..6cccafb
--- /dev/null
+++ b/crypto/bn/asm/modexp512-x86_64.S
@@ -0,0 +1,1773 @@
+.text
+
+.type MULADD_128x512,@function
+.align 16
+MULADD_128x512:
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ .byte 0xf3,0xc3
+.size MULADD_128x512,.-MULADD_128x512
+.type mont_reduce,@function
+.align 16
+mont_reduce:
+ leaq 192(%rsp),%rdi
+ movq 32(%rsp),%rsi
+ addq $576,%rsi
+ leaq 520(%rsp),%rcx
+
+ movq 96(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ movq (%rcx),%r8
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ movq 8(%rcx),%r9
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ movq 16(%rcx),%r10
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ movq 24(%rcx),%r11
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ movq 32(%rcx),%r12
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ movq 40(%rcx),%r13
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ movq 48(%rcx),%r14
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ movq 56(%rcx),%r15
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 104(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 112(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,16(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 120(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,24(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ xorq %rax,%rax
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+ adcq 80(%rcx),%r10
+ adcq 88(%rcx),%r11
+ adcq $0,%rax
+
+
+
+
+ movq %r8,64(%rdi)
+ movq %r9,72(%rdi)
+ movq %r10,%rbp
+ movq %r11,88(%rdi)
+
+ movq %rax,384(%rsp)
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+
+
+
+
+
+
+
+
+ addq $80,%rdi
+
+ addq $64,%rsi
+ leaq 296(%rsp),%rcx
+
+ call MULADD_128x512
+
+ movq 384(%rsp),%rax
+
+
+ addq -16(%rdi),%r8
+ adcq -8(%rdi),%r9
+ movq %r8,64(%rcx)
+ movq %r9,72(%rcx)
+
+ adcq %rax,%rax
+ movq %rax,384(%rsp)
+
+ leaq 192(%rsp),%rdi
+ addq $64,%rsi
+
+
+
+
+
+ movq (%rsi),%r8
+ movq 8(%rsi),%rbx
+
+ movq (%rcx),%rax
+ mulq %r8
+ movq %rax,%rbp
+ movq %rdx,%r9
+
+ movq 8(%rcx),%rax
+ mulq %r8
+ addq %rax,%r9
+
+ movq (%rcx),%rax
+ mulq %rbx
+ addq %rax,%r9
+
+ movq %r9,8(%rdi)
+
+
+ subq $192,%rsi
+
+ movq (%rcx),%r8
+ movq 8(%rcx),%r9
+
+ call MULADD_128x512
+
+
+
+
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdi
+ movq 24(%rsi),%rdx
+
+
+ movq 384(%rsp),%rbp
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+
+
+ adcq %rbp,%rbp
+
+
+
+ shlq $3,%rbp
+ movq 32(%rsp),%rcx
+ addq %rcx,%rbp
+
+
+ xorq %rsi,%rsi
+
+ addq 0(%rbp),%r10
+ adcq 64(%rbp),%r11
+ adcq 128(%rbp),%r12
+ adcq 192(%rbp),%r13
+ adcq 256(%rbp),%r14
+ adcq 320(%rbp),%r15
+ adcq 384(%rbp),%r8
+ adcq 448(%rbp),%r9
+
+
+
+ sbbq $0,%rsi
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+ movq $1,%rbp
+ subq %rax,%r10
+ sbbq %rbx,%r11
+ sbbq %rdi,%r12
+ sbbq %rdx,%r13
+
+
+
+
+ sbbq $0,%rbp
+
+
+
+ addq $512,%rcx
+ movq 32(%rcx),%rax
+ movq 40(%rcx),%rbx
+ movq 48(%rcx),%rdi
+ movq 56(%rcx),%rdx
+
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+
+
+ subq $1,%rbp
+
+ sbbq %rax,%r14
+ sbbq %rbx,%r15
+ sbbq %rdi,%r8
+ sbbq %rdx,%r9
+
+
+
+ movq 144(%rsp),%rsi
+ movq %r10,0(%rsi)
+ movq %r11,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq %r8,48(%rsi)
+ movq %r9,56(%rsi)
+
+ .byte 0xf3,0xc3
+.size mont_reduce,.-mont_reduce
+.type mont_mul_a3b,@function
+.align 16
+mont_mul_a3b:
+
+
+
+
+ movq 0(%rdi),%rbp
+
+ movq %r10,%rax
+ mulq %rbp
+ movq %rax,520(%rsp)
+ movq %rdx,%r10
+ movq %r11,%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r12,%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r13,%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r14,%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r15,%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r8,%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,528(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 16(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,536(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq 24(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,544(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq 32(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,552(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq 40(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %r14,560(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq 48(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,568(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq 56(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,576(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,584(%rsp)
+ movq %r10,592(%rsp)
+ movq %r11,600(%rsp)
+ movq %r12,608(%rsp)
+ movq %r13,616(%rsp)
+ movq %r14,624(%rsp)
+ movq %r15,632(%rsp)
+ movq %r8,640(%rsp)
+
+
+
+
+
+ jmp mont_reduce
+
+
+.size mont_mul_a3b,.-mont_mul_a3b
+.type sqr_reduce,@function
+.align 16
+sqr_reduce:
+ movq 16(%rsp),%rcx
+
+
+
+ movq %r10,%rbx
+
+ movq %r11,%rax
+ mulq %rbx
+ movq %rax,528(%rsp)
+ movq %rdx,%r10
+ movq %r12,%rax
+ mulq %rbx
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r13,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r14,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r15,%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%rsi
+
+ movq %r10,536(%rsp)
+
+
+
+
+
+ movq 8(%rcx),%rbx
+
+ movq 16(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,544(%rsp)
+
+ movq %rdx,%r10
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,552(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 16(%rcx),%rbx
+
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,560(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+ movq %r14,568(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r12
+
+
+
+
+
+ movq 24(%rcx),%rbx
+
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,576(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+ movq %rsi,584(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+
+ movq %rdx,%r15
+
+
+
+
+ movq 32(%rcx),%rbx
+
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,592(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,600(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 40(%rcx),%rbx
+
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,608(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+ movq %r11,616(%rsp)
+
+ movq %rdx,%r12
+
+
+
+
+ movq %r8,%rbx
+
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,624(%rsp)
+
+ movq %rdx,632(%rsp)
+
+
+ movq 528(%rsp),%r10
+ movq 536(%rsp),%r11
+ movq 544(%rsp),%r12
+ movq 552(%rsp),%r13
+ movq 560(%rsp),%r14
+ movq 568(%rsp),%r15
+
+ movq 24(%rcx),%rax
+ mulq %rax
+ movq %rax,%rdi
+ movq %rdx,%r8
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq $0,%r8
+
+ movq 0(%rcx),%rax
+ mulq %rax
+ movq %rax,520(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+ movq %r10,528(%rsp)
+ movq %r11,536(%rsp)
+
+ movq 16(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+
+ movq %r12,544(%rsp)
+ movq %r13,552(%rsp)
+
+ xorq %rbp,%rbp
+ addq %rbx,%r14
+ adcq %rdi,%r15
+ adcq $0,%rbp
+
+ movq %r14,560(%rsp)
+ movq %r15,568(%rsp)
+
+
+
+
+ movq 576(%rsp),%r10
+ movq 584(%rsp),%r11
+ movq 592(%rsp),%r12
+ movq 600(%rsp),%r13
+ movq 608(%rsp),%r14
+ movq 616(%rsp),%r15
+ movq 624(%rsp),%rdi
+ movq 632(%rsp),%rsi
+
+ movq %r9,%rax
+ mulq %rax
+ movq %rax,%r9
+ movq %rdx,%rbx
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq %rdi,%rdi
+ adcq %rsi,%rsi
+ adcq $0,%rbx
+
+ addq %rbp,%r10
+
+ movq 32(%rcx),%rax
+ mulq %rax
+
+ addq %r8,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r10,576(%rsp)
+ movq %r11,584(%rsp)
+
+ movq 40(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r12,592(%rsp)
+ movq %r13,600(%rsp)
+
+ movq 48(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r14
+ adcq %rax,%r15
+ adcq $0,%rdx
+
+ movq %r14,608(%rsp)
+ movq %r15,616(%rsp)
+
+ addq %rdx,%rdi
+ adcq %r9,%rsi
+ adcq $0,%rbx
+
+ movq %rdi,624(%rsp)
+ movq %rsi,632(%rsp)
+ movq %rbx,640(%rsp)
+
+ jmp mont_reduce
+
+
+.size sqr_reduce,.-sqr_reduce
+.globl mod_exp_512
+.type mod_exp_512,@function
+mod_exp_512:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r8
+ subq $2688,%rsp
+ andq $-64,%rsp
+
+
+ movq %r8,0(%rsp)
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rcx,24(%rsp)
+.Lbody:
+
+
+
+ pxor %xmm4,%xmm4
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqa %xmm4,512(%rsp)
+ movdqa %xmm4,528(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,544(%rsp)
+ movdqa %xmm1,560(%rsp)
+ movdqa %xmm2,576(%rsp)
+ movdqa %xmm3,592(%rsp)
+
+
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+
+ leaq 384(%rsp),%rbx
+ movq %rbx,136(%rsp)
+ call mont_reduce
+
+
+ leaq 448(%rsp),%rcx
+ xorq %rax,%rax
+ movq %rax,0(%rcx)
+ movq %rax,8(%rcx)
+ movq %rax,24(%rcx)
+ movq %rax,32(%rcx)
+ movq %rax,40(%rcx)
+ movq %rax,48(%rcx)
+ movq %rax,56(%rcx)
+ movq %rax,128(%rsp)
+ movq $1,16(%rcx)
+
+ leaq 640(%rsp),%rbp
+ movq %rcx,%rsi
+ movq %rbp,%rdi
+ movq $8,%rax
+loop_0:
+ movq (%rcx),%rbx
+ movw %bx,(%rdi)
+ shrq $16,%rbx
+ movw %bx,64(%rdi)
+ shrq $16,%rbx
+ movw %bx,128(%rdi)
+ shrq $16,%rbx
+ movw %bx,192(%rdi)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rdi),%rdi
+ decq %rax
+ jnz loop_0
+ movq $31,%rax
+ movq %rax,32(%rsp)
+ movq %rbp,40(%rsp)
+
+ movq %rsi,136(%rsp)
+ movq 0(%rsi),%r10
+ movq 8(%rsi),%r11
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+ movq 48(%rsi),%r8
+ movq 56(%rsi),%r9
+init_loop:
+ leaq 384(%rsp),%rdi
+ call mont_mul_a3b
+ leaq 448(%rsp),%rsi
+ movq 40(%rsp),%rbp
+ addq $2,%rbp
+ movq %rbp,40(%rsp)
+ movq %rsi,%rcx
+ movq $8,%rax
+loop_1:
+ movq (%rcx),%rbx
+ movw %bx,(%rbp)
+ shrq $16,%rbx
+ movw %bx,64(%rbp)
+ shrq $16,%rbx
+ movw %bx,128(%rbp)
+ shrq $16,%rbx
+ movw %bx,192(%rbp)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rbp),%rbp
+ decq %rax
+ jnz loop_1
+ movq 32(%rsp),%rax
+ subq $1,%rax
+ movq %rax,32(%rsp)
+ jne init_loop
+
+
+
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm1,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm3,112(%rsp)
+
+
+
+
+
+ movl 126(%rsp),%eax
+ movq %rax,%rdx
+ shrq $11,%rax
+ andl $2047,%edx
+ movl %edx,126(%rsp)
+ leaq 640(%rsp,%rax,2),%rsi
+ movq 8(%rsp),%rdx
+ movq $4,%rbp
+loop_2:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_2
+ movq $505,48(%rsp)
+
+ movq 8(%rsp),%rcx
+ movq %rcx,136(%rsp)
+ movq 0(%rcx),%r10
+ movq 8(%rcx),%r11
+ movq 16(%rcx),%r12
+ movq 24(%rcx),%r13
+ movq 32(%rcx),%r14
+ movq 40(%rcx),%r15
+ movq 48(%rcx),%r8
+ movq 56(%rcx),%r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+
+
+ movq 48(%rsp),%rcx
+ movq %rcx,%rax
+ shrq $4,%rax
+ movl 64(%rsp,%rax,2),%edx
+ andq $15,%rcx
+ shrq %cl,%rdx
+ andq $31,%rdx
+
+ leaq 640(%rsp,%rdx,2),%rsi
+ leaq 448(%rsp),%rdx
+ movq %rdx,%rdi
+ movq $4,%rbp
+loop_3:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_3
+ movq 8(%rsp),%rsi
+ call mont_mul_a3b
+
+
+
+ movq 48(%rsp),%rcx
+ subq $5,%rcx
+ movq %rcx,48(%rsp)
+ jge main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+ movq 8(%rsp),%rdx
+ pxor %xmm4,%xmm4
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+ movdqa %xmm4,576(%rsp)
+ movdqa %xmm4,592(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,512(%rsp)
+ movdqa %xmm1,528(%rsp)
+ movdqa %xmm2,544(%rsp)
+ movdqa %xmm3,560(%rsp)
+ call mont_reduce
+
+
+
+ movq 8(%rsp),%rax
+ movq 0(%rax),%r8
+ movq 8(%rax),%r9
+ movq 16(%rax),%r10
+ movq 24(%rax),%r11
+ movq 32(%rax),%r12
+ movq 40(%rax),%r13
+ movq 48(%rax),%r14
+ movq 56(%rax),%r15
+
+
+ movq 24(%rsp),%rbx
+ addq $512,%rbx
+
+ subq 0(%rbx),%r8
+ sbbq 8(%rbx),%r9
+ sbbq 16(%rbx),%r10
+ sbbq 24(%rbx),%r11
+ sbbq 32(%rbx),%r12
+ sbbq 40(%rbx),%r13
+ sbbq 48(%rbx),%r14
+ sbbq 56(%rbx),%r15
+
+
+ movq 0(%rax),%rsi
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rcx
+ movq 24(%rax),%rdx
+ cmovncq %r8,%rsi
+ cmovncq %r9,%rdi
+ cmovncq %r10,%rcx
+ cmovncq %r11,%rdx
+ movq %rsi,0(%rax)
+ movq %rdi,8(%rax)
+ movq %rcx,16(%rax)
+ movq %rdx,24(%rax)
+
+ movq 32(%rax),%rsi
+ movq 40(%rax),%rdi
+ movq 48(%rax),%rcx
+ movq 56(%rax),%rdx
+ cmovncq %r12,%rsi
+ cmovncq %r13,%rdi
+ cmovncq %r14,%rcx
+ cmovncq %r15,%rdx
+ movq %rsi,32(%rax)
+ movq %rdi,40(%rax)
+ movq %rcx,48(%rax)
+ movq %rdx,56(%rax)
+
+ movq 0(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbx
+ movq 40(%rsi),%rbp
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size mod_exp_512, . - mod_exp_512
diff --git a/crypto/bn/asm/modexp512-x86_64.pl b/crypto/bn/asm/modexp512-x86_64.pl
new file mode 100644
index 0000000..bfd6e97
--- /dev/null
+++ b/crypto/bn/asm/modexp512-x86_64.pl
@@ -0,0 +1,1497 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2010-2011 Intel Corp.
+# Author: Vinodh.Gopal@intel.com
+# Jim Guilford
+# Erdinc.Ozturk@intel.com
+# Maxim.Perminov@intel.com
+#
+# More information about the algorithm used can be found at:
+# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
+#
+# ====================================================================
+# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. All advertising materials mentioning features or use of this
+# software must display the following acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+#
+# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+# endorse or promote products derived from this software without
+# prior written permission. For written permission, please contact
+# licensing@OpenSSL.org.
+#
+# 5. Products derived from this software may not be called "OpenSSL"
+# nor may "OpenSSL" appear in their names without prior written
+# permission of the OpenSSL Project.
+#
+# 6. Redistributions of any form whatsoever must retain the following
+# acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+#
+# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ====================================================================
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+use strict;
+my $code=".text\n\n";
+my $m=0;
+
+#
+# Define x512 macros
+#
+
+#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
+#
+# uses rax, rdx, and args
+sub MULSTEP_512_ADD
+{
+ my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov ($ASRC), $X[0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ mov (+8*$i)($ASRC), $X[$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
+
+#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
+#
+# uses rax, rdx, and args
+sub MULSTEP_512
+{
+ my ($x, $DST, $SRC2, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
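+
+# A value-level C model of MULSTEP_512 may help when reading the rows
+# below (hypothetical reference code, not part of the build; assumes
+# stdint.h and the GCC/Clang unsigned __int128 extension):
+#
+#   /* t = x + s2*op (576 bits); limb 0 of t retires to *dst and
+#      x[] becomes t >> 64 (the caller's register rotation included) */
+#   void mulstep_512(uint64_t *dst, const uint64_t s2[8],
+#                    uint64_t op, uint64_t x[8])
+#   {
+#       uint64_t c = 0;
+#       for (int i = 0; i < 8; i++) {
+#           unsigned __int128 t = (unsigned __int128)s2[i]*op + x[i] + c;
+#           if (i == 0) *dst = (uint64_t)t;
+#           else        x[i-1] = (uint64_t)t;
+#           c = (uint64_t)(t >> 64);
+#       }
+#       x[7] = c;               /* final carry limb */
+#   }
+#
+# MULSTEP_512_ADD is the same row with asrc[] in place of x[] as the
+# 512-bit addend.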
+
+#
+# Swizzle Macros
+#
+
+# macro to copy data from flat space to swizzled table
+#MACRO swizzle pDst, pSrc, tmp1, tmp2
+# pDst and pSrc are modified
+sub swizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0)=@_;
+$code.=<<___;
+ mov \$8, $cnt
+loop_$m:
+ mov ($pSrc), $d0
+ mov $d0#w, ($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*1)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*2)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*3)($pDst)
+ lea 8($pSrc), $pSrc
+ lea 64*4($pDst), $pDst
+ dec $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
+
+# macro to copy data from swizzled table to flat space
+#MACRO unswizzle pDst, pSrc, tmp*3
+sub unswizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
+$code.=<<___;
+ mov \$4, $cnt
+loop_$m:
+ movzxw (+64*3+256*0)($pSrc), $d0
+ movzxw (+64*3+256*1)($pSrc), $d1
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*2+256*0)($pSrc), $d0#w
+ mov (+64*2+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*1+256*0)($pSrc), $d0#w
+ mov (+64*1+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*0+256*0)($pSrc), $d0#w
+ mov (+64*0+256*1)($pSrc), $d1#w
+ mov $d0, (+8*0)($pDst)
+ mov $d1, (+8*1)($pDst)
+ lea 256*2($pSrc), $pSrc
+ lea 8*2($pDst), $pDst
+ sub \$1, $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
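+
+# Layout note (a sketch; the strides follow from the code above): entry
+# e of the 32-entry table starts at garray + 2*e, and 16-bit word w of
+# qword q within an entry lives at byte offset 256*q + 64*w, i.e.
+#
+#   /* scatter one 8-qword entry into its 16-bit columns;
+#      dst already points at garray + e (hypothetical model) */
+#   void swizzle_model(uint16_t *dst, const uint64_t src[8])
+#   {
+#       for (int q = 0; q < 8; q++)
+#           for (int w = 0; w < 4; w++)
+#               dst[(q*4 + w) * 32] = (uint16_t)(src[q] >> (16*w));
+#   }
+#
+# Every 64-byte line of the table therefore holds the same 16-bit piece
+# of all 32 entries, so a table lookup touches the same cache lines
+# regardless of the (secret) index -- presumably the point of the
+# scattering.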
+
+#
+# Data Structures
+#
+
+# Reduce Data
+#
+#
+# Offset Value
+# 0C0 Carries
+# 0B8 X2[10]
+# 0B0 X2[9]
+# 0A8 X2[8]
+# 0A0 X2[7]
+# 098 X2[6]
+# 090 X2[5]
+# 088 X2[4]
+# 080 X2[3]
+# 078 X2[2]
+# 070 X2[1]
+# 068 X2[0]
+# 060 X1[12] P[10]
+# 058 X1[11] P[9] Z[8]
+# 050 X1[10] P[8] Z[7]
+# 048 X1[9] P[7] Z[6]
+# 040 X1[8] P[6] Z[5]
+# 038 X1[7] P[5] Z[4]
+# 030 X1[6] P[4] Z[3]
+# 028 X1[5] P[3] Z[2]
+# 020 X1[4] P[2] Z[1]
+# 018 X1[3] P[1] Z[0]
+# 010 X1[2] P[0] Y[2]
+# 008 X1[1] Q[1] Y[1]
+# 000 X1[0] Q[0] Y[0]
+
+my $X1_offset = 0; # 13 qwords
+my $X2_offset = $X1_offset + 13*8; # 11 qwords
+my $Carries_offset = $X2_offset + 11*8; # 1 qword
+my $Q_offset = 0; # 2 qwords
+my $P_offset = $Q_offset + 2*8; # 11 qwords
+my $Y_offset = 0; # 3 qwords
+my $Z_offset = $Y_offset + 3*8; # 9 qwords
+
+my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
+
+#
+# Stack Frame
+#
+#
+# offset value
+# ...
+# ...
+# 280 Garray
+
+# 278 tmp16[15]
+# ... ...
+# 200 tmp16[0]
+
+# 1F8 tmp[7]
+# ... ...
+# 1C0 tmp[0]
+
+# 1B8 GT[7]
+# ... ...
+# 180 GT[0]
+
+# 178 Reduce Data
+# ... ...
+# 0B8 Reduce Data
+# 0B0 reserved
+# 0A8 reserved
+# 0A0 reserved
+# 098 reserved
+# 090 reserved
+# 088 reduce result addr
+# 080 exp[8]
+
+# ...
+# 048 exp[1]
+# 040 exp[0]
+
+# 038 reserved
+# 030 loop_idx
+# 028 pg
+# 020 i
+# 018 pData ; arg 4
+# 010 pG ; arg 2
+# 008 pResult ; arg 1
+# 000 rsp ; stack pointer before subtract
+
+my $rsp_offset = 0;
+my $pResult_offset = 8*1 + $rsp_offset;
+my $pG_offset = 8*1 + $pResult_offset;
+my $pData_offset = 8*1 + $pG_offset;
+my $i_offset = 8*1 + $pData_offset;
+my $pg_offset = 8*1 + $i_offset;
+my $loop_idx_offset = 8*1 + $pg_offset;
+my $reserved1_offset = 8*1 + $loop_idx_offset;
+my $exp_offset = 8*1 + $reserved1_offset;
+my $red_result_addr_offset= 8*9 + $exp_offset;
+my $reserved2_offset = 8*1 + $red_result_addr_offset;
+my $Reduce_Data_offset = 8*5 + $reserved2_offset;
+my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
+my $tmp_offset = 8*8 + $GT_offset;
+my $tmp16_offset = 8*8 + $tmp_offset;
+my $garray_offset = 8*16 + $tmp16_offset;
+my $mem_size = 8*8*32 + $garray_offset;
+
+#
+# Offsets within Reduce Data
+#
+#
+# struct MODF_2FOLD_MONT_512_C1_DATA {
+# UINT64 t[8][8];
+# UINT64 m[8];
+# UINT64 m1[8]; /* 2^768 % m */
+# UINT64 m2[8]; /* 2^640 % m */
+# UINT64 k1[2]; /* (- 1/m) % 2^128 */
+# };
+
+my $T = 0;
+my $M = 512; # = 8 * 8 * 8
+my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
+my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
+my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
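+
+# In other words (a sketch of the intent, inferred from the fields
+# above): m is the modulus, t[][] holds precomputed overflow
+# corrections, and m1, m2, k1 let a 1024-bit value x be reduced by two
+# "folds" plus one 128-bit Montgomery step:
+#
+#   x  = xh*2^768 + xl      # xh: 256 bits
+#   X1 = xh*m1 + xl         # == x (mod m), ~769 bits
+#   X1 = yh*2^640 + yl      # yh: ~129 bits
+#   X2 = yh*m2 + yl         # == x (mod m), ~641 bits
+#   q  = X2*k1 mod 2^128    # so X2 + q*m == 0 (mod 2^128)
+#   r  = (X2 + q*m)/2^128   # == x * 2^-128 (mod m)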
+
+#
+# FUNCTIONS
+#
+
+{{{
+#
+# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
+# and add 512-bits (8 qwords)
+# to get 640 bits (10 qwords)
+# Input: 128-bit mul source: [rdi+8*1], rbp
+# 512-bit mul source: [rsi+8*n]
+# 512-bit add source: r15, r14, ..., r9, r8
+# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
+# Clobbers all regs except: rcx, rsi, rdi
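+#
+# In the value-level terms used for MULSTEP_512 above, this is just two
+# rows back to back (a sketch):
+#
+#   {r9:r8:r15:r14:r13:r12:r11:r10, [rcx+8*1], [rcx+8*0]}
+#       = a(128-bit) * b(512-bit) + x(512-bit)        # 640-bit result
+#
+# with a = { [rdi+8*1], rbp }, b = [rsi] and x = r15...r8 on entry.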
+$code.=<<___;
+.type MULADD_128x512,\@abi-omnipotent
+.align 16
+MULADD_128x512:
+___
+ &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ mov (+8*1)(%rdi), %rbp
+___
+ &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ ret
+.size MULADD_128x512,.-MULADD_128x512
+___
+}}}
+
+{{{
+#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
+#
+# Inputs: pDst: Destination (768 bits, 12 qwords)
+# pA: Multiplicand (1024 bits, 16 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Dst = Ah * B + Al
+# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
+# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
+# Uses registers: arguments, RAX, RDX
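+#
+# Within mont_reduce this implements the first fold: with pA = x (16
+# qwords) and pB = m1 it forms x[15:12]*m1 + x[7:0]; the caller then
+# adds the remaining addend limbs x[11:8], completing
+# X1 = xh*m1 + low768(x).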
+sub MULADD_256x512
+{
+ my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
+$code.=<<___;
+ mov (+8*12)($pA), $OP
+___
+ &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*13)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*14)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*15)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+}
+
+#
+# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
+# UINT64 *m, /* 512 bits, 8 qwords */
+# MODF_2FOLD_MONT_512_C1_DATA *data,
+# UINT64 *r) /* 512 bits, 8 qwords */
+# Input: x (number to be reduced): tmp16 (Implicit)
+# m (modulus): [pM] (Implicit)
+# data (reduce data): [pData] (Implicit)
+# Output: r (result): Address in [red_res_addr]
+# result also in: r9, r8, r15, r14, r13, r12, r11, r10
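+#
+# Reading aid -- the code below follows the fold arithmetic sketched
+# next to the M1/M2/K1 constants above:
+#
+#   X1 = (x >> 768)*m1 + low768(x)      # 1024 -> ~769 bits
+#   X2 = (X1 >> 640)*m2 + low640(X1)    # ~769 -> ~641 bits
+#   Q  = low128(X2)*k1 mod 2^128
+#   R  = (X2 + Q*m) >> 128              # == x * 2^-128 (mod m)
+#   R += t[carry pattern]               # then a carry-masked subtract
+#                                       # of m normalizes the result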
+
+my @X=map("%r$_",(8..15));
+
+$code.=<<___;
+.type mont_reduce,\@abi-omnipotent
+.align 16
+mont_reduce:
+___
+
+my $STACK_DEPTH = 8;
+ #
+ # X1 = Xh * M1 + Xl
+$code.=<<___;
+ lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
+ add \$$M1, %rsi
+ lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
+
+___
+
+ &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
+ # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
+
+$code.=<<___;
+ xor %rax, %rax
+ # X1 += xl
+ add (+8*8)(%rcx), $X[4]
+ adc (+8*9)(%rcx), $X[5]
+ adc (+8*10)(%rcx), $X[6]
+ adc (+8*11)(%rcx), $X[7]
+ adc \$0, %rax
+ # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
+
+ #
+ # check for carry ;; carry stored in rax
+ mov $X[4], (+8*8)(%rdi) # rdi points to X1
+ mov $X[5], (+8*9)(%rdi)
+ mov $X[6], %rbp
+ mov $X[7], (+8*11)(%rdi)
+
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ mov (+8*0)(%rdi), $X[4]
+ mov (+8*1)(%rdi), $X[5]
+ mov (+8*2)(%rdi), $X[6]
+ mov (+8*3)(%rdi), $X[7]
+
+ # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
+ # rdi -> X1
+ # rsi -> M1
+
+ #
+ # X2 = Xh * M2 + Xl
+ # do first part (X2 = Xh * M2)
+ add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
+ # Xh is actually { [rdi+8*1], rbp }
+ add \$`$M2-$M1`, %rsi # rsi -> M2
+ lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
+___
+ unshift(@X,pop(@X)); unshift(@X,pop(@X));
+$code.=<<___;
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
+
+ # X2 += Xl
+ add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl
+ adc (+8*9-8*10)(%rdi), $X[7]
+ mov $X[6], (+8*8)(%rcx)
+ mov $X[7], (+8*9)(%rcx)
+
+ adc %rax, %rax
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
+ add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
+
+ # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
+ # B1:B0 = rsi[1:0] = K1[1:0]
+ # A1:A0 = rcx[1:0] = X2[1:0]
+ # Result = rdi[1],rbp = Q[1],rbp
+ mov (%rsi), %r8 # B0
+ mov (+8*1)(%rsi), %rbx # B1
+
+ mov (%rcx), %rax # A0
+ mul %r8 # B0
+ mov %rax, %rbp
+ mov %rdx, %r9
+
+ mov (+8*1)(%rcx), %rax # A1
+ mul %r8 # B0
+ add %rax, %r9
+
+ mov (%rcx), %rax # A0
+ mul %rbx # B1
+ add %rax, %r9
+
+ mov %r9, (+8*1)(%rdi)
+ # end MUL_128x128t128
+
+ sub \$`$K1-$M`, %rsi
+
+ mov (%rcx), $X[6]
+ mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+
+ # load first half of m to rdx, rdi, rbx, rax
+ # moved this here for efficiency
+ mov (+8*0)(%rsi), %rax
+ mov (+8*1)(%rsi), %rbx
+ mov (+8*2)(%rsi), %rdi
+ mov (+8*3)(%rsi), %rdx
+
+ # continue with reduction
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
+
+ add (+8*8)(%rcx), $X[6]
+ adc (+8*9)(%rcx), $X[7]
+
+ #accumulate the final carry to rbp
+ adc %rbp, %rbp
+
+ # Add in overflow corrections: R = (X2>>128) += T[overflow]
+ # R = {r9, r8, r15, r14, ..., r10}
+ shl \$3, %rbp
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> Data (and points to T)
+ add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
+
+ # rsi will be used to generate a mask after the addition
+ xor %rsi, %rsi
+
+ add (+8*8*0)(%rbp), $X[0]
+ adc (+8*8*1)(%rbp), $X[1]
+ adc (+8*8*2)(%rbp), $X[2]
+ adc (+8*8*3)(%rbp), $X[3]
+ adc (+8*8*4)(%rbp), $X[4]
+ adc (+8*8*5)(%rbp), $X[5]
+ adc (+8*8*6)(%rbp), $X[6]
+ adc (+8*8*7)(%rbp), $X[7]
+
+ # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
+ # if carry is clear: rsi = 0x0000000000000000
+ sbb \$0, %rsi
+
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ mov \$1, %rbp
+ sub %rax, $X[0]
+ sbb %rbx, $X[1]
+ sbb %rdi, $X[2]
+ sbb %rdx, $X[3]
+
+ # if there is a borrow: rbp = 0
+ # if there is no borrow: rbp = 1
+ # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
+ sbb \$0, %rbp
+
+ #load second half of m to rdx, rdi, rbx, rax
+
+ add \$$M, %rcx
+ mov (+8*4)(%rcx), %rax
+ mov (+8*5)(%rcx), %rbx
+ mov (+8*6)(%rcx), %rdi
+ mov (+8*7)(%rcx), %rdx
+
+ # use the rsi mask as before
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ # if rbp = 0, there was a borrow before, it is moved to the carry flag
+ # if rbp = 1, there was not a borrow before, carry flag is cleared
+ sub \$1, %rbp
+
+ sbb %rax, $X[4]
+ sbb %rbx, $X[5]
+ sbb %rdi, $X[6]
+ sbb %rdx, $X[7]
+
+ # write R back to memory
+
+ mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
+ mov $X[0], (+8*0)(%rsi)
+ mov $X[1], (+8*1)(%rsi)
+ mov $X[2], (+8*2)(%rsi)
+ mov $X[3], (+8*3)(%rsi)
+ mov $X[4], (+8*4)(%rsi)
+ mov $X[5], (+8*5)(%rsi)
+ mov $X[6], (+8*6)(%rsi)
+ mov $X[7], (+8*7)(%rsi)
+
+ ret
+.size mont_reduce,.-mont_reduce
+___
+}}}
+
+{{{
+#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
+#
+# Inputs: pDst: Destination (1024 bits, 16 qwords)
+# pA: Multiplicand (512 bits, 8 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Uses registers rax, rdx, args
+# B operand in [pB] and also in x7...x0
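+#
+# Value-level model (hypothetical reference code; plain schoolbook,
+# assumes stdint.h/string.h and unsigned __int128):
+#
+#   void mul_512x512(uint64_t dst[16], const uint64_t a[8],
+#                    const uint64_t b[8])
+#   {
+#       memset(dst, 0, 16*sizeof(uint64_t));
+#       for (int i = 0; i < 8; i++) {
+#           uint64_t c = 0;
+#           for (int j = 0; j < 8; j++) {
+#               unsigned __int128 t =
+#                   (unsigned __int128)a[i]*b[j] + dst[i+j] + c;
+#               dst[i+j] = (uint64_t)t;
+#               c = (uint64_t)(t >> 64);
+#           }
+#           dst[i+8] = c;
+#       }
+#   }
+#
+# b[] arrives pre-loaded in the x7...x0 registers and is consumed by the
+# first row; from then on those registers carry the running sum while
+# b[] is re-read from [pB], one low limb of dst retiring per row.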
+sub MUL_512x512
+{
+ my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+
+$code.=<<___;
+ mov (+8*0)($pA), $OP
+
+ mov $X[0], %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $X[0]
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov $X[$i], %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i-1]
+ adc \$0, %rdx
+ mov %rdx, $X[$i]
+___
+}
+
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov (+8*$i)($pA), $OP
+___
+
+ &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
+ push(@X,shift(@X));
+}
+
+$code.=<<___;
+ mov $X[0], (+$pDst_o+8*8)($pDst)
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+ mov $X[3], (+$pDst_o+8*11)($pDst)
+ mov $X[4], (+$pDst_o+8*12)($pDst)
+ mov $X[5], (+$pDst_o+8*13)($pDst)
+ mov $X[6], (+$pDst_o+8*14)($pDst)
+ mov $X[7], (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
+# Input: src1: Address of source 1: rdi
+# src2: Address of source 2: rsi
+# Output: dst: Address of destination: [red_res_addr]
+# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+# Temp: Clobbers [tmp16], all registers
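+#
+# Given mont_reduce's x * 2^-128 (mod m) semantics, values kept in the
+# Montgomery domain v' = v*2^128 mod m are closed under this routine:
+#
+#   mont_mul_a3b(a', b') = a'*b' * 2^-128 = (a*b) * 2^128 = (a*b)'
+#
+# which is why mod_exp_512 below first computes GT = reduce(g * 2^256)
+# = g' and strips the factor off again at the very end.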
+$code.=<<___;
+.type mont_mul_a3b,\@abi-omnipotent
+.align 16
+mont_mul_a3b:
+ #
+ # multiply tmp = src1 * src2
+ # For multiply: dst = rcx, src1 = rdi, src2 = rsi
+ # stack depth is extra 8 from call
+___
+ &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
+$code.=<<___;
+ #
+ # Dst = tmp % m
+ # Call reduce(tmp, m, data, dst)
+
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size mont_mul_a3b,.-mont_mul_a3b
+___
+}}}
+
+{{{
+#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
+#
+# Input in memory [pA] and also in x7...x0
+# Uses all argument registers plus rax and rdx
+#
+# This version computes all of the off-diagonal terms into memory,
+# and then it adds in the diagonal terms
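+#
+# i.e. with a = sum_i a_i * 2^(64*i):
+#
+#   a^2 = sum_i a_i^2 * 2^(128*i)
+#       + 2 * sum_{i<j} a_i*a_j * 2^(64*(i+j))
+#
+# so only 28 off-diagonal + 8 diagonal = 36 multiplications are needed
+# instead of the 64 of MUL_512x512. A rough value model (hypothetical;
+# add_at() adds a 128-bit value at limb k with carry propagation,
+# shl1() doubles an n-limb number):
+#
+#   void sqr_512(uint64_t d[16], const uint64_t a[8])
+#   {
+#       memset(d, 0, 16*sizeof(uint64_t));
+#       for (int i = 0; i < 8; i++)         /* off-diagonal products */
+#           for (int j = i+1; j < 8; j++)
+#               add_at(d, i+j, (unsigned __int128)a[i]*a[j]);
+#       shl1(d, 16);                        /* double them */
+#       for (int i = 0; i < 8; i++)         /* diagonal squares */
+#           add_at(d, 2*i, (unsigned __int128)a[i]*a[i]);
+#   }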
+
+sub SQR_512
+{
+ my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+$code.=<<___;
+ # ------------------
+ # first pass 01...07
+ # ------------------
+ mov $X[0], $A
+
+ mov $X[1],%rax
+ mul $A
+ mov %rax, (+$pDst_o+8*1)($pDst)
+___
+for(my $i=2;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $X[$i-2]
+ mov $X[$i],%rax
+ mul $A
+ add %rax, $X[$i-2]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $x7
+
+ mov $X[0], (+$pDst_o+8*2)($pDst)
+
+ # ------------------
+ # second pass 12...17
+ # ------------------
+
+ mov (+8*1)($pA), $A
+
+ mov (+8*2)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*3)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*4)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ add $X[0], $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # third pass 23...27
+ # ------------------
+ mov (+8*2)($pA), $A
+
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ mov $X[3], (+$pDst_o+8*5)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+ mov $X[4], (+$pDst_o+8*6)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # fourth pass 34...37
+ # ------------------
+
+ mov (+8*3)($pA), $A
+
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*7)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+ mov $x7, (+$pDst_o+8*8)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+
+ mov %rdx, $X[5]
+
+ # ------------------
+ # fifth pass 45...47
+ # ------------------
+ mov (+8*4)($pA), $A
+
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # sixth pass 56...57
+ # ------------------
+ mov (+8*5)($pA), $A
+
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*11)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*12)($pDst)
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # seventh pass 67
+ # ------------------
+ mov $X[6], $A
+
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*13)($pDst)
+
+ mov %rdx, (+$pDst_o+8*14)($pDst)
+
+ # start finalize (add in squares, and double off-terms)
+ mov (+$pDst_o+8*1)($pDst), $X[0]
+ mov (+$pDst_o+8*2)($pDst), $X[1]
+ mov (+$pDst_o+8*3)($pDst), $X[2]
+ mov (+$pDst_o+8*4)($pDst), $X[3]
+ mov (+$pDst_o+8*5)($pDst), $X[4]
+ mov (+$pDst_o+8*6)($pDst), $X[5]
+
+ mov (+8*3)($pA), %rax
+ mul %rax
+ mov %rax, $x6
+ mov %rdx, $X[6]
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc \$0, $X[6]
+
+ mov (+8*0)($pA), %rax
+ mul %rax
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $A
+
+ mov (+8*1)($pA), %rax
+ mul %rax
+
+ add $A, $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+ mov $X[0], (+$pDst_o+8*1)($pDst)
+ mov $X[1], (+$pDst_o+8*2)($pDst)
+
+ mov (+8*2)($pA), %rax
+ mul %rax
+
+ add $A, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+
+ mov $X[2], (+$pDst_o+8*3)($pDst)
+ mov $X[3], (+$pDst_o+8*4)($pDst)
+
+ xor $tmp, $tmp
+ add $A, $X[4]
+ adc $x6, $X[5]
+ adc \$0, $tmp
+
+ mov $X[4], (+$pDst_o+8*5)($pDst)
+ mov $X[5], (+$pDst_o+8*6)($pDst)
+
+ # %%tmp has 0/1 in column 7
+ # %%A6 has a full value in column 7
+
+ mov (+$pDst_o+8*7)($pDst), $X[0]
+ mov (+$pDst_o+8*8)($pDst), $X[1]
+ mov (+$pDst_o+8*9)($pDst), $X[2]
+ mov (+$pDst_o+8*10)($pDst), $X[3]
+ mov (+$pDst_o+8*11)($pDst), $X[4]
+ mov (+$pDst_o+8*12)($pDst), $X[5]
+ mov (+$pDst_o+8*13)($pDst), $x6
+ mov (+$pDst_o+8*14)($pDst), $x7
+
+ mov $X[7], %rax
+ mul %rax
+ mov %rax, $X[7]
+ mov %rdx, $A
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc $x6, $x6
+ adc $x7, $x7
+ adc \$0, $A
+
+ add $tmp, $X[0]
+
+ mov (+8*4)($pA), %rax
+ mul %rax
+
+ add $X[6], $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[0], (+$pDst_o+8*7)($pDst)
+ mov $X[1], (+$pDst_o+8*8)($pDst)
+
+ mov (+8*5)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[2], (+$pDst_o+8*9)($pDst)
+ mov $X[3], (+$pDst_o+8*10)($pDst)
+
+ mov (+8*6)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[4]
+ adc %rax, $X[5]
+ adc \$0, %rdx
+
+ mov $X[4], (+$pDst_o+8*11)($pDst)
+ mov $X[5], (+$pDst_o+8*12)($pDst)
+
+ add %rdx, $x6
+ adc $X[7], $x7
+ adc \$0, $A
+
+ mov $x6, (+$pDst_o+8*13)($pDst)
+ mov $x7, (+$pDst_o+8*14)($pDst)
+ mov $A, (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
+#
+# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+#
+$code.=<<___;
+.type sqr_reduce,\@abi-omnipotent
+.align 16
+sqr_reduce:
+ mov (+$pResult_offset+8)(%rsp), %rcx
+___
+ &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
+$code.=<<___;
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size sqr_reduce,.-sqr_reduce
+___
+}}}
+
+#
+# MAIN FUNCTION
+#
+
+#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
+# UINT64 *g, /* 512 bits, 8 qwords */
+# UINT64 *exp, /* 512 bits, 8 qwords */
+# struct mod_ctx_512 *data)
+
+# window size = 5
+# table size = 2^5 = 32
+#table_entries equ 32
+#table_size equ table_entries * 8
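+#
+# Overall flow: a left-to-right fixed-window exponentiation (a sketch;
+# mm() stands for mont_mul_a3b and all values live in the Montgomery
+# domain):
+#
+#   t[0] = 1'; for (i = 1; i < 32; i++) t[i] = mm(t[i-1], g');
+#   r = t[exp >> 507];                      # top 5 bits of exp
+#   for (bit = 505; bit >= 0; bit -= 5) {
+#       square r (sqr_reduce) five times;   # only twice on the first
+#                                           # pass, entered at sqr_2
+#       r = mm(r, t[(exp >> bit) & 31]);
+#   }
+#   r = mont_reduce(r);                     # leave Montgomery domain
+#
+# (512 = 5 + 2 + 101*5: the window just below the top one is only two
+# bits wide; the already-consumed top bits are masked out of the stored
+# exponent, so the overlapping 5-bit read is harmless.)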
+$code.=<<___;
+.globl mod_exp_512
+.type mod_exp_512,\@function,4
+mod_exp_512:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # adjust stack down and then align it with cache boundary
+ mov %rsp, %r8
+ sub \$$mem_size, %rsp
+ and \$-64, %rsp
+
+ # store previous stack pointer and arguments
+ mov %r8, (+$rsp_offset)(%rsp)
+ mov %rdi, (+$pResult_offset)(%rsp)
+ mov %rsi, (+$pG_offset)(%rsp)
+ mov %rcx, (+$pData_offset)(%rsp)
+.Lbody:
+ # transform g into montgomery space
+ # GT = reduce(g * C2) = reduce(g * (2^256))
+ # reduce expects to have the input in [tmp16]
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rsi), %xmm0
+ movdqu (+16*1)(%rsi), %xmm1
+ movdqu (+16*2)(%rsi), %xmm2
+ movdqu (+16*3)(%rsi), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
+
+ # load pExp before rdx gets blown away
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+
+ lea (+$GT_offset)(%rsp), %rbx
+ mov %rbx, (+$red_result_addr_offset)(%rsp)
+ call mont_reduce
+
+ # Initialize tmp = C
+ lea (+$tmp_offset)(%rsp), %rcx
+ xor %rax, %rax
+ mov %rax, (+8*0)(%rcx)
+ mov %rax, (+8*1)(%rcx)
+ mov %rax, (+8*3)(%rcx)
+ mov %rax, (+8*4)(%rcx)
+ mov %rax, (+8*5)(%rcx)
+ mov %rax, (+8*6)(%rcx)
+ mov %rax, (+8*7)(%rcx)
+ mov %rax, (+$exp_offset+8*8)(%rsp)
+ movq \$1, (+8*2)(%rcx)
+
+ lea (+$garray_offset)(%rsp), %rbp
+ mov %rcx, %rsi # pTmp
+ mov %rbp, %rdi # Garray[][0]
+___
+
+ &swizzle("%rdi", "%rcx", "%rax", "%rbx");
+
+ # for (rax = 31; rax != 0; rax--) {
+ # tmp = reduce(tmp * G)
+ # swizzle(pg, tmp);
+ # pg += 2; }
+$code.=<<___;
+ mov \$31, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ mov %rbp, (+$pg_offset)(%rsp)
+ # rsi -> pTmp
+ mov %rsi, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rsi), %r10
+ mov (+8*1)(%rsi), %r11
+ mov (+8*2)(%rsi), %r12
+ mov (+8*3)(%rsi), %r13
+ mov (+8*4)(%rsi), %r14
+ mov (+8*5)(%rsi), %r15
+ mov (+8*6)(%rsi), %r8
+ mov (+8*7)(%rsi), %r9
+init_loop:
+ lea (+$GT_offset)(%rsp), %rdi
+ call mont_mul_a3b
+ lea (+$tmp_offset)(%rsp), %rsi
+ mov (+$pg_offset)(%rsp), %rbp
+ add \$2, %rbp
+ mov %rbp, (+$pg_offset)(%rsp)
+ mov %rsi, %rcx # rcx = rsi = addr of tmp
+___
+
+ &swizzle("%rbp", "%rcx", "%rax", "%rbx");
+$code.=<<___;
+ mov (+$i_offset)(%rsp), %rax
+ sub \$1, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ jne init_loop
+
+ #
+ # Copy exponent onto stack
+ movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
+
+
+ #
+ # Do exponentiation
+ # Initialize result to G[exp{511:507}]
+ mov (+$exp_offset+62)(%rsp), %eax
+ mov %rax, %rdx
+ shr \$11, %rax
+ and \$0x07FF, %edx
+ mov %edx, (+$exp_offset+62)(%rsp)
+ lea (+$garray_offset)(%rsp,%rax,2), %rsi
+ mov (+$pResult_offset)(%rsp), %rdx
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+
+ #
+ # Loop variables
+ # rcx = [loop_idx] = index: 510-5 to 0 by 5
+$code.=<<___;
+ movq \$505, (+$loop_idx_offset)(%rsp)
+
+ mov (+$pResult_offset)(%rsp), %rcx
+ mov %rcx, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rcx), %r10
+ mov (+8*1)(%rcx), %r11
+ mov (+8*2)(%rcx), %r12
+ mov (+8*3)(%rcx), %r13
+ mov (+8*4)(%rcx), %r14
+ mov (+8*5)(%rcx), %r15
+ mov (+8*6)(%rcx), %r8
+ mov (+8*7)(%rcx), %r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+ #
+ # Do multiply, first look up proper value in Garray
+ mov (+$loop_idx_offset)(%rsp), %rcx # bit index
+ mov %rcx, %rax
+ shr \$4, %rax # rax is word pointer
+ mov (+$exp_offset)(%rsp,%rax,2), %edx
+ and \$15, %rcx
+ shrq %cl, %rdx
+ and \$0x1F, %rdx
+
+ lea (+$garray_offset)(%rsp,%rdx,2), %rsi
+ lea (+$tmp_offset)(%rsp), %rdx
+ mov %rdx, %rdi
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+ # rdi = tmp = pG
+
+ #
+ # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData)
+ # result result pG M Data
+$code.=<<___;
+ mov (+$pResult_offset)(%rsp), %rsi
+ call mont_mul_a3b
+
+ #
+ # finish loop
+ mov (+$loop_idx_offset)(%rsp), %rcx
+ sub \$5, %rcx
+ mov %rcx, (+$loop_idx_offset)(%rsp)
+ jge main_loop_a3b
+
+ #
+
+end_main_loop_a3b:
+ # transform result out of Montgomery space
+ # result = reduce(result)
+ mov (+$pResult_offset)(%rsp), %rdx
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
+ call mont_reduce
+
+ # If result > m, subtract m
+ # load result into r15:r8
+ mov (+$pResult_offset)(%rsp), %rax
+ mov (+8*0)(%rax), %r8
+ mov (+8*1)(%rax), %r9
+ mov (+8*2)(%rax), %r10
+ mov (+8*3)(%rax), %r11
+ mov (+8*4)(%rax), %r12
+ mov (+8*5)(%rax), %r13
+ mov (+8*6)(%rax), %r14
+ mov (+8*7)(%rax), %r15
+
+ # subtract m
+ mov (+$pData_offset)(%rsp), %rbx
+ add \$$M, %rbx
+
+ sub (+8*0)(%rbx), %r8
+ sbb (+8*1)(%rbx), %r9
+ sbb (+8*2)(%rbx), %r10
+ sbb (+8*3)(%rbx), %r11
+ sbb (+8*4)(%rbx), %r12
+ sbb (+8*5)(%rbx), %r13
+ sbb (+8*6)(%rbx), %r14
+ sbb (+8*7)(%rbx), %r15
+
+ # if Carry is clear, replace result with difference
+ mov (+8*0)(%rax), %rsi
+ mov (+8*1)(%rax), %rdi
+ mov (+8*2)(%rax), %rcx
+ mov (+8*3)(%rax), %rdx
+ cmovnc %r8, %rsi
+ cmovnc %r9, %rdi
+ cmovnc %r10, %rcx
+ cmovnc %r11, %rdx
+ mov %rsi, (+8*0)(%rax)
+ mov %rdi, (+8*1)(%rax)
+ mov %rcx, (+8*2)(%rax)
+ mov %rdx, (+8*3)(%rax)
+
+ mov (+8*4)(%rax), %rsi
+ mov (+8*5)(%rax), %rdi
+ mov (+8*6)(%rax), %rcx
+ mov (+8*7)(%rax), %rdx
+ cmovnc %r12, %rsi
+ cmovnc %r13, %rdi
+ cmovnc %r14, %rcx
+ cmovnc %r15, %rdx
+ mov %rsi, (+8*4)(%rax)
+ mov %rdi, (+8*5)(%rax)
+ mov %rcx, (+8*6)(%rax)
+ mov %rdx, (+8*7)(%rax)
+
+ mov (+$rsp_offset)(%rsp), %rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbx
+ mov 40(%rsi),%rbp
+ lea 48(%rsi),%rsp
+.Lepilogue:
+ ret
+.size mod_exp_512, . - mod_exp_512
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mod_exp_512_se_handler,\@abi-omnipotent
+.align 16
+mod_exp_512_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody(%rip),%r10
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov $rsp_offset(%rax),%rax # pull saved Rsp
+
+ mov 32(%rax),%rbx
+ mov 40(%rax),%rbp
+ mov 24(%rax),%r12
+ mov 16(%rax),%r13
+ mov 8(%rax),%r14
+ mov 0(%rax),%r15
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mod_exp_512_se_handler,.-mod_exp_512_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_mod_exp_512
+ .rva .LSEH_end_mod_exp_512
+ .rva .LSEH_info_mod_exp_512
+
+.section .xdata
+.align 8
+.LSEH_info_mod_exp_512:
+ .byte 9,0,0,0
+ .rva mod_exp_512_se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/(\(\+[^)]+\))/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 0000000..4a766a8
--- /dev/null
+++ b/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,993 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~50-90% better (less for longer
+# keys) than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
+# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplications [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path provides virtually the same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. The latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# a couple of limitations: vector lengths have to be even and vector
+# addresses have to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
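+#
+# To make the half-picking concrete (a sketch): a 64-bit BN_ULONG
+# a = ah*2^32 + al is treated as the two 32-bit digits { al, ah }, so an
+# n-element 64-bit vector becomes a 2n-element 32-bit vector whose
+# products are all formed with xmpyu (32x32=64 bits); the halves of each
+# word are consumed in flipped order to preserve digit significance on
+# the big-endian PA-RISC.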
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $BN_SZ =$SIZE_T;
+} else {
+ $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $BN_SZ =$SIZE_T;
+ if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+ $BN_SZ=8;
+ $LEVEL="2.0";
+ last;
+ }
+ }
+ close CONF;
+ }
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
+ # [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32; # local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22"; # passed through stack in 32-bit
+$num="%r21"; # passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4"; $fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
+$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+bn_mul_mont
+ .PROC
+ .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ ldo -$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldw `-$FRAME_MARKER-4`($fp),$n0
+ ldw `-$FRAME_MARKER-8`($fp),$num
+ nop
+ nop ; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+ comiclr,<= 6,$num,%r0 ; are vectors long enough?
+ b L\$abort
+ ldi 0,%r28 ; signal "unhandled"
+ add,ev %r0,$num,$num ; is $num even?
+ b L\$abort
+ nop
+ or $ap,$np,$ti1
+ extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
+ b L\$abort
+ nop
+ nop ; alignment
+ nop
+
+ fldws 0($n0),${fn0}
+ fldws,ma 4($bp),${fbi} ; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+ comib,> 3,$num,L\$abort ; are vectors long enough?
+ ldi 0,%r28 ; signal "unhandled"
+ addl $num,$num,$num ; I operate on 32-bit values
+
+ fldws 4($n0),${fn0} ; only low part of n0
+ fldws 4($bp),${fbi} ; bp[0] in flipped word order
+___
+$code.=<<___;
+ fldds 0($ap),${fai} ; ap[0,1]
+ fldds 0($np),${fni} ; np[0,1]
+
+ sh2addl $num,%r0,$arrsz
+ ldi 31,$hi0
+ ldo 36($arrsz),$hi1 ; space for tp[num+1]
+ andcm $hi1,$hi0,$hi1 ; align
+ addl $hi1,%sp,%sp
+ $PUSH $fp,-$SIZE_T(%sp)
+
+ ldo `$LOCALS+16`($fp),$xfer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ addl $arrsz,$ap,$ap ; point at the end
+ addl $arrsz,$np,$np
+ subi 0,$arrsz,$idx ; j=0
+ ldo 8($idx),$idx ; j++++
+
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ fstds ${fab1},0($xfer)
+ fstds ${fnm1},8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+ mtctl $hi0,%cr11 ; $hi0 still holds 31
+ extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
+ b L\$parisc11
+ nop
+___
+$code.=<<___; # PA-RISC 2.0 code-path
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $ab0,63,32,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ ldo 8($idx),$idx ; j++++
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ extrd,u $nm0,31,32,$hi1
+
+L\$1st
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ stw $nm1,-4($tp) ; tp[j-1]
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ addl $ab1,$nm1,$nm1
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+ fldws 0($bp),${fbi} ; bp[1] in flipped word order
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ addl $hi1,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2]
+ flddx $idx($np),${fni} ; np[2]
+ ldo 8($idx),$idx ; j++++
+ ldd -16($xfer),$ab0 ; 33-bit value
+ ldd -8($xfer),$nm0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ extrd,u $ab0,31,32,$ti0 ; carry bit
+ extrd,u $ab0,63,32,$ab0
+ fstds ${fab1},0($xfer)
+ addl $ti0,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ extrd,u $nm0,31,32,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+L\$inner
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldw 4($tp),$ti0 ; tp[j]
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ti0,$ti0
+ addl $ti0,$ab0,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $nm1,31,32,$hi1
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldw 4($tp),$ti0 ; tp[j]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ addl $ti0,$ab0,$ab0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,31,32,$hi0
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldi 12,$ti0 ; bp[i] in flipped word order
+ addl,ev %r0,$num,$num
+ ldi -4,$ti0
+ addl $ti0,$bp,$bp
+ fldws 0($bp),${fbi}
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0]
+ addl $hi0,$ab1,$ab1
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ addl $hi1,$hi0,$hi0
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+ addl $hi0,$ab1,$ab1
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ addl $hi1,$hi0,$hi0
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+ ldws,ma 4($tp),$ti0
+ extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
+ b L\$sub_pa11
+ addl $tp,$arrsz,$tp
+L\$sub
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldd,ma 8($tp),$ti0
+L\$sub
+ ldd $idx($np),$hi0
+ shrpd $ti0,$ti0,32,$ti0 ; flip word order
+ std $ti0,-8($tp) ; save flipped value
+ sub,db $ti0,$hi0,$hi1
+ ldd,ma 8($tp),$ti0
+ addib,<> 8,$idx,L\$sub
+ std,ma $hi1,8($rp)
+
+ extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
+ sub,db $ti0,%r0,$hi1
+ ldo -8($tp),$tp
+___
+$code.=<<___;
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy
+ ldd $idx($np),$hi0
+ std,ma %r0,8($tp)
+ addib,<> 8,$idx,.-8 ; L\$copy
+ std,ma $hi0,8($rp)
+___
+
+if ($BN_SZ==4) { # PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 8
+L\$parisc11
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -12($xfer),$ablo
+ ldw -16($xfer),$hi0
+ ldw -4($xfer),$nmlo0
+ ldw -8($xfer),$nmhi0
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+ ldo 8($idx),$idx ; j++++
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ addc %r0,$nmhi0,$hi1
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+ nop
+
+L\$1st_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 12($xfer),$nmlo1
+ addc %r0,$abhi,$hi0
+ ldw 8($xfer),$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fab1},0($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ fstds ${fnm1},8($xfer)
+ add $hi1,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$hi1
+ ldw -16($xfer),$abhi
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ ldw -4($xfer),$nmlo0
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -8($xfer),$nmhi0
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ fstds ${fab0},-16($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ fstds ${fnm0},-8($xfer)
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ ldw 8($xfer),$nmhi1
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ add $hi0,$ablo,$ablo
+ fstds ${fab1},0($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -8($xfer),$nmhi0
+ addc %r0,$nmhi1,$hi1
+ ldw -4($xfer),$nmlo0
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[1]
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+ ldw -16($xfer),$abhi ; carry bit actually
+ ldo 8($idx),$idx ; j++++
+ ldw -12($xfer),$ablo
+ ldw -8($xfer),$nmhi0
+ ldw -4($xfer),$nmlo0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ fstds ${fab1},0($xfer)
+ addl $abhi,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ addc %r0,$nmhi0,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+
+L\$inner_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ ldw 12($xfer),$nmlo1
+ add $ti1,$ablo,$ablo
+ ldw 8($xfer),$nmhi1
+ addc %r0,$abhi,$hi0
+ fstds ${fab1},0($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fnm1},8($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ ldw 8($tp),$ti1 ; tp[j]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -4($xfer),$nmlo0
+ add $hi0,$ablo,$ablo
+ ldw -8($xfer),$nmhi0
+ addc %r0,$abhi,$abhi
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ add $ti0,$ablo,$ablo
+ fstds ${fab0},-16($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm0},-8($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldw 8($xfer),$nmhi1
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ fstds ${fab1},0($xfer)
+ add $ti1,$ablo,$ablo
+ fstds ${fnm1},8($xfer)
+ addc %r0,$abhi,$hi0
+ ldw -16($xfer),$abhi
+ add $ablo,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$nmhi1
+ ldw -8($xfer),$nmhi0
+ add $hi1,$nmlo1,$nmlo1
+ ldw -4($xfer),$nmlo0
+ addc %r0,$nmhi1,$hi1
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$abhi
+ add $ti0,$ablo,$ablo
+ ldw 8($tp),$ti1 ; tp[j]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone_pa11; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[i]
+ flddx $idx($ap),${fai} ; ap[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer_pa11
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32+4`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+ ldw -4($tp),$ti0
+ addl $tp,$arrsz,$tp
+L\$sub_pa11
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub_pa11
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy_pa11
+ ldwx $idx($np),$hi0
+ stws,ma %r0,4($tp)
+ addib,<> 4,$idx,L\$copy_pa11
+ stws,ma $hi0,4($rp)
+
+ nop ; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+ ldi 1,%r28 ; signal "handled"
+ ldo $FRAME($fp),%sp ; destroy tp[num+1]
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode the PA-RISC 2.0 instructions used in this module,
+# so that it can be compiled with .LEVEL 1.0. It should be noted that
+# I wouldn't have to do this if the GNU assembler understood the
+# .ALLOW 2.0 directive...
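+#
+# For example, with $xfer=%r22 and $ab0=%r4, the format-5 branch of
+# the $ldd handler below turns "ldd -16($xfer),$ab0" into
+#
+#	.WORD	0x0ec110c4	; ldd	-16(%r22),%r4
+#
+# i.e. 0x03<<26 | 22<<21 | 1<<12 | 3<<6 | 4, with the signed 5-bit
+# offset -16 folded in by the two masked shifts; the value follows
+# from the expressions below and is shown for illustration only.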
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
+ { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+ $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+ # I only have the ",u" completer; it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $sub = sub {
+ my ($mod,$args) = @_;
+ my $orig = "sub$mod\t$args";
+
+ if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+ my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+ $opcode|=(1<<10); # e1
+ $opcode|=(1<<8); # e2
+ $opcode|=(1<<5); # d
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ # flip word order in 64-bit mode...
+ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+ # assemble 2.0 instructions in 32-bit mode...
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index 7849eae..f9b6992 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -31,7 +31,6 @@
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*16;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
@@ -51,7 +50,6 @@
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*16;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
@@ -69,6 +67,9 @@
$POP= $LD;
} else { die "nonsense $flavour"; }
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@
$nj="r11";
$tj="r12";
# non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
#
$nhi="r0";
@@ -108,42 +109,48 @@
.machine "any"
.text
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
.align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
- addi $ovf,$num,`$FRAME+$RZONE`
+ addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
+ mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
- $PUSH r14,`4*$SIZE_T`($sp)
- $PUSH r15,`5*$SIZE_T`($sp)
- $PUSH r16,`6*$SIZE_T`($sp)
- $PUSH r17,`7*$SIZE_T`($sp)
- $PUSH r18,`8*$SIZE_T`($sp)
- $PUSH r19,`9*$SIZE_T`($sp)
- $PUSH r20,`10*$SIZE_T`($sp)
- $PUSH r21,`11*$SIZE_T`($sp)
- $PUSH r22,`12*$SIZE_T`($sp)
- $PUSH r23,`13*$SIZE_T`($sp)
- $PUSH r24,`14*$SIZE_T`($sp)
- $PUSH r25,`15*$SIZE_T`($sp)
+ $PUSH r20,`-12*$SIZE_T`($tj)
+ $PUSH r21,`-11*$SIZE_T`($tj)
+ $PUSH r22,`-10*$SIZE_T`($tj)
+ $PUSH r23,`-9*$SIZE_T`($tj)
+ $PUSH r24,`-8*$SIZE_T`($tj)
+ $PUSH r25,`-7*$SIZE_T`($tj)
+ $PUSH r26,`-6*$SIZE_T`($tj)
+ $PUSH r27,`-5*$SIZE_T`($tj)
+ $PUSH r28,`-4*$SIZE_T`($tj)
+ $PUSH r29,`-3*$SIZE_T`($tj)
+ $PUSH r30,`-2*$SIZE_T`($tj)
+ $PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
@@ -205,8 +212,8 @@
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
- $LD $tj,$FRAME($sp) ; tp[0]
+ addi $tp,$sp,$LOCALS
+ $LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
@@ -273,7 +280,7 @@
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
mtctr $num
.align 4
@@ -299,23 +306,27 @@
addi $j,$j,$BNSZ
bdnz- Lcopy
- $POP r14,`4*$SIZE_T`($sp)
- $POP r15,`5*$SIZE_T`($sp)
- $POP r16,`6*$SIZE_T`($sp)
- $POP r17,`7*$SIZE_T`($sp)
- $POP r18,`8*$SIZE_T`($sp)
- $POP r19,`9*$SIZE_T`($sp)
- $POP r20,`10*$SIZE_T`($sp)
- $POP r21,`11*$SIZE_T`($sp)
- $POP r22,`12*$SIZE_T`($sp)
- $POP r23,`13*$SIZE_T`($sp)
- $POP r24,`14*$SIZE_T`($sp)
- $POP r25,`15*$SIZE_T`($sp)
- $POP $sp,0($sp)
+ $POP $tj,0($sp)
li r3,1
+ $POP r20,`-12*$SIZE_T`($tj)
+ $POP r21,`-11*$SIZE_T`($tj)
+ $POP r22,`-10*$SIZE_T`($tj)
+ $POP r23,`-9*$SIZE_T`($tj)
+ $POP r24,`-8*$SIZE_T`($tj)
+ $POP r25,`-7*$SIZE_T`($tj)
+ $POP r26,`-6*$SIZE_T`($tj)
+ $POP r27,`-5*$SIZE_T`($tj)
+ $POP r28,`-4*$SIZE_T`($tj)
+ $POP r29,`-3*$SIZE_T`($tj)
+ $POP r30,`-2*$SIZE_T`($tj)
+ $POP r31,`-1*$SIZE_T`($tj)
+ mr $sp,$tj
blr
.long 0
-.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by "
+ .byte 0,12,4,0,0x80,12,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by "
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
index 37c65d3..1249ce2 100644
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@@ -389,7 +389,9 @@
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -814,8 +816,9 @@
blr
-
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -949,7 +952,7 @@
addze r11,r0
#mul_add_c(a[3],b[2],c3,c1,c2);
$LD r6,`3*$BNSZ`(r4)
- $LD r7,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
@@ -966,7 +969,9 @@
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1502,7 +1507,9 @@
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1550,8 +1557,9 @@
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1594,7 +1602,9 @@
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1707,7 +1717,9 @@
Lppcasm_div9:
or r3,r8,r0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1746,8 +1758,9 @@
bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1850,7 +1863,9 @@
Lppcasm_mw_OVER:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1973,7 +1988,9 @@
Lppcasm_maw_adios:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 3449b35..a14e769 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -45,23 +45,40 @@
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is...
+# December 2009
+
+# Adapted for a 32-bit build, this module delivers a 25-120% (yes,
+# more than *twice* for longer keys) performance improvement over
+# 32-bit ppc-mont.pl on a 1.8GHz PPC970. However! This implementation
+# still relies on 64-bit integer operations, and the trouble is that
+# most PPC operating systems don't preserve the upper halves of
+# general purpose registers upon 32-bit signal delivery. They do
+# preserve them upon context switch, but not upon signalling:-( This
+# means that asynchronous signals have to be blocked upon entry to
+# this subroutine. Signal masking (and of course the complementary
+# unmasking) has quite an impact on performance, naturally larger for
+# shorter keys. It's so severe that 512-bit key performance can be as
+# low as 1/3 of the expected one. This is why on such OSes the routine
+# can be engaged only for longer key operations; see crypto/ppccap.c
+# for further details. MacOS X is an exception and doesn't require
+# signal masking, and that's where the above improvement coefficients
+# were collected. For the others the alternative would be to break the
+# dependence on the upper halves of GPRs by sticking to 32-bit integer
+# operations...
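+#
+# A sketch of the kind of masking wrapper this implies (the real
+# logic, including the OS checks, lives in crypto/ppccap.c; the
+# wrapper name and the BN_ULONG typedef here are illustrative):
+#
+#	#include <signal.h>
+#
+#	typedef unsigned long BN_ULONG;
+#	int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap,
+#	                      const BN_ULONG *bp, const BN_ULONG *np,
+#	                      const BN_ULONG *n0, int num);
+#
+#	static int mont_mul_blocked(BN_ULONG *rp, const BN_ULONG *ap,
+#	                            const BN_ULONG *bp, const BN_ULONG *np,
+#	                            const BN_ULONG *n0, int num)
+#	{
+#	    sigset_t all, saved;
+#	    int ret;
+#
+#	    sigfillset(&all);               /* block asynchronous signals */
+#	    sigprocmask(SIG_SETMASK, &all, &saved);
+#	    ret = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
+#	    sigprocmask(SIG_SETMASK, &saved, NULL);  /* restore */
+#	    return ret;
+#	}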
+
$flavour = shift;
if ($flavour =~ /32/) {
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont_ppc64";
+ $fname= "bn_mul_mont_fpu64";
$STUX= "stwux"; # store indexed and update
$PUSH= "stw";
$POP= "lwz";
- die "not implemented yet";
} elsif ($flavour =~ /64/) {
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont";
+ $fname= "bn_mul_mont_fpu64";
# same as above, but 64-bit mnemonics...
$STUX= "stdux"; # store indexed and update
@@ -76,7 +93,7 @@
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-$FRAME=($FRAME+63)&~63;
+$FRAME=64; # padded frame header
$TRANSFER=16*8;
$carry="r0";
@@ -93,16 +110,16 @@
$j="r11";
$i="r12";
# non-volatile registers
-$nap_d="r14"; # interleaved ap and np in double format
-$a0="r15"; # ap[0]
-$t0="r16"; # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22"; # interleaved ap and np in double format
+$a0="r23"; # ap[0]
+$t0="r24"; # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
# PPC offers enough register bank capacity to unroll inner loops twice
#
@@ -132,28 +149,17 @@
$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
$dota="f8"; $dotb="f9";
$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
-$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
-$T0a="f18"; $T0b="f19";
-$T1a="f20"; $T1b="f21";
-$T2a="f22"; $T2b="f23";
-$T3a="f24"; $T3b="f25";
+$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
+$T0a="f24"; $T0b="f25";
+$T1a="f26"; $T1b="f27";
+$T2a="f28"; $T2b="f29";
+$T3a="f30"; $T3b="f31";
# sp----------->+-------------------------------+
# | saved sp |
# +-------------------------------+
-# | |
-# +-------------------------------+
-# | 10 saved gpr, r14-r23 |
-# . .
-# . .
-# +12*size_t +-------------------------------+
-# | 12 saved fpr, f14-f25 |
# . .
-# . .
-# +12*8 +-------------------------------+
-# | padding to 64 byte boundary |
-# . .
-# +X +-------------------------------+
+# +64 +-------------------------------+
# | 16 gpr<->fpr transfer zone |
# . .
# . .
@@ -173,6 +179,16 @@
# . .
# . .
# +-------------------------------+
+# . .
+# -12*size_t +-------------------------------+
+# | 10 saved gpr, r22-r31 |
+# . .
+# . .
+# -12*8 +-------------------------------+
+# | 12 saved fpr, f20-f31 |
+# . .
+# . .
+# +-------------------------------+
$code=<<___;
.machine "any"
@@ -181,14 +197,14 @@
.globl .$fname
.align 5
.$fname:
- cmpwi $num,4
+ cmpwi $num,`3*8/$SIZE_T`
mr $rp,r3 ; $rp is reassigned
li r3,0 ; possible "not handled" return code
bltlr-
- andi. r0,$num,1 ; $num has to be even
+ andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
bnelr-
- slwi $num,$num,3 ; num*=8
+ slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
li $i,-4096
slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
add $tp,$tp,$num ; place for tp[num+1]
@@ -196,35 +212,50 @@
subf $tp,$tp,$sp ; $sp-$tp
and $tp,$tp,$i ; minimize TLB usage
subf $tp,$sp,$tp ; $tp-$sp
+ mr $i,$sp
$STUX $sp,$sp,$tp ; alloca
- $PUSH r14,`2*$SIZE_T`($sp)
- $PUSH r15,`3*$SIZE_T`($sp)
- $PUSH r16,`4*$SIZE_T`($sp)
- $PUSH r17,`5*$SIZE_T`($sp)
- $PUSH r18,`6*$SIZE_T`($sp)
- $PUSH r19,`7*$SIZE_T`($sp)
- $PUSH r20,`8*$SIZE_T`($sp)
- $PUSH r21,`9*$SIZE_T`($sp)
- $PUSH r22,`10*$SIZE_T`($sp)
- $PUSH r23,`11*$SIZE_T`($sp)
- stfd f14,`12*$SIZE_T+0`($sp)
- stfd f15,`12*$SIZE_T+8`($sp)
- stfd f16,`12*$SIZE_T+16`($sp)
- stfd f17,`12*$SIZE_T+24`($sp)
- stfd f18,`12*$SIZE_T+32`($sp)
- stfd f19,`12*$SIZE_T+40`($sp)
- stfd f20,`12*$SIZE_T+48`($sp)
- stfd f21,`12*$SIZE_T+56`($sp)
- stfd f22,`12*$SIZE_T+64`($sp)
- stfd f23,`12*$SIZE_T+72`($sp)
- stfd f24,`12*$SIZE_T+80`($sp)
- stfd f25,`12*$SIZE_T+88`($sp)
-
+ $PUSH r22,`-12*8-10*$SIZE_T`($i)
+ $PUSH r23,`-12*8-9*$SIZE_T`($i)
+ $PUSH r24,`-12*8-8*$SIZE_T`($i)
+ $PUSH r25,`-12*8-7*$SIZE_T`($i)
+ $PUSH r26,`-12*8-6*$SIZE_T`($i)
+ $PUSH r27,`-12*8-5*$SIZE_T`($i)
+ $PUSH r28,`-12*8-4*$SIZE_T`($i)
+ $PUSH r29,`-12*8-3*$SIZE_T`($i)
+ $PUSH r30,`-12*8-2*$SIZE_T`($i)
+ $PUSH r31,`-12*8-1*$SIZE_T`($i)
+ stfd f20,`-12*8`($i)
+ stfd f21,`-11*8`($i)
+ stfd f22,`-10*8`($i)
+ stfd f23,`-9*8`($i)
+ stfd f24,`-8*8`($i)
+ stfd f25,`-7*8`($i)
+ stfd f26,`-6*8`($i)
+ stfd f27,`-5*8`($i)
+ stfd f28,`-4*8`($i)
+ stfd f29,`-3*8`($i)
+ stfd f30,`-2*8`($i)
+ stfd f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
ld $a0,0($ap) ; pull ap[0] value
ld $n0,0($n0) ; pull n0[0] value
ld $t3,0($bp) ; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+ mr $t1,$n0
+ lwz $a0,0($ap) ; pull ap[0,1] value
+ lwz $t0,4($ap)
+ lwz $n0,0($t1) ; pull n0[0,1] value
+ lwz $t1,4($t1)
+ lwz $t3,0($bp) ; bp[0,1]
+ lwz $t2,4($bp)
+ insrdi $a0,$t0,32,0
+ insrdi $n0,$t1,32,0
+ insrdi $t3,$t2,32,0
+___
+$code.=<<___;
addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
li $i,-64
add $nap_d,$tp,$num
@@ -258,6 +289,8 @@
std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
lfd $ba,`$FRAME+0`($sp)
lfd $bb,`$FRAME+8`($sp)
lfd $bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@
.align 5
L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
std $t0,`$FRAME+64`($sp)
std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@
li $i,8 ; i=1
.align 5
Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
ldx $t3,$bp,$i ; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+ add $t0,$bp,$i
+ lwz $t3,0($t0) ; bp[i,i+1]
+ lwz $t0,4($t0)
+ insrdi $t3,$t0,32,0
+___
+$code.=<<___;
ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
mulld $t7,$a0,$t3 ; ap[0]*bp[i]
@@ -761,6 +830,13 @@
stfd $T0b,`$FRAME+8`($sp)
add $t7,$t7,$carry
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
stfd $T1a,`$FRAME+16`($sp)
stfd $T1b,`$FRAME+24`($sp)
insrdi $t4,$t7,16,0 ; 64..127 bits
@@ -768,6 +844,13 @@
stfd $T2a,`$FRAME+32`($sp)
stfd $T2b,`$FRAME+40`($sp)
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
stfd $T3a,`$FRAME+48`($sp)
stfd $T3b,`$FRAME+56`($sp)
addze $carry,$carry
@@ -816,7 +899,21 @@
ld $t7,`$FRAME+72`($sp)
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
addze $carry,$carry
std $t3,-16($tp) ; tp[j-1]
@@ -835,7 +932,9 @@
subf $nap_d,$t7,$nap_d ; rewind pointer
cmpw $i,$num
blt- Louter
+___
+$code.=<<___ if ($SIZE_T==8);
subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@
stdx $i,$t4,$i
addi $i,$i,16
bdnz- Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ addi $np,$np,-4
+ addi $rp,$rp,-4
+ addi $ap,$sp,`$FRAME+$TRANSFER+4`
+ mtctr $j
+
+.align 4
+Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
+ ldu $t2,16($tp)
+ lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
+ lwz $t5,8($np)
+ lwz $t6,12($np)
+ lwzu $t7,16($np)
+ extrdi $t1,$t0,32,0
+ extrdi $t3,$t2,32,0
+ subfe $t4,$t4,$t0 ; tp[j]-np[j]
+ stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
+ subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
+ stw $t1,8($ap)
+ subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
+ stw $t2,12($ap)
+ subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
+ stwu $t3,16($ap)
+ stw $t4,4($rp)
+ stw $t5,8($rp)
+ stw $t6,12($rp)
+ stwu $t7,16($rp)
+ bdnz- Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ addi $tp,$sp,`$FRAME+$TRANSFER+4`
+ subf $rp,$num,$rp ; rewind rp
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ lwz $t0,4($ap)
+ lwz $t1,8($ap)
+ lwz $t2,12($ap)
+ lwzu $t3,16($ap)
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stw $t0,4($rp)
+ stw $t1,8($rp)
+ stw $t2,12($rp)
+ stwu $t3,16($rp)
+ std $i,8($tp) ; zap tp at once
+ stdu $i,16($tp)
+ bdnz- Lcopy
+___
- $POP r14,`2*$SIZE_T`($sp)
- $POP r15,`3*$SIZE_T`($sp)
- $POP r16,`4*$SIZE_T`($sp)
- $POP r17,`5*$SIZE_T`($sp)
- $POP r18,`6*$SIZE_T`($sp)
- $POP r19,`7*$SIZE_T`($sp)
- $POP r20,`8*$SIZE_T`($sp)
- $POP r21,`9*$SIZE_T`($sp)
- $POP r22,`10*$SIZE_T`($sp)
- $POP r23,`11*$SIZE_T`($sp)
- lfd f14,`12*$SIZE_T+0`($sp)
- lfd f15,`12*$SIZE_T+8`($sp)
- lfd f16,`12*$SIZE_T+16`($sp)
- lfd f17,`12*$SIZE_T+24`($sp)
- lfd f18,`12*$SIZE_T+32`($sp)
- lfd f19,`12*$SIZE_T+40`($sp)
- lfd f20,`12*$SIZE_T+48`($sp)
- lfd f21,`12*$SIZE_T+56`($sp)
- lfd f22,`12*$SIZE_T+64`($sp)
- lfd f23,`12*$SIZE_T+72`($sp)
- lfd f24,`12*$SIZE_T+80`($sp)
- lfd f25,`12*$SIZE_T+88`($sp)
- $POP $sp,0($sp)
+$code.=<<___;
+ $POP $i,0($sp)
li r3,1 ; signal "handled"
+ $POP r22,`-12*8-10*$SIZE_T`($i)
+ $POP r23,`-12*8-9*$SIZE_T`($i)
+ $POP r24,`-12*8-8*$SIZE_T`($i)
+ $POP r25,`-12*8-7*$SIZE_T`($i)
+ $POP r26,`-12*8-6*$SIZE_T`($i)
+ $POP r27,`-12*8-5*$SIZE_T`($i)
+ $POP r28,`-12*8-4*$SIZE_T`($i)
+ $POP r29,`-12*8-3*$SIZE_T`($i)
+ $POP r30,`-12*8-2*$SIZE_T`($i)
+ $POP r31,`-12*8-1*$SIZE_T`($i)
+ lfd f20,`-12*8`($i)
+ lfd f21,`-11*8`($i)
+ lfd f22,`-10*8`($i)
+ lfd f23,`-9*8`($i)
+ lfd f24,`-8*8`($i)
+ lfd f25,`-7*8`($i)
+ lfd f26,`-6*8`($i)
+ lfd f27,`-5*8`($i)
+ lfd f28,`-4*8`($i)
+ lfd f29,`-3*8`($i)
+ lfd f30,`-2*8`($i)
+ lfd f31,`-1*8`($i)
+ mr $sp,$i
blr
.long 0
-.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by "
+ .byte 0,12,4,0,0x8c,10,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by "
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl
new file mode 100644
index 0000000..9d18d40
--- /dev/null
+++ b/crypto/bn/asm/s390x-gf2m.pl
@@ -0,0 +1,221 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
+# the time being... gcc 4.3 appeared to generate poor code, hence
+# the effort. And indeed, the module delivers 55%-90%(*) improvement
+# on the heaviest ECDSA verify and ECDH benchmarks for 163- and
+# 571-bit key lengths on z990, 30%-55%(*) on z10, and 70%-110%(*) on
+# z196. This is for the 64-bit build. In the 32-bit "highgprs" case
+# the improvement is even higher; for example, on z990 it was measured
+# at 80%-150%. ECDSA sign is a modest 9%-12% faster. Keep in mind that
+# these coefficients are not the ones for bn_GF2m_mul_2x2 itself, as
+# not all CPU time is burnt in it...
+#
+# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
+# so that improvement coefficients can vary from one specific
+# setup to another.
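+#
+# The core is _mul_1x1 below: a 64x64->128 carry-less multiply done
+# with a 16-entry table of nibble multiples of one operand (the
+# tab[0..15] stores) plus mask-and-broadcast fixups for the three top
+# bits. A sketch of the same windowing idea, shrunk to 32x32->64 so
+# the top-bit fixups can be omitted (illustration only):
+#
+#	#include <stdint.h>
+#
+#	static uint64_t clmul32(uint32_t a, uint32_t b)
+#	{
+#	    uint64_t tab[16], r = 0;
+#	    int i;
+#
+#	    tab[0] = 0;                     /* tab[i] = GF(2) product a*i */
+#	    for (i = 1; i < 16; i++)
+#	        tab[i] = (i & 1) ? tab[i ^ 1] ^ a : tab[i >> 1] << 1;
+#	    for (i = 0; i < 32; i += 4)     /* eight nibble lookups */
+#	        r ^= tab[(b >> i) & 0xf] << i;
+#	    return r;
+#	}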
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+$rp="%r2";
+$a1="%r3";
+$a0="%r4";
+$b1="%r5";
+$b0="%r6";
+
+$ra="%r14";
+$sp="%r15";
+
+@T=("%r0","%r1");
+@i=("%r12","%r13");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
+($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@function
+.align 16
+_mul_1x1:
+ lgr $a1,$a
+ sllg $a2,$a,1
+ sllg $a4,$a,2
+ sllg $a8,$a,3
+
+ srag $lo,$a1,63 # broadcast 63rd bit
+ nihh $a1,0x1fff
+ srag @i[0],$a2,63 # broadcast 62nd bit
+ nihh $a2,0x3fff
+ srag @i[1],$a4,63 # broadcast 61st bit
+ nihh $a4,0x7fff
+ ngr $lo,$b
+ ngr @i[0],$b
+ ngr @i[1],$b
+
+ lghi @T[0],0
+ lgr $a12,$a1
+ stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
+ xgr $a12,$a2
+ stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
+ lgr $a48,$a4
+ stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
+ xgr $a48,$a8
+ stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
+ xgr $a1,$a4
+
+ stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
+ xgr $a2,$a4
+ stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
+ xgr $a12,$a4
+ stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
+ xgr $a1,$a48
+ stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
+ xgr $a2,$a48
+
+ stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
+ xgr $a12,$a48
+ stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
+ xgr $a1,$a4
+ stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
+ xgr $a2,$a4
+ stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
+
+ xgr $a12,$a4
+ stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
+ srlg $hi,$lo,1
+ stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
+ sllg $lo,$lo,63
+ stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
+ srlg @T[0],@i[0],2
+ stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
+
+ lghi $mask,`0xf<<3`
+ sllg $a1,@i[0],62
+ sllg @i[0],$b,3
+ srlg @T[1],@i[1],3
+ ngr @i[0],$mask
+ sllg $a2,@i[1],61
+ srlg @i[1],$b,4-3
+ xgr $hi,@T[0]
+ ngr @i[1],$mask
+ xgr $lo,$a1
+ xgr $hi,@T[1]
+ xgr $lo,$a2
+
+ xg $lo,$stdframe(@i[0],$sp)
+ srlg @i[0],$b,8-3
+ ngr @i[0],$mask
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ srlg @i[1],$b,`($n+2)*4`-3
+ sllg @T[0],@T[1],`$n*4`
+ ngr @i[1],$mask
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+___
+ push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ sllg @T[0],@T[1],`$n*4`
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+
+ lg @T[0],$stdframe(@i[0],$sp)
+ sllg @T[1],@T[0],`($n+1)*4`
+ srlg @T[0],@T[0],`64-($n+1)*4`
+ xgr $lo,@T[1]
+ xgr $hi,@T[0]
+
+ br $ra
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@function
+.align 16
+bn_GF2m_mul_2x2:
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
+
+ lghi %r1,-$stdframe-128
+ la %r0,0($sp)
+ la $sp,0(%r1,$sp) # alloca
+ st${g} %r0,0($sp) # back chain
+___
+if ($SIZE_T==8) {
+my @r=map("%r$_",(6..9));
+$code.=<<___;
+ bras $ra,_mul_1x1 # a1·b1
+ stmg $lo,$hi,16($rp)
+
+ lg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # a0·b0
+ stmg $lo,$hi,0($rp)
+
+ lg $a,`$stdframe+128+3*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+5*$SIZE_T`($sp)
+ xg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ xg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
+ lmg @r[0],@r[3],0($rp)
+
+ xgr $lo,$hi
+ xgr $hi,@r[1]
+ xgr $lo,@r[0]
+ xgr $hi,@r[2]
+ xgr $lo,@r[3]
+ xgr $hi,@r[3]
+ xgr $lo,$hi
+ stg $hi,16($rp)
+ stg $lo,8($rp)
+___
+} else {
+$code.=<<___;
+ sllg %r3,%r3,32
+ sllg %r5,%r5,32
+ or %r3,%r4
+ or %r5,%r6
+ bras $ra,_mul_1x1
+ rllg $lo,$lo,32
+ rllg $hi,$hi,32
+ stmg $lo,$hi,0($rp)
+___
+}
+$code.=<<___;
+ lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
+ br $ra
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by "
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
index f61246f..9fd64e8 100644
--- a/crypto/bn/asm/s390x-mont.pl
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -32,6 +32,33 @@
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to
+# use 64-bit instructions and achieve "64-bit" performance even in a
+# 31-bit legacy application context. The feature is not specific to
+# any particular processor, as long as it's a "z-CPU". The latter
+# implies that the code remains z/Architecture specific. Compatibility
+# with 32-bit BN_ULONG is achieved by swapping words after 64-bit
+# loads; follow the _dswap-s. On z990 it was measured to perform
+# 2.6-2.2 times better than compiler-generated code, less so for
+# longer keys...
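+#
+# A C model of the word swap (the _dswap macro is expanded to
+# "rllg reg,reg,32" at the bottom of this file for 31-bit builds and
+# dropped for 64-bit ones); sketch only:
+#
+#	#include <stdint.h>
+#
+#	/* two adjacent 32-bit BN_ULONGs picked up by one 64-bit load
+#	   land in the wrong halves; rotating by 32 puts them right */
+#	static inline uint64_t dswap(uint64_t x)
+#	{
+#	    return (x << 32) | (x >> 32);
+#	}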
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
$mn0="%r0";
$num="%r1";
@@ -60,34 +87,44 @@
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
- lgf $num,164($sp) # pull $num
- sla $num,3 # $num to enumerate bytes
+ lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
+ sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
- stg %r2,16($sp)
+ st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ tmll $num,4
+ bnzr %r14 # if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
+___
+$code.=<<___;
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
- stmg %r3,%r15,24($sp)
-
- lghi $rp,-160-8 # leave room for carry bit
+ lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
- stg %r0,0($sp) # back chain
+ st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
+ _dswap $n0
lg $bi,0($bp)
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
@@ -95,6 +132,7 @@
msgr $mn0,$n0
lg $nlo,0($np) #
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -106,12 +144,14 @@
.align 16
.L1st:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -119,22 +159,24 @@
algr $nlo,$alo
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
- alg $alo,160($sp) # +=tp[0]
+ alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
@@ -142,6 +184,7 @@
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -153,14 +196,16 @@
.align 16
.Linner:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
- alg $alo,160($j,$sp)# +=tp[j]
+ alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -168,31 +213,33 @@
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
- alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
+ alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
- clg $bp,160+8+32($j,$sp) # compare to &bp[num]
+ cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
- lg $rp,160+8+16($j,$sp) # reincarnate rp
- la $ap,160($sp)
+ l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
+ la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
- slbg $alo,0($j,$np)
+ lg $nlo,0($j,$np)
+ _dswap $nlo
+ slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
@@ -207,19 +254,24 @@
la $j,0(%r0)
lgr $count,$num
-.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
- stg $j,160($j,$sp) # zap tp
+.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
+ _dswap $alo
+ stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
- la %r1,160+8+48($j,$sp)
- lmg %r6,%r15,0(%r1)
+ la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+ lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by "
___
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+ print $_,"\n";
+}
close STDOUT;
diff --git a/crypto/bn/asm/x86-gf2m.S b/crypto/bn/asm/x86-gf2m.S
new file mode 100644
index 0000000..9403a5a
--- /dev/null
+++ b/crypto/bn/asm/x86-gf2m.S
@@ -0,0 +1,335 @@
+.file "crypto/bn/asm/x86-gf2m.s"
+.text
+.type _mul_1x1_mmx,@function
+.align 16
+_mul_1x1_mmx:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ andl $1073741823,%ecx
+ leal (%edx,%edx,1),%ebp
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movd %eax,%mm2
+ movd %ebx,%mm3
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ pxor %mm5,%mm5
+ pxor %mm4,%mm4
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ pcmpgtd %mm2,%mm5
+ paddd %mm2,%mm2
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ pand %mm3,%mm5
+ pcmpgtd %mm2,%mm4
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ psllq $31,%mm5
+ pand %mm3,%mm4
+ movl %edx,24(%esp)
+ movl $7,%esi
+ movl %ebp,28(%esp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl %ebp,%edi
+ psllq $30,%mm4
+ andl %ebx,%edi
+ shrl $3,%ebx
+ movd (%esp,%esi,4),%mm0
+ movl %ebp,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $3,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $6,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $9,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $12,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $15,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $18,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $21,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $24,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ pxor %mm4,%mm0
+ psllq $27,%mm2
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ pxor %mm5,%mm0
+ psllq $30,%mm1
+ addl $36,%esp
+ pxor %mm1,%mm0
+ ret
+.size _mul_1x1_mmx,.-_mul_1x1_mmx
+.type _mul_1x1_ialu,@function
+.align 16
+_mul_1x1_ialu:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ leal (,%eax,4),%ebp
+ andl $1073741823,%ecx
+ leal (%eax,%eax,1),%edi
+ sarl $31,%eax
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ sarl $31,%edi
+ andl %ebx,%eax
+ movl %edx,24(%esp)
+ andl %ebx,%edi
+ movl %ebp,28(%esp)
+ movl %eax,%edx
+ shll $31,%eax
+ movl %edi,%ecx
+ shrl $1,%edx
+ movl $7,%esi
+ shll $30,%edi
+ andl %ebx,%esi
+ shrl $2,%ecx
+ xorl %edi,%eax
+ shrl $3,%ebx
+ movl $7,%edi
+ andl %ebx,%edi
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ xorl (%esp,%esi,4),%eax
+ movl $7,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $3,%ebp
+ andl %ebx,%edi
+ shrl $29,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $6,%ecx
+ andl %ebx,%esi
+ shrl $26,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $9,%ebp
+ andl %ebx,%edi
+ shrl $23,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $12,%ecx
+ andl %ebx,%esi
+ shrl $20,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $15,%ebp
+ andl %ebx,%edi
+ shrl $17,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $18,%ecx
+ andl %ebx,%esi
+ shrl $14,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $21,%ebp
+ andl %ebx,%edi
+ shrl $11,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $24,%ecx
+ andl %ebx,%esi
+ shrl $8,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl %ebp,%ecx
+ shll $27,%ebp
+ movl (%esp,%esi,4),%edi
+ shrl $5,%ecx
+ movl %edi,%esi
+ xorl %ebp,%eax
+ shll $30,%edi
+ xorl %ecx,%edx
+ shrl $2,%esi
+ xorl %edi,%eax
+ xorl %esi,%edx
+ addl $36,%esp
+ ret
+.size _mul_1x1_ialu,.-_mul_1x1_ialu
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+.L_bn_GF2m_mul_2x2_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl OPENSSL_ia32cap_P@GOT(%edx),%edx
+ movl (%edx),%eax
+ movl 4(%edx),%edx
+ testl $8388608,%eax
+ jz .L001ialu
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm7
+ movl 28(%esp),%eax
+ movl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm6
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ xorl 28(%esp),%eax
+ xorl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ pxor %mm7,%mm0
+ movl 20(%esp),%eax
+ pxor %mm6,%mm0
+ movq %mm0,%mm2
+ psllq $32,%mm0
+ popl %edi
+ psrlq $32,%mm2
+ popl %esi
+ pxor %mm6,%mm0
+ popl %ebx
+ pxor %mm7,%mm2
+ movq %mm0,(%eax)
+ popl %ebp
+ movq %mm2,8(%eax)
+ emms
+ ret
+.align 16
+.L001ialu:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $20,%esp
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,8(%esp)
+ movl %edx,12(%esp)
+ movl 48(%esp),%eax
+ movl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,(%esp)
+ movl %edx,4(%esp)
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ xorl 48(%esp),%eax
+ xorl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl 40(%esp),%ebp
+ movl (%esp),%ebx
+ movl 4(%esp),%ecx
+ movl 8(%esp),%edi
+ movl 12(%esp),%esi
+ xorl %edx,%eax
+ xorl %ecx,%edx
+ xorl %ebx,%eax
+ movl %ebx,(%ebp)
+ xorl %edi,%edx
+ movl %esi,12(%ebp)
+ xorl %esi,%eax
+ addl $20,%esp
+ xorl %esi,%edx
+ popl %edi
+ xorl %edx,%eax
+ popl %esi
+ movl %edx,8(%ebp)
+ popl %ebx
+ movl %eax,4(%ebp)
+ popl %ebp
+ ret
+.size bn_GF2m_mul_2x2,.-.L_bn_GF2m_mul_2x2_begin
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105
+.byte 99,97,116,105,111,110,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.comm OPENSSL_ia32cap_P,8,4
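The two code paths above share one structure: bn_GF2m_mul_2x2 builds the
128-bit carry-less product of (a1:a0) and (b1:b0) from three 1x1 products
combined Karatsuba-style. A minimal C model of that combination follows;
mul_1x1 and gf2m_mul_2x2 are illustrative names, not the OpenSSL API
(the real entry point takes BN_ULONG arguments).

#include <stdint.h>

/* Portable 32x32 -> 64 carry-less multiply, standing in for the
 * assembly _mul_1x1 helpers (which use lookup tables/MMX instead). */
static void mul_1x1(uint32_t a, uint32_t b, uint32_t *hi, uint32_t *lo)
{
    uint64_t r = 0;
    for (int i = 0; i < 32; i++)
        if ((b >> i) & 1)
            r ^= (uint64_t)a << i;
    *hi = (uint32_t)(r >> 32);
    *lo = (uint32_t)r;
}

/* Karatsuba combination performed above: r = (a1:a0) x (b1:b0) over
 * GF(2), with r[3] the most significant word. */
static void gf2m_mul_2x2(uint32_t r[4], uint32_t a1, uint32_t a0,
                         uint32_t b1, uint32_t b0)
{
    uint32_t hh1, hh0, ll1, ll0, m1, m0;

    mul_1x1(a1, b1, &hh1, &hh0);              /* a1.b1 */
    mul_1x1(a0, b0, &ll1, &ll0);              /* a0.b0 */
    mul_1x1(a0 ^ a1, b0 ^ b1, &m1, &m0);      /* (a0+a1).(b0+b1) */

    /* addition in GF(2) is XOR, so the middle term is
     * (a0+a1)(b0+b1) + a1.b1 + a0.b0 */
    m0 ^= hh0 ^ ll0;
    m1 ^= hh1 ^ ll1;

    r[0] = ll0;
    r[1] = ll1 ^ m0;
    r[2] = hh0 ^ m1;
    r[3] = hh1;
}

The tangle of xorl instructions before the final stores in the ialu path
is exactly this folding, interleaved with the register pops.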
diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl
new file mode 100644
index 0000000..b579530
--- /dev/null
+++ b/crypto/bn/asm/x86-gf2m.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging, mechanical port from C for
+# the time being... Except that it has three code paths: pure integer
+# code suitable for any x86 CPU, MMX code suitable for PIII and later
+# and PCLMULQDQ suitable for Westmere and later. Improvement varies
+# from one benchmark and µ-arch to another. Below are interval values
+# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
+# code:
+#
+# PIII 16%-30%
+# P4 12%-12%
+# Opteron 18%-40%
+# Core2 19%-44%
+# Atom 38%-64%
+# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
+# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
+#
+# Note that the improvement coefficients above are not coefficients
+# for bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is
+# the result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster,
+# the benchmark is more and more dominated by other subroutines, most
+# notably by BN_GF2m_mod[_mul]_arr...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+$a="eax";
+$b="ebx";
+($a1,$a2,$a4)=("ecx","edx","ebp");
+
+$R="mm0";
+@T=("mm1","mm2");
+($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
+@i=("esi","edi");
+
+ if (!$x86only) {
+&function_begin_B("_mul_1x1_mmx");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &and ($a1,0x3fffffff);
+ &lea ($a4,&DWP(0,$a2,$a2));
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &movd ($A,$a);
+ &movd ($B,$b);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &pxor ($B31,$B31);
+ &pxor ($B30,$B30);
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &pcmpgtd($B31,$A); # broadcast 31st bit
+ &paddd ($A,$A); # $A<<=1
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &pand ($B31,$B);
+ &pcmpgtd($B30,$A); # broadcast 30th bit
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+ &psllq ($B31,31);
+ &pand ($B30,$B);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &mov (@i[0],0x7);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($a4,@i[0]);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ &mov (@i[1],$a4);
+ &psllq ($B30,30);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &movd ($R,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],$a4);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],$a4);
+ &psllq (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &pxor ($R,@T[1]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &pxor ($R,$B30);
+ &psllq (@T[1],3*$n++);
+ &pxor ($R,@T[1]);
+
+ &movd (@T[0],&DWP(0,"esp",@i[0],4));
+ &pxor ($R,$B31);
+ &psllq (@T[0],3*$n);
+ &add ("esp",32+4);
+ &pxor ($R,@T[0]);
+ &ret ();
+&function_end_B("_mul_1x1_mmx");
+ }
+
+($lo,$hi)=("eax","edx");
+@T=("ecx","ebp");
+
+&function_begin_B("_mul_1x1_ialu");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &lea ($a4,&DWP(0,"",$a,4));
+ &and ($a1,0x3fffffff);
+ &lea (@i[1],&DWP(0,$lo,$lo));
+ &sar ($lo,31); # broadcast 31st bit
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+	&sar	(@i[1],31);		# broadcast 30th bit
+ &and ($lo,$b);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &and (@i[1],$b);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($hi,$lo);
+ &shl ($lo,31);
+ &mov (@T[0],@i[1]);
+ &shr ($hi,1);
+
+ &mov (@i[0],0x7);
+ &shl (@i[1],30);
+ &and (@i[0],$b);
+ &shr (@T[0],2);
+ &xor ($lo,@i[1]);
+
+ &shr ($b,3);
+ &mov (@i[1],0x7); # 5-byte instruction!?
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+ &xor ($lo,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],0x7);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],0x7);
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr (@T[0],32-3*$n);
+ &xor ($lo,@T[1]);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &mov (@i[1],&DWP(0,"esp",@i[0],4));
+ &shr (@T[0],32-3*$n); $n++;
+ &mov (@i[0],@i[1]);
+ &xor ($lo,@T[1]);
+ &shl (@i[1],3*$n);
+ &xor ($hi,@T[0]);
+ &shr (@i[0],32-3*$n);
+ &xor ($lo,@i[1]);
+ &xor ($hi,@i[0]);
+
+ &add ("esp",32+4);
+ &ret ();
+&function_end_B("_mul_1x1_ialu");
+
+# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+&function_begin_B("bn_GF2m_mul_2x2");
+if (!$x86only) {
+ &picmeup("edx","OPENSSL_ia32cap_P");
+ &mov ("eax",&DWP(0,"edx"));
+ &mov ("edx",&DWP(4,"edx"));
+ &test ("eax",1<<23); # check MMX bit
+ &jz (&label("ialu"));
+if ($sse2) {
+ &test ("eax",1<<24); # check FXSR bit
+ &jz (&label("mmx"));
+ &test ("edx",1<<1); # check PCLMULQDQ bit
+ &jz (&label("mmx"));
+
+ &movups ("xmm0",&QWP(8,"esp"));
+ &shufps ("xmm0","xmm0",0b10110001);
+ &pclmulqdq ("xmm0","xmm0",1);
+ &mov ("eax",&DWP(4,"esp"));
+ &movups (&QWP(0,"eax"),"xmm0");
+ &ret ();
+
+&set_label("mmx",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_mmx"); # a1·b1
+ &movq ("mm7",$R);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # a0·b0
+ &movq ("mm6",$R);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
+ &pxor ($R,"mm7");
+ &mov ($a,&wparam(0));
+ &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
+
+ &movq ($A,$R);
+ &psllq ($R,32);
+ &pop ("edi");
+ &psrlq ($A,32);
+ &pop ("esi");
+ &pxor ($R,"mm6");
+ &pop ("ebx");
+ &pxor ($A,"mm7");
+ &movq (&QWP(0,$a),$R);
+ &pop ("ebp");
+ &movq (&QWP(8,$a),$A);
+ &emms ();
+ &ret ();
+&set_label("ialu",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &stack_push(4+1);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_ialu"); # a1·b1
+ &mov (&DWP(8,"esp"),$lo);
+ &mov (&DWP(12,"esp"),$hi);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # a0·b0
+ &mov (&DWP(0,"esp"),$lo);
+ &mov (&DWP(4,"esp"),$hi);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
+
+ &mov ("ebp",&wparam(0));
+ @r=("ebx","ecx","edi","esi");
+ &mov (@r[0],&DWP(0,"esp"));
+ &mov (@r[1],&DWP(4,"esp"));
+ &mov (@r[2],&DWP(8,"esp"));
+ &mov (@r[3],&DWP(12,"esp"));
+
+ &xor ($lo,$hi);
+ &xor ($hi,@r[1]);
+ &xor ($lo,@r[0]);
+ &mov (&DWP(0,"ebp"),@r[0]);
+ &xor ($hi,@r[2]);
+ &mov (&DWP(12,"ebp"),@r[3]);
+ &xor ($lo,@r[3]);
+ &stack_pop(4+1);
+ &xor ($hi,@r[3]);
+ &pop ("edi");
+ &xor ($lo,$hi);
+ &pop ("esi");
+ &mov (&DWP(8,"ebp"),$hi);
+ &pop ("ebx");
+ &mov (&DWP(4,"ebp"),$lo);
+ &pop ("ebp");
+ &ret ();
+&function_end_B("bn_GF2m_mul_2x2");
+
+&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
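The header comment describes the scheme _mul_1x1_ialu implements; in C it
is the same windowed lookup as the reference code in crypto/bn/bn_gf2m.c.
A hedged rendering (illustrative name; the assembly additionally
interleaves the table setup with the first lookups for scheduling):

#include <stdint.h>

/* tab[u] = a*u over GF(2) for every 3-bit u, with a masked to 30 bits;
 * b is consumed three bits at a time, and the two masked-off top bits
 * of a are folded back in at the end (the sar "broadcast" trick). */
static void gf2m_mul_1x1_32(uint32_t *r1, uint32_t *r0,
                            uint32_t a, uint32_t b)
{
    uint32_t top2b = a >> 30;
    uint32_t a1 = a & 0x3FFFFFFF, a2 = a1 << 1, a4 = a2 << 1;
    uint32_t tab[8] = { 0, a1, a2, a1 ^ a2,
                        a4, a1 ^ a4, a2 ^ a4, a1 ^ a2 ^ a4 };
    uint32_t lo = tab[b & 7], hi = 0;
    int n;

    for (n = 3; n <= 30; n += 3) {
        uint32_t s = tab[(b >> n) & 7];
        lo ^= s << n;          /* low half of tab[...] << n */
        hi ^= s >> (32 - n);   /* bits shifted out into the high word */
    }
    if (top2b & 1) { lo ^= b << 30; hi ^= b >> 2; }
    if (top2b & 2) { lo ^= b << 31; hi ^= b >> 1; }
    *r1 = hi;
    *r0 = lo;
}

Masking a to 30 bits keeps a<<2, and hence every table entry, within 32
bits; whatever a window's product sheds above bit 31 is recovered by the
paired right shift.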
diff --git a/crypto/bn/asm/x86-mont.S b/crypto/bn/asm/x86-mont.S
new file mode 100644
index 0000000..2bbb0e3
--- /dev/null
+++ b/crypto/bn/asm/x86-mont.S
@@ -0,0 +1,338 @@
+.file "crypto/bn/asm/x86-mont.s"
+.text
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ movl 40(%esp),%edi
+ cmpl $4,%edi
+ jl .L000just_leave
+ leal 20(%esp),%esi
+ leal 24(%esp),%edx
+ movl %esp,%ebp
+ addl $2,%edi
+ negl %edi
+ leal -32(%esp,%edi,4),%esp
+ negl %edi
+ movl %esp,%eax
+ subl %edx,%eax
+ andl $2047,%eax
+ subl %eax,%esp
+ xorl %esp,%edx
+ andl $2048,%edx
+ xorl $2048,%edx
+ subl %edx,%esp
+ andl $-64,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 16(%esi),%esi
+ movl (%esi),%esi
+ movl %eax,4(%esp)
+ movl %ebx,8(%esp)
+ movl %ecx,12(%esp)
+ movl %edx,16(%esp)
+ movl %esi,20(%esp)
+ leal -3(%edi),%ebx
+ movl %ebp,24(%esp)
+ movl 8(%esp),%esi
+ leal 1(%ebx),%ebp
+ movl 12(%esp),%edi
+ xorl %ecx,%ecx
+ movl %esi,%edx
+ andl $1,%ebp
+ subl %edi,%edx
+ leal 4(%edi,%ebx,4),%eax
+ orl %edx,%ebp
+ movl (%edi),%edi
+ jz .L001bn_sqr_mont
+ movl %eax,28(%esp)
+ movl (%esi),%eax
+ xorl %edx,%edx
+.align 16
+.L002mull:
+ movl %edx,%ebp
+ mull %edi
+ addl %eax,%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ movl (%esi,%ecx,4),%eax
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L002mull
+ movl %edx,%ebp
+ mull %edi
+ movl 20(%esp),%edi
+ addl %ebp,%eax
+ movl 16(%esp),%esi
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ movl %eax,32(%esp,%ebx,4)
+ xorl %ecx,%ecx
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ movl (%esi),%eax
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ incl %ecx
+ jmp .L0032ndmadd
+.align 16
+.L0041stmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L0041stmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ addl %eax,%ebp
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ xorl %ecx,%ecx
+ addl 36(%esp,%ebx,4),%edx
+ movl %ebp,32(%esp,%ebx,4)
+ adcl $0,%ecx
+ movl (%esi),%eax
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ movl $1,%ecx
+.align 16
+.L0032ndmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0032ndmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ xorl %eax,%eax
+ movl 12(%esp),%ecx
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ leal 4(%ecx),%ecx
+ movl %edx,32(%esp,%ebx,4)
+ cmpl 28(%esp),%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl (%ecx),%edi
+ movl 8(%esp),%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ movl (%esi),%eax
+ jmp .L0041stmadd
+.align 16
+.L001bn_sqr_mont:
+ movl %ebx,(%esp)
+ movl %ecx,12(%esp)
+ movl %edi,%eax
+ mull %edi
+ movl %eax,32(%esp)
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+ incl %ecx
+.align 16
+.L006sqr:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ leal (%ebx,%eax,2),%ebp
+ shrl $31,%eax
+ cmpl (%esp),%ecx
+ movl %eax,%ebx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L006sqr
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ leal (%ebx,%eax,2),%ebp
+ imull 32(%esp),%edi
+ shrl $31,%eax
+ movl %ebp,32(%esp,%ecx,4)
+ leal (%eax,%edx,2),%ebp
+ movl (%esi),%eax
+ shrl $31,%edx
+ movl %ebp,36(%esp,%ecx,4)
+ movl %edx,40(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl %ecx,%ebx
+ adcl $0,%edx
+ movl 4(%esi),%eax
+ movl $1,%ecx
+.align 16
+.L0073rdmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl 4(%esi,%ecx,4),%eax
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %edx,%ebp
+ mull %edi
+ addl 36(%esp,%ecx,4),%ebp
+ leal 2(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0073rdmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ movl 12(%esp),%ecx
+ xorl %eax,%eax
+ movl 8(%esp),%esi
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ movl %edx,32(%esp,%ebx,4)
+ cmpl %ebx,%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl 4(%esi,%ecx,4),%edi
+ leal 1(%ecx),%ecx
+ movl %edi,%eax
+ movl %ecx,12(%esp)
+ mull %edi
+ addl 32(%esp,%ecx,4),%eax
+ adcl $0,%edx
+ movl %eax,32(%esp,%ecx,4)
+ xorl %ebp,%ebp
+ cmpl %ebx,%ecx
+ leal 1(%ecx),%ecx
+ je .L008sqrlast
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+.align 16
+.L009sqradd:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal (%eax,%eax,1),%ebp
+ adcl $0,%edx
+ shrl $31,%eax
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%eax
+ addl %ebx,%ebp
+ adcl $0,%eax
+ cmpl (%esp),%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %eax,%ebx
+ jle .L009sqradd
+ movl %edx,%ebp
+ addl %edx,%edx
+ shrl $31,%ebp
+ addl %ebx,%edx
+ adcl $0,%ebp
+.L008sqrlast:
+ movl 20(%esp),%edi
+ movl 16(%esp),%esi
+ imull 32(%esp),%edi
+ addl 32(%esp,%ecx,4),%edx
+ movl (%esi),%eax
+ adcl $0,%ebp
+ movl %edx,32(%esp,%ecx,4)
+ movl %ebp,36(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ leal -1(%ecx),%ebx
+ adcl $0,%edx
+ movl $1,%ecx
+ movl 4(%esi),%eax
+ jmp .L0073rdmadd
+.align 16
+.L005common_tail:
+ movl 16(%esp),%ebp
+ movl 4(%esp),%edi
+ leal 32(%esp),%esi
+ movl (%esi),%eax
+ movl %ebx,%ecx
+ xorl %edx,%edx
+.align 16
+.L010sub:
+ sbbl (%ebp,%edx,4),%eax
+ movl %eax,(%edi,%edx,4)
+ decl %ecx
+ movl 4(%esi,%edx,4),%eax
+ leal 1(%edx),%edx
+ jge .L010sub
+ sbbl $0,%eax
+ andl %eax,%esi
+ notl %eax
+ movl %edi,%ebp
+ andl %eax,%ebp
+ orl %ebp,%esi
+.align 16
+.L011copy:
+ movl (%esi,%ebx,4),%eax
+ movl %eax,(%edi,%ebx,4)
+ movl %ecx,32(%esp,%ebx,4)
+ decl %ebx
+ jge .L011copy
+ movl 24(%esp),%esp
+ movl $1,%eax
+.L000just_leave:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte 111,114,103,62,0
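For orientation, what bn_mul_mont above computes is the standard
word-serial Montgomery product. A textbook CIOS model in C follows; it is
a sketch, not a transcription of the scheduled assembly (which also
carries a dedicated squaring path), and the names are illustrative.

#include <stdint.h>
#include <string.h>

/* r = a * b * 2^(-32*num) mod n, for num-word operands < n.
 * n0 = -n[0]^(-1) mod 2^32; t is a scratch array of num+2 words.
 * The assembly keeps t on an aligned stack allocation instead. */
static void mont_mul(uint32_t *r, const uint32_t *a, const uint32_t *b,
                     const uint32_t *n, uint32_t n0, int num, uint32_t *t)
{
    memset(t, 0, (num + 2) * sizeof(uint32_t));

    for (int i = 0; i < num; i++) {
        uint64_t acc = 0;
        for (int j = 0; j < num; j++) {            /* t += a * b[i] */
            acc = (uint64_t)a[j] * b[i] + t[j] + (uint32_t)acc;
            t[j] = (uint32_t)acc;
            acc >>= 32;
        }
        acc += t[num];
        t[num] = (uint32_t)acc;
        t[num + 1] = (uint32_t)(acc >> 32);

        uint32_t m = t[0] * n0;                    /* zero the low word */
        acc = (uint64_t)m * n[0] + t[0];
        for (int j = 1; j < num; j++) {            /* t = (t + m*n)>>32 */
            acc = (uint64_t)m * n[j] + t[j] + (acc >> 32);
            t[j - 1] = (uint32_t)acc;
        }
        acc = (acc >> 32) + t[num];
        t[num - 1] = (uint32_t)acc;
        t[num] = t[num + 1] + (uint32_t)(acc >> 32);
    }

    /* final reduction: subtract n once if t >= n; the assembly does
     * this with a masked, branch-free select (.L010sub/.L011copy) */
    uint64_t borrow = 0;
    for (int j = 0; j < num; j++) {
        uint64_t d = (uint64_t)t[j] - n[j] - borrow;
        r[j] = (uint32_t)d;
        borrow = (d >> 32) & 1;
    }
    if (borrow > t[num])           /* t < n: undo, keep t */
        memcpy(r, t, num * sizeof(uint32_t));
}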
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 5cd3cd2..e8f6b05 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -527,8 +527,10 @@
&jle (&label("sqradd"));
&mov ($carry,"edx");
- &lea ("edx",&DWP(0,$sbit,"edx",2));
+ &add ("edx","edx");
&shr ($carry,31);
+ &add ("edx",$sbit);
+ &adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
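The hunk above is a carry-propagation fix in the squaring tail: LEA can
compute sbit + 2*edx in one instruction but sets no flags, so a carry
out of that addition was silently dropped (on this path the saved sbit
accumulates earlier adc results and can exceed 1). A small C model of
the corrected sequence, with illustrative names:

#include <stdint.h>

/* Compute 2*w + sbit across a 32-bit word without losing carries. */
static void double_add(uint32_t w, uint32_t sbit,
                       uint32_t *out, uint32_t *carry)
{
    uint32_t c = w >> 31;   /* shr: bit shifted out by the doubling */
    uint32_t d = w << 1;    /* add edx,edx */
    d += sbit;              /* add edx,sbit */
    if (d < sbit)           /* adc carry,0: carry out of the addition */
        c++;
    *out = d;
    *carry = c;
}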
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index acb0b40..329946e 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -66,7 +66,7 @@
#undef sqr
/*
- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
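The constraint comment above refers to the inline-asm multiply macros in
this file. A minimal self-contained sketch of the idea (assuming GCC or
Clang on x86_64; the function name is illustrative):

#include <stdint.h>

/* 64x64 -> 128-bit multiply via MULQ. Passing the multiplicand through
 * an "m" (memory) constraint is what "favor DirectPath µ-code" means:
 * on the AMD cores in question, mul with a memory operand decodes down
 * the cheaper direct path. */
static inline void mul_64x64(uint64_t a, uint64_t b,
                             uint64_t *hi, uint64_t *lo)
{
    uint64_t h, l;
    __asm__ ("mulq %3"              /* RDX:RAX = RAX * operand 3 */
             : "=a"(l), "=d"(h)
             : "a"(a), "m"(b)
             : "cc");
    *hi = h;
    *lo = l;
}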
diff --git a/crypto/bn/asm/x86_64-gf2m.S b/crypto/bn/asm/x86_64-gf2m.S
new file mode 100644
index 0000000..ccd2ed7
--- /dev/null
+++ b/crypto/bn/asm/x86_64-gf2m.S
@@ -0,0 +1,291 @@
+.text
+
+.type _mul_1x1,@function
+.align 16
+_mul_1x1:
+ subq $128+8,%rsp
+ movq $-1,%r9
+ leaq (%rax,%rax,1),%rsi
+ shrq $3,%r9
+ leaq (,%rax,4),%rdi
+ andq %rax,%r9
+ leaq (,%rax,8),%r12
+ sarq $63,%rax
+ leaq (%r9,%r9,1),%r10
+ sarq $63,%rsi
+ leaq (,%r9,4),%r11
+ andq %rbp,%rax
+ sarq $63,%rdi
+ movq %rax,%rdx
+ shlq $63,%rax
+ andq %rbp,%rsi
+ shrq $1,%rdx
+ movq %rsi,%rcx
+ shlq $62,%rsi
+ andq %rbp,%rdi
+ shrq $2,%rcx
+ xorq %rsi,%rax
+ movq %rdi,%rbx
+ shlq $61,%rdi
+ xorq %rcx,%rdx
+ shrq $3,%rbx
+ xorq %rdi,%rax
+ xorq %rbx,%rdx
+
+ movq %r9,%r13
+ movq $0,0(%rsp)
+ xorq %r10,%r13
+ movq %r9,8(%rsp)
+ movq %r11,%r14
+ movq %r10,16(%rsp)
+ xorq %r12,%r14
+ movq %r13,24(%rsp)
+
+ xorq %r11,%r9
+ movq %r11,32(%rsp)
+ xorq %r11,%r10
+ movq %r9,40(%rsp)
+ xorq %r11,%r13
+ movq %r10,48(%rsp)
+ xorq %r14,%r9
+ movq %r13,56(%rsp)
+ xorq %r14,%r10
+
+ movq %r12,64(%rsp)
+ xorq %r14,%r13
+ movq %r9,72(%rsp)
+ xorq %r11,%r9
+ movq %r10,80(%rsp)
+ xorq %r11,%r10
+ movq %r13,88(%rsp)
+
+ xorq %r11,%r13
+ movq %r14,96(%rsp)
+ movq %r8,%rsi
+ movq %r9,104(%rsp)
+ andq %rbp,%rsi
+ movq %r10,112(%rsp)
+ shrq $4,%rbp
+ movq %r13,120(%rsp)
+ movq %r8,%rdi
+ andq %rbp,%rdi
+ shrq $4,%rbp
+
+ movq (%rsp,%rsi,8),%xmm0
+ movq %r8,%rsi
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $4,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $60,%rbx
+ xorq %rcx,%rax
+ pslldq $1,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $12,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $52,%rbx
+ xorq %rcx,%rax
+ pslldq $2,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $20,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $44,%rbx
+ xorq %rcx,%rax
+ pslldq $3,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $28,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $36,%rbx
+ xorq %rcx,%rax
+ pslldq $4,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $36,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $28,%rbx
+ xorq %rcx,%rax
+ pslldq $5,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $44,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $20,%rbx
+ xorq %rcx,%rax
+ pslldq $6,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $52,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $12,%rbx
+ xorq %rcx,%rax
+ pslldq $7,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %rcx,%rbx
+ shlq $60,%rcx
+.byte 102,72,15,126,198
+ shrq $4,%rbx
+ xorq %rcx,%rax
+ psrldq $8,%xmm0
+ xorq %rbx,%rdx
+.byte 102,72,15,126,199
+ xorq %rsi,%rax
+ xorq %rdi,%rdx
+
+ addq $128+8,%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+ movq OPENSSL_ia32cap_P(%rip),%rax
+ btq $33,%rax
+ jnc .Lvanilla_mul_2x2
+
+.byte 102,72,15,110,198
+.byte 102,72,15,110,201
+.byte 102,72,15,110,210
+.byte 102,73,15,110,216
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+.byte 102,15,58,68,193,0
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+.byte 102,15,58,68,211,0
+.byte 102,15,58,68,229,0
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4
+ movdqa %xmm4,%xmm5
+ pslldq $8,%xmm4
+ psrldq $8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ .byte 0xf3,0xc3
+
+.align 16
+.Lvanilla_mul_2x2:
+ leaq -136(%rsp),%rsp
+ movq %r14,80(%rsp)
+ movq %r13,88(%rsp)
+ movq %r12,96(%rsp)
+ movq %rbp,104(%rsp)
+ movq %rbx,112(%rsp)
+.Lbody_mul_2x2:
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+ movq %rdx,48(%rsp)
+ movq %rcx,56(%rsp)
+ movq %r8,64(%rsp)
+
+ movq $15,%r8
+ movq %rsi,%rax
+ movq %rcx,%rbp
+ call _mul_1x1
+ movq %rax,16(%rsp)
+ movq %rdx,24(%rsp)
+
+ movq 48(%rsp),%rax
+ movq 64(%rsp),%rbp
+ call _mul_1x1
+ movq %rax,0(%rsp)
+ movq %rdx,8(%rsp)
+
+ movq 40(%rsp),%rax
+ movq 56(%rsp),%rbp
+ xorq 48(%rsp),%rax
+ xorq 64(%rsp),%rbp
+ call _mul_1x1
+ movq 0(%rsp),%rbx
+ movq 8(%rsp),%rcx
+ movq 16(%rsp),%rdi
+ movq 24(%rsp),%rsi
+ movq 32(%rsp),%rbp
+
+ xorq %rdx,%rax
+ xorq %rcx,%rdx
+ xorq %rbx,%rax
+ movq %rbx,0(%rbp)
+ xorq %rdi,%rdx
+ movq %rsi,24(%rbp)
+ xorq %rsi,%rax
+ xorq %rsi,%rdx
+ xorq %rdx,%rax
+ movq %rdx,16(%rbp)
+ movq %rax,8(%rbp)
+
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbp
+ movq 112(%rsp),%rbx
+ leaq 136(%rsp),%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
new file mode 100644
index 0000000..cf9f48e
--- /dev/null
+++ b/crypto/bn/asm/x86_64-gf2m.pl
@@ -0,0 +1,389 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging, mechanical port from C for
+# the time being... Except that it has two code paths: code suitable
+# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# Vanilla code path is at most 20% faster than compiler-generated code
+# [not very impressive], while the PCLMULQDQ one is a whole 85%-160%
+# better on 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in
+# mind that these coefficients are not those for bn_GF2m_mul_2x2
+# itself, as not all CPU time is burnt in it...
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" $xlate $flavour $output";
+
+($lo,$hi)=("%rax","%rdx"); $a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@abi-omnipotent
+.align 16
+_mul_1x1:
+ sub \$128+8,%rsp
+ mov \$-1,$a1
+ lea ($a,$a),$i0
+ shr \$3,$a1
+ lea (,$a,4),$i1
+ and $a,$a1 # a1=a&0x1fffffffffffffff
+ lea (,$a,8),$a8
+ sar \$63,$a # broadcast 63rd bit
+ lea ($a1,$a1),$a2
+ sar \$63,$i0 # broadcast 62nd bit
+ lea (,$a1,4),$a4
+ and $b,$a
+	sar	\$63,$i1		# broadcast 61st bit
+ mov $a,$hi # $a is $lo
+ shl \$63,$lo
+ and $b,$i0
+ shr \$1,$hi
+ mov $i0,$t1
+ shl \$62,$i0
+ and $b,$i1
+ shr \$2,$t1
+ xor $i0,$lo
+ mov $i1,$t0
+ shl \$61,$i1
+ xor $t1,$hi
+ shr \$3,$t0
+ xor $i1,$lo
+ xor $t0,$hi
+
+ mov $a1,$a12
+ movq \$0,0(%rsp) # tab[0]=0
+ xor $a2,$a12 # a1^a2
+ mov $a1,8(%rsp) # tab[1]=a1
+ mov $a4,$a48
+ mov $a2,16(%rsp) # tab[2]=a2
+ xor $a8,$a48 # a4^a8
+ mov $a12,24(%rsp) # tab[3]=a1^a2
+
+ xor $a4,$a1
+ mov $a4,32(%rsp) # tab[4]=a4
+ xor $a4,$a2
+ mov $a1,40(%rsp) # tab[5]=a1^a4
+ xor $a4,$a12
+ mov $a2,48(%rsp) # tab[6]=a2^a4
+ xor $a48,$a1 # a1^a4^a4^a8=a1^a8
+ mov $a12,56(%rsp) # tab[7]=a1^a2^a4
+	xor	$a48,$a2		# a2^a4^a4^a8=a2^a8
+
+ mov $a8,64(%rsp) # tab[8]=a8
+ xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
+ mov $a1,72(%rsp) # tab[9]=a1^a8
+ xor $a4,$a1 # a1^a8^a4
+ mov $a2,80(%rsp) # tab[10]=a2^a8
+ xor $a4,$a2 # a2^a8^a4
+ mov $a12,88(%rsp) # tab[11]=a1^a2^a8
+
+ xor $a4,$a12 # a1^a2^a8^a4
+ mov $a48,96(%rsp) # tab[12]=a4^a8
+ mov $mask,$i0
+ mov $a1,104(%rsp) # tab[13]=a1^a4^a8
+ and $b,$i0
+ mov $a2,112(%rsp) # tab[14]=a2^a4^a8
+ shr \$4,$b
+ mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
+ mov $mask,$i1
+ and $b,$i1
+ shr \$4,$b
+
+ movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
+ mov $mask,$i0
+ and $b,$i0
+ shr \$4,$b
+___
+ for ($n=1;$n<8;$n++) {
+ $code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $mask,$i1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ and $b,$i1
+ movq (%rsp,$i0,8),$Tx
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ pslldq \$$n,$Tx
+ mov $mask,$i0
+ shr \$4,$b
+ xor $t0,$hi
+ and $b,$i0
+ shr \$4,$b
+ pxor $Tx,$R
+___
+ }
+$code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ movq $R,$i0
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ psrldq \$8,$R
+ xor $t0,$hi
+ movq $R,$i1
+ xor $i0,$lo
+ xor $i1,$hi
+
+ add \$128+8,%rsp
+ ret
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+___
+
+($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@abi-omnipotent
+.align 16
+bn_GF2m_mul_2x2:
+ mov OPENSSL_ia32cap_P(%rip),%rax
+ bt \$33,%rax
+ jnc .Lvanilla_mul_2x2
+
+ movq $a1,%xmm0
+ movq $b1,%xmm1
+ movq $a0,%xmm2
+___
+$code.=<<___ if ($win64);
+ movq 40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+ movq $b0,%xmm3
+___
+$code.=<<___;
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+ pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+ pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
+ pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ movdqa %xmm4,%xmm5
+ pslldq \$8,%xmm4
+ psrldq \$8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0($rp)
+ movdqu %xmm0,16($rp)
+ ret
+
+.align 16
+.Lvanilla_mul_2x2:
+ lea -8*17(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ mov `8*17+40`(%rsp),$b0
+ mov %rdi,8*15(%rsp)
+ mov %rsi,8*16(%rsp)
+___
+$code.=<<___;
+ mov %r14,8*10(%rsp)
+ mov %r13,8*11(%rsp)
+ mov %r12,8*12(%rsp)
+ mov %rbp,8*13(%rsp)
+ mov %rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+ mov $rp,32(%rsp) # save the arguments
+ mov $a1,40(%rsp)
+ mov $a0,48(%rsp)
+ mov $b1,56(%rsp)
+ mov $b0,64(%rsp)
+
+ mov \$0xf,$mask
+ mov $a1,$a
+ mov $b1,$b
+ call _mul_1x1 # a1·b1
+ mov $lo,16(%rsp)
+ mov $hi,24(%rsp)
+
+ mov 48(%rsp),$a
+ mov 64(%rsp),$b
+ call _mul_1x1 # a0·b0
+ mov $lo,0(%rsp)
+ mov $hi,8(%rsp)
+
+ mov 40(%rsp),$a
+ mov 56(%rsp),$b
+ xor 48(%rsp),$a
+ xor 64(%rsp),$b
+ call _mul_1x1 # (a0+a1)·(b0+b1)
+___
+ @r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+ mov 0(%rsp),@r[0]
+ mov 8(%rsp),@r[1]
+ mov 16(%rsp),@r[2]
+ mov 24(%rsp),@r[3]
+ mov 32(%rsp),%rbp
+
+ xor $hi,$lo
+ xor @r[1],$hi
+ xor @r[0],$lo
+ mov @r[0],0(%rbp)
+ xor @r[2],$hi
+ mov @r[3],24(%rbp)
+ xor @r[3],$lo
+ xor @r[3],$hi
+ xor $hi,$lo
+ mov $hi,16(%rbp)
+ mov $lo,8(%rbp)
+
+ mov 8*10(%rsp),%r14
+ mov 8*11(%rsp),%r13
+ mov 8*12(%rsp),%r12
+ mov 8*13(%rsp),%rbp
+ mov 8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+ mov 8*15(%rsp),%rdi
+ mov 8*16(%rsp),%rsi
+___
+$code.=<<___;
+ lea 8*17(%rsp),%rsp
+ ret
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody_mul_2x2(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lin_prologue
+
+ mov 8*10(%rax),%r14 # mimic epilogue
+ mov 8*11(%rax),%r13
+ mov 8*12(%rax),%r12
+ mov 8*13(%rax),%rbp
+ mov 8*14(%rax),%rbx
+ mov 8*15(%rax),%rdi
+ mov 8*16(%rax),%rsi
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+.Lin_prologue:
+ lea 8*17(%rax),%rax
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva _mul_1x1
+ .rva .Lend_mul_1x1
+ .rva .LSEH_info_1x1
+
+ .rva .Lvanilla_mul_2x2
+ .rva .Lend_mul_2x2
+ .rva .LSEH_info_2x2
+.section .xdata
+.align 8
+.LSEH_info_1x1:
+ .byte 0x01,0x07,0x02,0x00
+ .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
+.LSEH_info_2x2:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
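The _mul_1x1 table path generated above is the 64-bit analogue of the
32-bit scheme: sixteen precomputed multiples of a (masked to 61 bits),
b consumed four bits at a time, and the three masked-off top bits of a
folded in via the sign-broadcast masks. A hedged C model (illustrative
name; the asm also mirrors half of the window work into SSE2 registers,
which is omitted here):

#include <stdint.h>

static void gf2m_mul_1x1_64(uint64_t *r1, uint64_t *r0,
                            uint64_t a, uint64_t b)
{
    uint64_t top3b = a >> 61;
    uint64_t a1 = a & 0x1FFFFFFFFFFFFFFFULL;   /* mask top three bits */
    uint64_t a2 = a1 << 1, a4 = a2 << 1, a8 = a4 << 1;
    uint64_t tab[16], lo, hi = 0;
    int n;

    for (n = 0; n < 16; n++)    /* loop form of the asm's XOR chain */
        tab[n] = ((n & 1) ? a1 : 0) ^ ((n & 2) ? a2 : 0)
               ^ ((n & 4) ? a4 : 0) ^ ((n & 8) ? a8 : 0);

    lo = tab[b & 0xf];
    for (n = 4; n <= 60; n += 4) {
        uint64_t s = tab[(b >> n) & 0xf];
        lo ^= s << n;
        hi ^= s >> (64 - n);
    }
    if (top3b & 1) { lo ^= b << 61; hi ^= b >> 3; }
    if (top3b & 2) { lo ^= b << 62; hi ^= b >> 2; }
    if (top3b & 4) { lo ^= b << 63; hi ^= b >> 1; }
    *r1 = hi;
    *r0 = lo;
}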
diff --git a/crypto/bn/asm/x86_64-mont.S b/crypto/bn/asm/x86_64-mont.S
new file mode 100644
index 0000000..95e2905
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont.S
@@ -0,0 +1,1374 @@
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.type bn_sqr4x_mont,@function
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ movq %rsp,%r11
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,2),%rsp
+ andq $-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdi,32(%rsp)
+ movq %rcx,40(%rsp)
+ movq %r8,48(%rsp)
+ movq %r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,16(%rdi,%rcx,1)
+
+
+ movq 24(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 32(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ leaq 16(%rbp),%rbp
+ movq %r12,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ movq -24(%rdi,%rbp,1),%r10
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ addq -16(%rdi,%rbp,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+ xorq %r12,%r12
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ leaq 16(%rcx),%rcx
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi)
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi)
+
+ movq -8(%rsi),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ movq %rdx,%r13
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi)
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 64(%rsp,%r9,2),%rdi
+ xorq %r10,%r10
+ movq -24(%rdi,%rbp,2),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,-40(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+ movq 40(%rsp),%rsi
+ movq 48(%rsp),%r8
+ xorq %rcx,%rcx
+ movq %r9,0(%rsp)
+ subq %r9,%rcx
+ movq 64(%rsp),%r10
+ movq %r8,%r14
+ leaq 64(%rsp,%r9,2),%rax
+ leaq 64(%rsp,%r9,1),%rdi
+ movq %rax,8(%rsp)
+ leaq (%rsi,%r9,1),%rsi
+ xorq %rbp,%rbp
+
+ movq 0(%rsi,%rcx,1),%rax
+ movq 8(%rsi,%rcx,1),%r9
+ imulq %r10,%r14
+ movq %rax,%rbx
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r8,%r15
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+ imulq %r11,%r15
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq (%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 8(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ cmpq $0,%rcx
+ jne .Lsqr4x_mont_inner
+
+ subq 0(%rsp),%rcx
+ movq %r8,%r14
+
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi)
+
+ xorq %r11,%r11
+ addq (%rdi),%r10
+ adcq $0,%r11
+ movq 0(%rsi,%rcx,1),%rbx
+ addq %rbp,%r10
+ adcq $0,%r11
+
+ imulq 16(%rdi,%rcx,1),%r14
+ xorq %r12,%r12
+ movq 8(%rsi,%rcx,1),%r9
+ addq %r10,%r13
+ movq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi)
+
+ xorq %rbp,%rbp
+ addq 8(%rdi),%r12
+ adcq %rbp,%rbp
+ addq %r11,%r12
+ leaq 16(%rdi),%rdi
+ adcq $0,%rbp
+ movq %r12,-8(%rdi)
+ cmpq 8(%rsp),%rdi
+ jb .Lsqr4x_mont_outer
+
+ movq 0(%rsp),%r9
+ movq %rbp,(%rdi)
+ movq 64(%rsp,%r9,1),%rax
+ leaq 64(%rsp,%r9,1),%rbx
+ movq 40(%rsp),%rsi
+ shrq $5,%r9
+ movq 8(%rbx),%rdx
+ xorq %rbp,%rbp
+
+ movq 32(%rsp),%rdi
+ subq 0(%rsi),%rax
+ movq 16(%rbx),%r10
+ movq 24(%rbx),%r11
+ sbbq 8(%rsi),%rdx
+ leaq -1(%r9),%rcx
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ movq %rax,0(%rdi,%rbp,8)
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq 32(%rbx,%rbp,8),%rax
+ movq 40(%rbx,%rbp,8),%rdx
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+ movq %r11,24(%rdi,%rbp,8)
+ sbbq 32(%rsi,%rbp,8),%rax
+ movq 48(%rbx,%rbp,8),%r10
+ movq 56(%rbx,%rbp,8),%r11
+ sbbq 40(%rsi,%rbp,8),%rdx
+ leaq 4(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %rax,0(%rdi,%rbp,8)
+ movq 32(%rbx,%rbp,8),%rax
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+
+ sbbq $0,%rax
+ movq %r11,24(%rdi,%rbp,8)
+ xorq %rbp,%rbp
+ andq %rax,%rbx
+ notq %rax
+ movq %rdi,%rsi
+ andq %rax,%rsi
+ leaq -1(%r9),%rcx
+ orq %rsi,%rbx
+
+ pxor %xmm0,%xmm0
+ leaq 64(%rsp,%r9,8),%rsi
+ movdqu (%rbx),%xmm1
+ leaq (%rsi,%r9,8),%rsi
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,(%rsi)
+ movdqu %xmm1,(%rdi)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqu 32(%rbx,%rbp,1),%xmm1
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,96(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqa %xmm0,32(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movdqu %xmm1,32(%rdi,%rbp,1)
+ leaq 32(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_copy
+
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movq 56(%rsp),%rsi
+ movq $1,%rax
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
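One detail worth calling out in the .Lsub/.Lcopy tails above (present in
all three Montgomery variants in this file): the final "subtract n if
t >= n" is resolved with an all-ones/all-zeros mask rather than a
branch, so the choice never becomes a data-dependent jump. A C model of
that select, under the same pointer-punning assumption the assembly
makes (names illustrative):

#include <stddef.h>
#include <stdint.h>

/* After the trial subtraction wrote t - n into rp, pick the source to
 * copy from (tp if the subtraction borrowed, rp otherwise) without
 * branching, and zero the temporary vector in the same pass. */
static void masked_copy(uint64_t *rp, uint64_t *tp,
                        uint64_t borrow, size_t num)
{
    uintptr_t mask = (uintptr_t)0 - (uintptr_t)(borrow & 1); /* 0 or ~0 */
    const uint64_t *src =
        (const uint64_t *)(((uintptr_t)tp & mask) |
                           ((uintptr_t)rp & ~mask));

    for (size_t i = 0; i < num; i++) {
        rp[i] = src[i];   /* rp[i] = borrow ? tp[i] : rp[i] */
        tp[i] = 0;        /* zap the scratch words */
    }
}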
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 3b7a6f2..17fb94c 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -15,6 +15,20 @@
# respectful 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but on average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such a manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -26,7 +40,8 @@
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
@@ -37,7 +52,6 @@
$num="%r9"; # int num);
$lo0="%r10";
$hi0="%r11";
-$bp="%r12"; # reassign $bp
$hi1="%r13";
$i="%r14";
$j="%r15";
@@ -51,6 +65,16 @@
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ cmp $ap,$bp
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
push %rbx
push %rbp
push %r12
@@ -66,48 +90,66 @@
and \$-1024,%rsp # minimize TLB usage
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lprologue:
- mov %rdx,$bp # $bp reassigned, remember?
-
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
- mov ($bp),$m0 # m0=bp[0]
- mov ($ap),%rax
+ mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
- mov %rdx,$hi0
+ mov ($np),%rax
- imulq $n0,%rax # "tp[0]"*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
- mulq ($np) # np[0]*m1
- add $lo0,%rax # discarded
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
.L1st:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[0]
- add $hi0,%rax
adc \$0,%rdx
- mov %rax,$lo0
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
mov ($np,$j,8),%rax
- mov %rdx,$hi0
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
+ cmp $num,$j
+ jne .L1st
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
- cmp $num,$j
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .L1st
+ mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
@@ -116,50 +158,64 @@
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
-.align 4
+ jmp .Louter
+.align 16
.Louter:
- xor $j,$j # j=0
-
mov ($bp,$i,8),$m0 # m0=bp[i]
- mov ($ap),%rax # ap[0]
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
- add (%rsp),%rax # ap[0]*bp[i]+tp[0]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
adc \$0,%rdx
- mov %rax,$lo0
- mov %rdx,$hi0
- imulq $n0,%rax # tp[0]*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
- mulq ($np,$j,8) # np[0]*m1
- add $lo0,%rax # discarded
- mov 8(%rsp),$lo0 # tp[1]
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
-.align 4
+ jmp .Linner_enter
+
+.align 16
.Linner:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[i]
- add $hi0,%rax
adc \$0,%rdx
- add %rax,$lo0 # ap[j]*bp[i]+tp[j]
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
- adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
+ cmp $num,$j
+ jne .Linner
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
- cmp $num,$j
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .Linner
xor %rdx,%rdx
add $hi0,$hi1
@@ -173,35 +229,449 @@
cmp $num,$i
jl .Louter
- lea (%rsp),$ap # borrow ap for tp
- lea -1($num),$j # j=num-1
-
- mov ($ap),%rax # tp[0]
xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
- dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- jge .Lsub
+ dec $j # doesn't affect CF!
+ jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
- lea -1($num),$j
+ mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+___
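
The .Lsub/.Lcopy tail above subtracts the modulus and then picks the copy source without branching on secret data: the and/not/or sequence builds an all-ones or all-zeros mask from the final borrow. The same select as a sketch (assuming borrow is 0 or 1):

    MASK64 = (1 << 64) - 1

    def select_ptr(tp, rp, borrow):
        mask = (-borrow) & MASK64                    # all-ones iff subtraction borrowed
        return (tp & mask) | (rp & ~mask & MASK64)   # ap = borrow ? tp : rp
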
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 4($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
mov ($ap,$j,8),%rax
- mov %rax,($rp,$j,8) # rp[i]=tp[i]
- mov $i,(%rsp,$j,8) # zap temporary vector
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=4
+ adc \$0,%rdx
+ mov $N[1],(%rsp) # tp[j-1]
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
dec $j
- jge .Lcopy
+ jnz .Lcopy4x
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
@@ -211,9 +681,823 @@
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
ret
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+{{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi"; # const BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # not used
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 4 and
+ # not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
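
The squaring code generated below follows the classic split spelled out in its comments: accumulate all cross products a[i]*a[j] with i<j, double that partial result, then fold in the a[i]*a[i] diagonal, which cuts the multiplication count roughly in half. The underlying identity as a sketch (whole-number Python; Montgomery reduction comes later):

    def square_words(a):
        # a is a little-endian list of 64-bit words
        cross = 0
        for i in range(len(a)):
            for j in range(i + 1, len(a)):
                cross += (a[i] * a[j]) << (64 * (i + j))
        diag = sum((w * w) << (128 * i) for i, w in enumerate(a))
        return (cross << 1) + diag    # == (the full integer a) squared
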
+$code.=<<___;
+.type bn_sqr4x_mont,\@function,6
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ shl \$3,${num}d # convert $num to bytes
+ xor %r10,%r10
+ mov %rsp,%r11 # put aside %rsp
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
+ and \$-1024,%rsp # minimize TLB usage
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved $rptr
+ # +40 saved $nptr
+ # +48 saved *n0
+ # +56 saved %rsp
+ # +64 t[2*$num]
+ #
+ mov $rptr,32(%rsp) # save $rptr
+ mov $nptr,40(%rsp)
+ mov $n0, 48(%rsp)
+ mov %r11, 56(%rsp) # save original %rsp
+.Lsqr4x_body:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[5]
+
+ mov 16($aptr,$j),$ai # a[6]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1] # a[5]*a[3]+t[6]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[6]*a[2]
+ add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],16($tptr,$j) # t[6]
+
+
+ mov 24($aptr,$j),$ai # a[7]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[6]*a[5]
+ add %rax,$A1[0] # a[6]*a[5]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 32($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[7]*a[4]
+ add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[7]*a[5]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[8]
+ lea 16($i),$i
+ mov $A1[0],8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mov -24($tptr,$i),$A0[0] # t[1]
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+ xor $A1[0],$A1[0]
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[3]
+
+ lea 16($j),$j
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j # j++
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr) # t[1]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr) # t[2]
+
+ mov -8($aptr),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[4]
+ mov $A1[0],8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 64(%rsp,$num,2),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov -24($tptr,$i,2),$A0[1] # t[1]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+ lea 16($i),$i
+ mov $S[3],-40($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ mov $S[3],-8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],0($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr,$i,2)
+ adc %rdx,$S[3]
+ mov $S[3],24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ add \$32,$i
+ jnz .Lsqr4x_shift_n_add
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ adc %rax,$S[2]
+ adc %rdx,$S[3]
+ mov $S[2],-16($tptr)
+ mov $S[3],-8($tptr)
+___
+}
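
The shift_n_add pass above performs the doubling word by word: each 64-bit word shifts left once, the bit shifted out (t[2*i+1]>>63 in the comments) becomes the incoming bit of the next word, and the a[i]*a[i] products are folded in with the saved carry. Just the word-wise doubling, as a sketch:

    def shl1_words(words):
        out, shift = [], 0
        for w in words:                           # little-endian 64-bit words
            out.append(((w << 1) | shift) & ((1 << 64) - 1))
            shift = w >> 63                       # carried into the next word
        return out, shift                         # final bit shifted out
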
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
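
Word-by-word reduction of the 2*num-word square proceeds exactly as in the multiplication path, only with nothing left to multiply in. As a sketch, with n0 again assumed to be -np^-1 mod 2^64:

    def mont_reduce(t, n, n0, num):
        W = 1 << 64
        for _ in range(num):
            m0 = ((t & (W - 1)) * n0) & (W - 1)   # m0 = t[0]*n0
            t = (t + m0 * n) >> 64                # low word cancels
        return t - n if t >= n else t             # == t_in * R^-1 mod n
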
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+ mov 40(%rsp),$nptr # restore $nptr
+ mov 48(%rsp),$n0 # restore *n0
+ xor $j,$j
+ mov $num,0(%rsp) # save $num
+ sub $num,$j # $j=-$num
+ mov 64(%rsp),$A0[0] # t[0] # modsched #
+ mov $n0,$m0 # # modsched #
+ lea 64(%rsp,$num,2),%rax # end of t[] buffer
+ lea 64(%rsp,$num),$tptr # end of t[] window
+ mov %rax,8(%rsp) # save end of t[] buffer
+ lea ($nptr,$num),$nptr # end of n[] buffer
+ xor $topbit,$topbit # $topbit=0
+
+ mov 0($nptr,$j),%rax # n[0] # modsched #
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
+ mov %rax,$Ni[0] # # modsched #
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xor $A0[1],$A0[1]
+ mul $m0 # n[0]*m0
+ add %rax,$A0[0] # n[0]*m0+t[0]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+ mov $n0,$m1
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[1]*m0
+ add %rax,$A0[1] # n[1]*m0+t[1]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+ imulq $A0[1],$m1
+
+ mov 16($nptr,$j),$Ni[0] # n[2]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[0]*m1
+ add %rax,$A1[0] # n[0]*m1+"t[1]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[1]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[2]*m0
+ add %rax,$A0[0] # n[2]*m0+t[2]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[3]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[1]*m1
+ add %rax,$A1[1] # n[1]*m1+"t[2]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[2]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[3]*m0
+ add %rax,$A0[1] # n[3]*m0+t[3]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ mov ($nptr,$j),$Ni[0] # n[4]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[2]*m1
+ add %rax,$A1[0] # n[2]*m1+"t[3]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr,$j) # "t[3]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[4]*m0
+ add %rax,$A0[0] # n[4]*m0+t[4]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 8($nptr,$j),$Ni[1] # n[5]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[3]*m1
+ add %rax,$A1[1] # n[3]*m1+"t[4]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr,$j) # "t[4]"
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[5]*m0
+ add %rax,$A0[1] # n[5]*m0+t[5]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+
+ mov 16($nptr,$j),$Ni[0] # n[6]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[4]*m1
+ add %rax,$A1[0] # n[4]*m1+"t[5]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[5]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[6]*m0
+ add %rax,$A0[0] # n[6]*m0+t[6]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[7]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[5]*m1
+ add %rax,$A1[1] # n[5]*m1+"t[6]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[6]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[7]*m0
+ add %rax,$A0[1] # n[7]*m0+t[7]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ cmp \$0,$j
+ jne .Lsqr4x_mont_inner
+
+ sub 0(%rsp),$j # $j=-$num # modsched #
+ mov $n0,$m0 # # modsched #
+
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[6]*m1
+ add %rax,$A1[0] # n[6]*m1+"t[7]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr) # "t[7]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr),$A0[0] # +t[8]
+ adc \$0,$A0[1]
+ mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
+ add $topbit,$A0[0]
+ adc \$0,$A0[1]
+
+ imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
+ xor $A1[0],$A1[0]
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ add $A0[0],$A1[1]
+ mov 16($tptr,$j),$A0[0] # t[0] # modsched #
+ adc \$0,$A1[0]
+ mul $m1 # n[7]*m1
+ add %rax,$A1[1] # n[7]*m1+"t[8]"
+ mov $Ni[0],%rax # # modsched #
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr) # "t[8]"
+
+ xor $topbit,$topbit
+ add 8($tptr),$A1[0] # +t[9]
+ adc $topbit,$topbit
+ add $A0[1],$A1[0]
+ lea 16($tptr),$tptr # "t[$num]>>128"
+ adc \$0,$topbit
+ mov $A1[0],-8($tptr) # "t[9]"
+ cmp 8(%rsp),$tptr # are we done?
+ jb .Lsqr4x_mont_outer
+
+ mov 0(%rsp),$num # restore $num
+ mov $topbit,($tptr) # save $topbit
+___
+}
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+ mov 64(%rsp,$num),@ri[0] # tp[0]
+ lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
+ mov 40(%rsp),$nptr # restore $nptr
+ shr \$5,$num # num/4
+ mov 8($tptr),@ri[1] # t[1]
+ xor $i,$i # i=0 and clear CF!
+
+ mov 32(%rsp),$rptr # restore $rptr
+ sub 0($nptr),@ri[0]
+ mov 16($tptr),@ri[2] # t[2]
+ mov 24($tptr),@ri[3] # t[3]
+ sbb 8($nptr),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($nptr,$i,8),@ri[2]
+ mov 32($tptr,$i,8),@ri[0] # tp[i+1]
+ mov 40($tptr,$i,8),@ri[1]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($nptr,$i,8),@ri[0]
+ mov 48($tptr,$i,8),@ri[2]
+ mov 56($tptr,$i,8),@ri[3]
+ sbb 40($nptr,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsqr4x_sub
+
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($tptr,$i,8),@ri[0] # load overflow bit
+ sbb 16($nptr,$i,8),@ri[2]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$tptr
+ not @ri[0]
+ mov $rptr,$nptr
+ and @ri[0],$nptr
+ lea -1($num),$j
+ or $nptr,$tptr # tp=borrow?tp:rp
+
+ pxor %xmm0,%xmm0
+ lea 64(%rsp,$num,8),$nptr
+ movdqu ($tptr),%xmm1
+ lea ($nptr,$num,8),$nptr
+ movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
+ movdqa %xmm0,($nptr) # zap upper half of temporary vector
+ movdqu %xmm1,($rptr)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy: # copy or in-place refresh
+ movdqu 16($tptr,$i),%xmm2
+ movdqu 32($tptr,$i),%xmm1
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+ movdqu %xmm1,32($rptr,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lsqr4x_copy
+
+ movdqu 16($tptr,$i),%xmm2
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+ mov 56(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ ret
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by "
.align 16
___
@@ -228,9 +1512,9 @@
$code.=<<___;
.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
+.type mul_handler,\@abi-omnipotent
.align 16
-se_handler:
+mul_handler:
push %rsi
push %rdi
push %rbx
@@ -245,15 +1529,20 @@
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lprologue(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lin_prologue
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
 mov 152($context),%rax # pull context->Rsp
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lin_prologue
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
@@ -272,7 +1561,53 @@
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
-.Lin_prologue:
+ jmp .Lcommon_seh_tail
+.size mul_handler,.-mul_handler
+
+.type sqr_handler,\@abi-omnipotent
+.align 16
+sqr_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lsqr4x_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lsqr_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lsqr4x_epilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
+ jae .Lcommon_seh_tail
+
+ mov 56(%rax),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
@@ -310,7 +1645,7 @@
pop %rdi
pop %rsi
ret
-.size se_handler,.-se_handler
+.size sqr_handler,.-sqr_handler
.section .pdata
.align 4
@@ -318,11 +1653,27 @@
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
+ .rva .LSEH_begin_bn_mul4x_mont
+ .rva .LSEH_end_bn_mul4x_mont
+ .rva .LSEH_info_bn_mul4x_mont
+
+ .rva .LSEH_begin_bn_sqr4x_mont
+ .rva .LSEH_end_bn_sqr4x_mont
+ .rva .LSEH_info_bn_sqr4x_mont
+
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.byte 9,0,0,0
- .rva se_handler
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
___
}
diff --git a/crypto/bn/asm/x86_64-mont5.S b/crypto/bn/asm/x86_64-mont5.S
new file mode 100644
index 0000000..49ec6ac
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont5.S
@@ -0,0 +1,784 @@
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 2(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type bn_mul4x_mont_gather5,@function
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 4(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ xorq %r15,%r15
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-40(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+ movq %rdi,-16(%rsp,%r15,8)
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+ cmpq $0,%rsi
+ jz .Lscatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subq $1,%rsi
+ jnz .Lscatter
+.Lscatter_epilogue:
+ .byte 0xf3,0xc3
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,@function
+.align 16
+bn_gather5:
+ movq %rcx,%r11
+ shrq $3,%rcx
+ andq $7,%r11
+ notq %rcx
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%rcx
+ leaq 96(%rdx,%r11,8),%rdx
+ movq 0(%rax,%rcx,8),%xmm4
+ movq 8(%rax,%rcx,8),%xmm5
+ movq 16(%rax,%rcx,8),%xmm6
+ movq 24(%rax,%rcx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq -96(%rdx),%xmm0
+ movq -32(%rdx),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%rdx),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%rdx),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%rdx),%rdx
+ por %xmm3,%xmm0
+
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subq $1,%rsi
+ jnz .Lgather
+ .byte 0xf3,0xc3
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+.align 64
+.Lmagic_masks:
+.long 0,0, 0,0, 0,0, -1,-1
+.long 0,0, 0,0, 0,0, 0,0
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
new file mode 100755
index 0000000..8f8dc5a
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,1071 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to powers table computed in BN_mod_exp_mont_consttime.
+# In addition subroutine that scatters elements of the powers table
+# is implemented, so that scatter-/gathering can be tuned without
+# bn_exp.c modifications.
+
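
The effect of the countermeasure, as a functional sketch (Python itself gives no timing guarantees; the point is that every table slot is read and the selection is done with masks, never with a secret-dependent load address):

    MASK64 = (1 << 64) - 1

    def gather(table, idx):
        r = 0
        for i, word in enumerate(table):
            m = (-int(i == idx)) & MASK64   # all-ones only at the wanted slot
            r |= word & m
        return r
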
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# int bn_mul_mont_gather5(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num,
+ # int idx); # 0 to 2^5-1, "index" in $bp holding
+ # pre-computed powers of a', interlaced
+ # in such manner that b[0] is $bp[idx],
+ # b[1] is [2^5+idx], etc.
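
So each power's words are stored strided through the table, one word every 2^5 slots, which is the layout bn_scatter5's 256-byte steps produce. As a sketch:

    def scatter5(b, table, idx):
        # table has 32*len(b) word slots; idx selects the column
        for k, word in enumerate(b):
            table[idx + 32 * k] = word    # b[k] lives at [2^5*k + idx]
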
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 2($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ movq %xmm0,$m0 # bp[1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+.align 16
+.Louter:
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ movq %xmm0,$m0 # bp[i+1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jl .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ mov $num,$j # j=num
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont_gather5,\@function,6
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 4($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ xor $j,$j # j=0
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=2
+ adc \$0,%rdx
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-40(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[i+1]
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lcopy4x
+
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl bn_scatter5
+.type bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+ cmp \$0, $num
+ jz .Lscatter_epilogue
+ lea ($tbl,$idx,8),$tbl
+.Lscatter:
+ mov ($inp),%rax
+ lea 8($inp),$inp
+ mov %rax,($tbl)
+ lea 32*8($tbl),$tbl
+ sub \$1,$num
+ jnz .Lscatter
+.Lscatter_epilogue:
+ ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,\@abi-omnipotent
+.align 16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov $idx,%r11
+ shr \$`log($N/8)/log(2)`,$idx
+ and \$`$N/8-1`,%r11
+ not $idx
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
+ lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
+ movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,$idx,8),%xmm5 # cache line contains element
+ movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,$idx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq `0*$STRIDE/4-96`($tbl),%xmm0
+ movq `1*$STRIDE/4-96`($tbl),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($tbl),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($tbl),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($tbl),$tbl
+ por %xmm3,%xmm0
+
+ movq %xmm0,($out) # m0=bp[0]
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ lea 0x28(%rsp),%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+ .long 0,0, 0,0, 0,0, -1,-1
+ .long 0,0, 0,0, 0,0, 0,0
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by "
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ lea `40+48`(%rax),%rax
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # end of alloca label
+ cmp %r10,%rbx # context->Rip<end of alloca label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
+ movaps (%rax),%xmm0
+ movaps 16(%rax),%xmm1
+ lea `40+48`(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+ movups %xmm0,512($context) # restore context->Xmm6
+ movups %xmm1,528($context) # restore context->Xmm7
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont_gather5
+ .rva .LSEH_end_bn_mul_mont_gather5
+ .rva .LSEH_info_bn_mul_mont_gather5
+
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0d,0x05,0x00
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+.align 8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
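The bn_gather5 loop generated above never branches on the secret index: it reads one quadword from each of the four cache-line-interleaved slots on every iteration and uses pand/por with the precomputed .Lmagic_masks so that only the selected entry survives. Below is a minimal C sketch of the same constant-time selection; ct_gather5 is a hypothetical helper for illustration, not part of this patch or of the OpenSSL API.

    #include <stdint.h>
    #include <stddef.h>

    /* Constant-time table read: touch every entry, keep only table[idx].
     * Mirrors the pand/por masking bn_gather5 performs with xmm
     * registers; the memory access pattern is identical for every idx,
     * so nothing leaks through the data cache. */
    static uint64_t ct_gather5(const uint64_t *table, size_t n, size_t idx)
        {
        uint64_t r = 0;
        size_t i;

        for (i = 0; i < n; i++)
            {
            uint64_t d = (uint64_t)(i ^ idx);
            /* mask is all-ones iff d == 0, i.e. iff i == idx */
            uint64_t mask = ((d | (0 - d)) >> 63) - 1;
            r |= table[i] & mask;
            }
        return r;
        }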
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index a0bc478..f34248e 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -558,6 +558,17 @@ int BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb);
int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
int do_trial_division, BN_GENCB *cb);
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
+ const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ BIGNUM *Xp1, BIGNUM *Xp2,
+ const BIGNUM *Xp,
+ const BIGNUM *e, BN_CTX *ctx,
+ BN_GENCB *cb);
+
BN_MONT_CTX *BN_MONT_CTX_new(void );
void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
@@ -612,6 +623,8 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
BN_RECP_CTX *recp, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
+
/* Functions for arithmetic over binary polynomials represented by BIGNUMs.
*
* The BIGNUM::neg property of BIGNUMs representing binary polynomials is
@@ -663,6 +676,8 @@ int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
+#endif
+
/* faster mod functions for the 'NIST primes'
* 0 <= a < p^2 */
int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
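With the new OPENSSL_NO_EC2M guard, the whole GF(2^m) API disappears from bn.h when binary-field support is configured out, so callers have to compile their binary-field paths conditionally as well. A hedged sketch of the expected call-site pattern follows; the wrapper name app_gf2m_mod is invented for illustration.

    #include <openssl/bn.h>

    /* Reduce a modulo the binary polynomial p when GF(2^m) support is
     * compiled in; fail cleanly when it was configured out. */
    int app_gf2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
        {
    #ifndef OPENSSL_NO_EC2M
        return BN_GF2m_mod(r, a, p);
    #else
        (void)r; (void)a; (void)p;
        return 0; /* binary polynomial arithmetic not available */
    #endif
        }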
diff --git a/crypto/bn/bn_blind.c b/crypto/bn/bn_blind.c
index e060592..9ed8bc2 100644
--- a/crypto/bn/bn_blind.c
+++ b/crypto/bn/bn_blind.c
@@ -126,7 +126,7 @@ struct bn_blinding_st
* used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
#endif
CRYPTO_THREADID tid;
- unsigned int counter;
+ int counter;
unsigned long flags;
BN_MONT_CTX *m_ctx;
int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
@@ -160,7 +160,10 @@ BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
- ret->counter = BN_BLINDING_COUNTER;
+ /* Set the counter to the special value -1
+ * to indicate that this is never-used fresh blinding
+ * that does not need updating before first use. */
+ ret->counter = -1;
CRYPTO_THREADID_current(&ret->tid);
return(ret);
err:
@@ -190,7 +193,10 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
goto err;
}
- if (--(b->counter) == 0 && b->e != NULL &&
+ if (b->counter == -1)
+ b->counter = 0;
+
+ if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
!(b->flags & BN_BLINDING_NO_RECREATE))
{
/* re-create blinding parameters */
@@ -205,8 +211,8 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
ret=1;
err:
- if (b->counter == 0)
- b->counter = BN_BLINDING_COUNTER;
+ if (b->counter == BN_BLINDING_COUNTER)
+ b->counter = 0;
return(ret);
}
@@ -227,6 +233,12 @@ int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
return(0);
}
+ if (b->counter == -1)
+ /* Fresh blinding, doesn't need updating. */
+ b->counter = 0;
+ else if (!BN_BLINDING_update(b,ctx))
+ return(0);
+
if (r != NULL)
{
if (!BN_copy(r, b->Ai)) ret=0;
@@ -247,22 +259,19 @@ int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ct
int ret;
bn_check_top(n);
- if ((b->A == NULL) || (b->Ai == NULL))
- {
- BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
- return(0);
- }
if (r != NULL)
ret = BN_mod_mul(n, n, r, b->mod, ctx);
else
- ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
-
- if (ret >= 0)
{
- if (!BN_BLINDING_update(b,ctx))
+ if (b->Ai == NULL)
+ {
+ BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
return(0);
+ }
+ ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
}
+
bn_check_top(n);
return(ret);
}
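The net effect of the bn_blind.c changes is a small state machine: a fresh blinding starts at the sentinel -1, is consumed once without an update, and is then updated before every subsequent use until the counter reaches BN_BLINDING_COUNTER, at which point the blinding pair is recreated. A compact sketch of that counter logic, assuming the library's BN_BLINDING_COUNTER value of 32; this is illustration, not library code.

    #define BN_BLINDING_COUNTER 32

    /* Returns nonzero when the blinding factors must be re-created on
     * this update; mirrors the counter handling in BN_BLINDING_update
     * above. */
    static int blinding_needs_recreate(int *counter)
        {
        if (*counter == -1) /* fresh, never-used blinding */
            *counter = 0;
        if (++*counter == BN_BLINDING_COUNTER)
            {
            *counter = 0;
            return 1;
            }
        return 0;
        }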
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 802a43d..7b24031 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -141,6 +141,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
*
*
*/
+#undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divl %4" \
@@ -155,6 +156,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
* Same story here, but it's 128-bit by 64-bit division. Wow!
*
*/
+# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divq %4" \
@@ -169,15 +171,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
#endif /* OPENSSL_NO_ASM */
-/* BN_div[_no_branch] computes dv := num / divisor, rounding towards
+/* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx);
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
@@ -186,6 +186,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_ULONG *resp,*wnump;
BN_ULONG d0,d1;
int num_n,div_n;
+ int no_branch=0;
/* Invalid zero-padding would have particularly bad consequences
* in the case of 'num', so don't just rely on bn_check_top() for this one
@@ -200,7 +201,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
{
- return BN_div_no_branch(dv, rm, num, divisor, ctx);
+ no_branch=1;
}
bn_check_top(dv);
@@ -214,7 +215,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
return(0);
}
- if (BN_ucmp(num,divisor) < 0)
+ if (!no_branch && BN_ucmp(num,divisor) < 0)
{
if (rm != NULL)
{ if (BN_copy(rm,num) == NULL) return(0); }
@@ -239,242 +240,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
norm_shift+=BN_BITS2;
if (!(BN_lshift(snum,num,norm_shift))) goto err;
snum->neg=0;
- div_n=sdiv->top;
- num_n=snum->top;
- loop=num_n-div_n;
- /* Lets setup a 'window' into snum
- * This is the part that corresponds to the current
- * 'area' being divided */
- wnum.neg = 0;
- wnum.d = &(snum->d[loop]);
- wnum.top = div_n;
- /* only needed when BN_ucmp messes up the values between top and max */
- wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
-
- /* Get the top 2 words of sdiv */
- /* div_n=sdiv->top; */
- d0=sdiv->d[div_n-1];
- d1=(div_n == 1)?0:sdiv->d[div_n-2];
-
- /* pointer to the 'top' of snum */
- wnump= &(snum->d[num_n-1]);
-
- /* Setup to 'res' */
- res->neg= (num->neg^divisor->neg);
- if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop;
- resp= &(res->d[loop-1]);
-
- /* space for temp */
- if (!bn_wexpand(tmp,(div_n+1))) goto err;
- if (BN_ucmp(&wnum,sdiv) >= 0)
+ if (no_branch)
{
- /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
- * bn_pollute) the const bignum arguments =>
- * clean the values between top and max again */
- bn_clear_top2max(&wnum);
- bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
- *resp=1;
- }
- else
- res->top--;
- /* if res->top == 0 then clear the neg value otherwise decrease
- * the resp pointer */
- if (res->top == 0)
- res->neg = 0;
- else
- resp--;
-
- for (i=0; i<loop-1; i++, wnump--, resp--)
- {
- BN_ULONG q,l0;
- /* the first part of the loop uses the top two words of
- * snum and sdiv to calculate a BN_ULONG q such that
- * | wnum - sdiv * q | < sdiv */
-#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
- BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
- q=bn_div_3_words(wnump,d1,d0);
-#else /* !BN_DIV3W */
- BN_ULONG n0,n1,rem=0;
-
- n0=wnump[0];
- n1=wnump[-1];
- if (n0 == d0)
- q=BN_MASK2;
- else /* n0 < d0 */
- {
-#ifdef BN_LLONG
- BN_ULLONG t2;
-
-#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
- q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
-#else
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- /*
- * rem doesn't have to be BN_ULLONG. The least we
- * know it's less that d0, isn't it?
- */
- rem=(n1-q*d0)&BN_MASK2;
-#endif
- t2=(BN_ULLONG)d1*q;
-
- for (;;)
- {
- if (t2 <= ((((BN_ULLONG)rem)< 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- rem=(n1-q*d0)&BN_MASK2;
-#endif
-
-#if defined(BN_UMULT_LOHI)
- BN_UMULT_LOHI(t2l,t2h,d1,q);
-#elif defined(BN_UMULT_HIGH)
- t2l = d1 * q;
- t2h = BN_UMULT_HIGH(d1,q);
-#else
+ /* Since we don't know whether snum is larger than sdiv,
+ * we pad snum with enough zeroes without changing its
+ * value.
+ */
+ if (snum->top <= sdiv->top+1)
{
- BN_ULONG ql, qh;
- t2l=LBITS(d1); t2h=HBITS(d1);
- ql =LBITS(q); qh =HBITS(q);
- mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+ if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+ for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+ snum->top = sdiv->top + 2;
}
-#endif
-
- for (;;)
- {
- if ((t2h < rem) ||
- ((t2h == rem) && (t2l <= wnump[-2])))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- if (t2l < d1) t2h--; t2l -= d1;
- }
-#endif /* !BN_LLONG */
- }
-#endif /* !BN_DIV3W */
-
- l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
- tmp->d[div_n]=l0;
- wnum.d--;
- /* ingore top values of the bignums just sub the two
- * BN_ULONG arrays with bn_sub_words */
- if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+ else
{
- /* Note: As we have considered only the leading
- * two BN_ULONGs in the calculation of q, sdiv * q
- * might be greater than wnum (but then (q-1) * sdiv
- * is less or equal than wnum)
- */
- q--;
- if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
- /* we can't have an overflow here (assuming
- * that q != 0, but if q == 0 then tmp is
- * zero anyway) */
- (*wnump)++;
+ if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+ snum->d[snum->top] = 0;
+ snum->top ++;
}
- /* store part of the result */
- *resp = q;
- }
- bn_correct_top(snum);
- if (rm != NULL)
- {
- /* Keep a copy of the neg flag in num because if rm==num
- * BN_rshift() will overwrite it.
- */
- int neg = num->neg;
- BN_rshift(rm,snum,norm_shift);
- if (!BN_is_zero(rm))
- rm->neg = neg;
- bn_check_top(rm);
- }
- BN_CTX_end(ctx);
- return(1);
-err:
- bn_check_top(rm);
- BN_CTX_end(ctx);
- return(0);
- }
-
-
-/* BN_div_no_branch is a special version of BN_div. It does not contain
- * branches that may leak sensitive information.
- */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx)
- {
- int norm_shift,i,loop;
- BIGNUM *tmp,wnum,*snum,*sdiv,*res;
- BN_ULONG *resp,*wnump;
- BN_ULONG d0,d1;
- int num_n,div_n;
-
- bn_check_top(dv);
- bn_check_top(rm);
- /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
- bn_check_top(divisor);
-
- if (BN_is_zero(divisor))
- {
- BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
- return(0);
- }
-
- BN_CTX_start(ctx);
- tmp=BN_CTX_get(ctx);
- snum=BN_CTX_get(ctx);
- sdiv=BN_CTX_get(ctx);
- if (dv == NULL)
- res=BN_CTX_get(ctx);
- else res=dv;
- if (sdiv == NULL || res == NULL) goto err;
-
- /* First we normalise the numbers */
- norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
- if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
- sdiv->neg=0;
- norm_shift+=BN_BITS2;
- if (!(BN_lshift(snum,num,norm_shift))) goto err;
- snum->neg=0;
-
- /* Since we don't know whether snum is larger than sdiv,
- * we pad snum with enough zeroes without changing its
- * value.
- */
- if (snum->top <= sdiv->top+1)
- {
- if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
- for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
- snum->top = sdiv->top + 2;
- }
- else
- {
- if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
- snum->d[snum->top] = 0;
- snum->top ++;
}
div_n=sdiv->top;
@@ -500,12 +284,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
/* Setup to 'res' */
res->neg= (num->neg^divisor->neg);
if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop-1;
+ res->top=loop-no_branch;
resp= &(res->d[loop-1]);
/* space for temp */
if (!bn_wexpand(tmp,(div_n+1))) goto err;
+ if (!no_branch)
+ {
+ if (BN_ucmp(&wnum,sdiv) >= 0)
+ {
+ /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
+ * bn_pollute) the const bignum arguments =>
+ * clean the values between top and max again */
+ bn_clear_top2max(&wnum);
+ bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+ *resp=1;
+ }
+ else
+ res->top--;
+ }
+
/* if res->top == 0 then clear the neg value otherwise decrease
* the resp pointer */
if (res->top == 0)
@@ -638,7 +437,7 @@ X) -> 0x%08X\n",
rm->neg = neg;
bn_check_top(rm);
}
- bn_correct_top(res);
+ if (no_branch) bn_correct_top(res);
BN_CTX_end(ctx);
return(1);
err:
@@ -646,5 +445,4 @@ X) -> 0x%08X\n",
BN_CTX_end(ctx);
return(0);
}
-
#endif
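Merging BN_div_no_branch into BN_div leaves a single long-division loop; in constant-time mode the code only skips the early BN_ucmp shortcut and pads snum so that the loop count and memory layout depend on operand widths alone, not on values. The padding rule in isolation, as a sketch over plain word arrays; ct_pad_numerator is an invented name.

    #include <string.h>

    /* Zero-pad the numerator to the width the constant-time path
     * expects: at least div_n+2 words, otherwise one extra zero word.
     * Returns the new word count, or 0 if the buffer (cap words) is
     * too small. */
    static int ct_pad_numerator(unsigned long *num, int num_n, int div_n, int cap)
        {
        int want = (num_n <= div_n + 1) ? div_n + 2 : num_n + 1;

        if (want > cap)
            return 0;
        memset(num + num_n, 0, (size_t)(want - num_n) * sizeof(*num));
        return want;
        }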
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index d9b6c73..2abf6fd 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -113,6 +113,18 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+# define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+# define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE 32
@@ -522,23 +534,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
* as cache lines are concerned. The following functions are used to transfer a BIGNUM
* from/to that table. */
-static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
{
size_t i, j;
- if (bn_wexpand(b, top) == NULL)
- return 0;
- while (b->top < top)
- {
- b->d[b->top++] = 0;
- }
-
+ if (top > b->top)
+ top = b->top; /* this works because 'buf' is explicitly zeroed */
for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
{
buf[j] = ((unsigned char*)b->d)[i];
}
- bn_correct_top(b);
return 1;
}
@@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
/* Given a pointer value, compute the next address that is a cache line multiple. */
#define MOD_EXP_CTIME_ALIGN(x_) \
- ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
/* This variant of BN_mod_exp_mont() uses fixed windows and the special
* precomputation memory layout to limit data-dependency to a minimum
@@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
- int i,bits,ret=0,idx,window,wvalue;
+ int i,bits,ret=0,window,wvalue;
int top;
- BIGNUM *r;
- const BIGNUM *aa;
BN_MONT_CTX *mont=NULL;
int numPowers;
unsigned char *powerbufFree=NULL;
int powerbufLen = 0;
unsigned char *powerbuf=NULL;
- BIGNUM *computeTemp=NULL, *am=NULL;
+ BIGNUM tmp, am;
bn_check_top(a);
bn_check_top(p);
@@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
return ret;
}
- /* Initialize BIGNUM context and allocate intermediate result */
BN_CTX_start(ctx);
- r = BN_CTX_get(ctx);
- if (r == NULL) goto err;
/* Allocate a montgomery context if it was not supplied by the caller.
* If this is not done, things will break in the montgomery part.
@@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+ if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
+#endif
/* Allocate a buffer large enough to hold all of the pre-computed
- * powers of a.
+ * powers of am, am itself and tmp.
*/
numPowers = 1 << window;
- powerbufLen = sizeof(m->d[0])*top*numPowers;
+ powerbufLen = sizeof(m->d[0])*(top*numPowers +
+ ((2*top)>numPowers?(2*top):numPowers));
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+ else
+#endif
if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
goto err;
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
memset(powerbuf, 0, powerbufLen);
- /* Initialize the intermediate result. Do this early to save double conversion,
- * once each for a^0 and intermediate result.
- */
- if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = NULL;
+#endif
- /* Initialize computeTemp as a^1 with montgomery precalcs */
- computeTemp = BN_CTX_get(ctx);
- am = BN_CTX_get(ctx);
- if (computeTemp==NULL || am==NULL) goto err;
+ /* lay down tmp and am right after powers table */
+ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+ am.d = tmp.d + top;
+ tmp.top = am.top = 0;
+ tmp.dmax = am.dmax = top;
+ tmp.neg = am.neg = 0;
+ tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+ /* prepare a^0 in Montgomery domain */
+#if 1
+ if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
+#else
+ tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
+ for (i=1;i<top;i++) tmp.d[i] = (~m->d[i])&BN_MASK2;
+ tmp.top = top;
+#endif
+ /* prepare a^1 in Montgomery domain */
if (a->neg || BN_ucmp(a,m) >= 0)
{
- if (!BN_mod(am,a,m,ctx))
- goto err;
- aa= am;
+ if (!BN_mod(&am,a,m,ctx)) goto err;
+ if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
}
- else
- aa=a;
- if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
- if (!BN_copy(computeTemp, am)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+ else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+ /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+ * specifically optimization of cache-timing attack countermeasures
+ * and pre-computation optimization. */
+
+ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+ * 512-bit RSA is hardly relevant, we omit it to spare size... */
+ if (window==5)
+ {
+ void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+ const void *table,const BN_ULONG *np,
+ const BN_ULONG *n0,int num,int power);
+ void bn_scatter5(const BN_ULONG *inp,size_t num,
+ void *table,size_t power);
+ void bn_gather5(BN_ULONG *out,size_t num,
+ void *table,size_t power);
+
+ BN_ULONG *np=mont->N.d, *n0=mont->n0;
+
+ /* BN_to_montgomery can contaminate words above .top
+ * [in BN_DEBUG[_DEBUG] build]... */
+ for (i=am.top; i<top; i++) am.d[i]=0;
+ for (i=tmp.top; i<top; i++) tmp.d[i]=0;
+
+ bn_scatter5(tmp.d,top,powerbuf,0);
+ bn_scatter5(am.d,am.top,powerbuf,1);
+ bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2);
+
+#if 0
+ for (i=3; i<32; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#else
+ /* same thing, but twice as fast */
+ for (i=4; i<32; i*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+ for (i=3; i<8; i+=2)
+ {
+ int j;
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ for (j=2*i; j<32; j*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,j);
+ }
+ }
+ for (; i<16; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2*i);
+ }
+ for (; i<32; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#endif
+
+ bits--;
+ for (wvalue=0, i=bits%5; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ bn_gather5(tmp.d,top,powerbuf,wvalue);
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
+ {
+ for (wvalue=0, i=0; i<5; i++,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
+ }
+
+ tmp.top=top;
+ bn_correct_top(&tmp);
+ }
+ else
+#endif
+ {
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err;
/* If the window size is greater than 1, then calculate
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
@@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
*/
if (window > 1)
{
- for (i=2; i<numPowers; i++)
- {
- /* Calculate a^i = a^(i-1) * a */
- if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
- goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+ for (i=2; i<numPowers; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
+ goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
 }
 }

- /* Adjust the number of bits up to a multiple of the window size. */
- bits = ((bits+window-1)/window)*window;
- idx=bits-1; /* The top bit of the window */
-
- while (idx >= 0)
+ bits--;
+ for (wvalue=0, i=bits%window; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
{
wvalue=0; /* The 'value' of the window */
/* Scan the window, squaring the result as we go */
- for (i=0; i> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
#endif
+#if !defined(OPENSSL_BN_ASM_GF2m)
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
* result is a polynomial r with degree < 2 * BN_BITS - 1
* The caller MUST ensure that the variables have the right amount
@@ -216,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c
r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */
r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */
}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif
/* Add polynomials a and b and store result in r; r could be a or b, a and b
* could be equal; r is the bitwise XOR of a and b.
@@ -360,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
{
int ret = 0;
- const int max = BN_num_bits(p) + 1;
- int *arr=NULL;
+ int arr[6];
bn_check_top(a);
bn_check_top(p);
- if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
- ret = BN_GF2m_poly2arr(p, arr, max);
- if (!ret || ret > max)
+ ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+ if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
{
BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
- goto err;
+ return 0;
}
ret = BN_GF2m_mod_arr(r, a, arr);
bn_check_top(r);
-err:
- if (arr) OPENSSL_free(arr);
return ret;
}
@@ -521,7 +522,7 @@ int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
*/
int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
- BIGNUM *b, *c, *u, *v, *tmp;
+ BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
int ret = 0;
bn_check_top(a);
@@ -529,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
BN_CTX_start(ctx);
- b = BN_CTX_get(ctx);
- c = BN_CTX_get(ctx);
- u = BN_CTX_get(ctx);
- v = BN_CTX_get(ctx);
- if (v == NULL) goto err;
+ if ((b = BN_CTX_get(ctx))==NULL) goto err;
+ if ((c = BN_CTX_get(ctx))==NULL) goto err;
+ if ((u = BN_CTX_get(ctx))==NULL) goto err;
+ if ((v = BN_CTX_get(ctx))==NULL) goto err;
- if (!BN_one(b)) goto err;
if (!BN_GF2m_mod(u, a, p)) goto err;
- if (!BN_copy(v, p)) goto err;
-
if (BN_is_zero(u)) goto err;
+ if (!BN_copy(v, p)) goto err;
+#if 0
+ if (!BN_one(b)) goto err;
+
while (1)
{
while (!BN_is_odd(u))
@@ -565,13 +566,89 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
if (!BN_GF2m_add(u, u, v)) goto err;
if (!BN_GF2m_add(b, b, c)) goto err;
}
+#else
+ {
+ int i, ubits = BN_num_bits(u),
+ vbits = BN_num_bits(v), /* v is copy of p */
+ top = p->top;
+ BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+ bn_wexpand(u,top); udp = u->d;
+ for (i=u->top;itop = top;
+ bn_wexpand(b,top); bdp = b->d;
+ bdp[0] = 1;
+ for (i=1;i<top;i++) bdp[i] = 0;
+ b->top = top;
+ bn_wexpand(c,top); cdp = c->d;
+ for (i=0;i<top;i++) cdp[i] = 0;
+ c->top = top;
+ vdp = v->d; /* It pays off to "cache" *->d pointers, because
+ * it allows optimizer to be more aggressive.
+ * But we don't have to "cache" p->d, because *p
+ * is declared 'const'... */
+ while (1)
+ {
+ while (ubits && !(udp[0]&1))
+ {
+ BN_ULONG u0,u1,b0,b1,mask;
+
+ u0 = udp[0];
+ b0 = bdp[0];
+ mask = (BN_ULONG)0-(b0&1);
+ b0 ^= p->d[0]&mask;
+ for (i=0;i>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+ u0 = u1;
+ b1 = bdp[i+1]^(p->d[i+1]&mask);
+ bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+ b0 = b1;
+ }
+ udp[i] = u0>>1;
+ bdp[i] = b0>>1;
+ ubits--;
+ }
+ if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+ if (ubits<vbits)
+ {
+ int t;
+ t = ubits; ubits = vbits; vbits = t;
+ tmp = u; u = v; v = tmp;
+ tmp = b; b = c; c = tmp;
+ udp = vdp; vdp = u->d;
+ bdp = cdp; cdp = c->d;
+ }
+ for(i=0;i<top;i++)
+ {
+ udp[i] ^= vdp[i];
+ bdp[i] ^= cdp[i];
+ }
+ if (ubits==vbits)
+ {
+ BN_ULONG ul;
+ int utop = (ubits-1)/BN_BITS2;
+
+ while ((ul=udp[utop])==0 && utop) utop--;
+ ubits = utop*BN_BITS2 + BN_num_bits_word(ul);
+ }
+ }
+ bn_correct_top(b);
+ if (!BN_copy(r, b)) goto err;
+ }
+#endif

 ret = 1;

diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
 # if defined(__DECC)
 # include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-# elif defined(__GNUC__)
+# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
@@ -247,7 +247,7 @@ extern "C" {
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
@@ -257,7 +257,7 @@ extern "C" {
# endif /* compiler */
# elif (defined(__x86_64) || defined(__x86_64__)) && \
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
@@ -280,6 +280,26 @@ extern "C" {
# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
# endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+# if defined(__GNUC__) && __GNUC__>=2
+# if __GNUC__>=4 && __GNUC_MINOR__>=4 /* "h" constraint is no more since 4.4 */
+# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
+# define BN_UMULT_LOHI(low,high,a,b) ({ \
+ __uint128_t ret=(__uint128_t)(a)*(b); \
+ (high)=ret>>64; (low)=ret; })
+# else
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ asm ("dmultu %1,%2" \
+ : "=h"(ret) \
+ : "r"(a), "r"(b) : "l"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b)\
+ asm ("dmultu %2,%3" \
+ : "=l"(low),"=h"(high) \
+ : "r"(a), "r"(b));
+# endif
+# endif
# endif /* cpu */
#endif /* OPENSSL_NO_ASM */
@@ -459,6 +479,10 @@ extern "C" {
}
#endif /* !BN_LLONG */
+#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
+#undef bn_div_words
+#endif
+
void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
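The new MIPS branch above shows the two ways bn_lcl.h can obtain the high half of a 64x64 multiply: inline asm with the old "h" constraint, or a 128-bit integer type on GCC >= 4.4. The latter form, free-standing; a sketch of the same idea as the macros, assuming a compiler that provides __uint128_t.

    #include <stdint.h>

    /* High 64 bits of a*b, as BN_UMULT_HIGH computes it. */
    static uint64_t umult_high(uint64_t a, uint64_t b)
        {
        return (uint64_t)(((__uint128_t)a * b) >> 64);
        }

    /* Both halves at once, matching BN_UMULT_LOHI. */
    static void umult_lohi(uint64_t *lo, uint64_t *hi, uint64_t a, uint64_t b)
        {
        __uint128_t r = (__uint128_t)a * b;

        *lo = (uint64_t)r;
        *hi = (uint64_t)(r >> 64);
        }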
diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index 5470fbe..7a5676d 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c
@@ -139,25 +139,6 @@ const BIGNUM *BN_value_one(void)
return(&const_one);
}
-char *BN_options(void)
- {
- static int init=0;
- static char data[16];
-
- if (!init)
- {
- init++;
-#ifdef BN_LLONG
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
-#else
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
-#endif
- }
- return(data);
- }
-
int BN_num_bits_word(BN_ULONG l)
{
static const unsigned char bits[256]={
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 1a86688..427b5cf 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -177,31 +177,26 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
- BN_ULONG *ap,*np,*rp,n0,v,*nrp;
- int al,nl,max,i,x,ri;
+ BN_ULONG *ap,*np,*rp,n0,v,carry;
+ int nl,max,i;
n= &(mont->N);
- /* mont->ri is the size of mont->N in bits (rounded up
- to the word size) */
- al=ri=mont->ri/BN_BITS2;
-
nl=n->top;
- if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+ if (nl == 0) { ret->top=0; return(1); }
- max=(nl+al+1); /* allow for overflow (no?) XXX */
+ max=(2*nl); /* carry is stored separately */
if (bn_wexpand(r,max) == NULL) return(0);
r->neg^=n->neg;
np=n->d;
rp=r->d;
- nrp= &(r->d[nl]);
/* clear the top words of T */
#if 1
for (i=r->top; id[i]=0;
+ rp[i]=0;
#else
- memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
+ memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
#endif
r->top=max;
@@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#ifdef BN_COUNT
fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
#endif
- for (i=0; i<nl; i++)
+ for (carry=0, i=0; i<nl; i++, rp++)
 {
 v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
- nrp++;
- rp++;
- if (((nrp[-1]+=v)&BN_MASK2) >= v)
- continue;
- else
- {
- if (((++nrp[0])&BN_MASK2) != 0) continue;
- if (((++nrp[1])&BN_MASK2) != 0) continue;
- for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
- }
- }
- bn_correct_top(r);
-
- /* mont->ri will be a multiple of the word size and below code
- * is kind of BN_rshift(ret,r,mont->ri) equivalent */
- if (r->top <= ri)
- {
- ret->top=0;
- return(1);
+ v = (v+carry+rp[nl])&BN_MASK2;
+ carry |= (v != rp[nl]);
+ carry &= (v <= rp[nl]);
+ rp[nl]=v;
}
- al=r->top-ri;
-#define BRANCH_FREE 1
-#if BRANCH_FREE
- if (bn_wexpand(ret,ri) == NULL) return(0);
- x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
- ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */
+ if (bn_wexpand(ret,nl) == NULL) return(0);
+ ret->top=nl;
ret->neg=r->neg;
rp=ret->d;
- ap=&(r->d[ri]);
+ ap=&(r->d[nl]);
+#define BRANCH_FREE 1
+#if BRANCH_FREE
{
- size_t m1,m2;
+ BN_ULONG *nrp;
+ size_t m;
-
- v=bn_sub_words(rp,ap,np,ri);
- /* this ----------------^^ works even in al<ri case
- * thanks to zealous zeroing of top of the vector in the
- * beginning. */
-
- /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
- /* in other words if subtraction result is real, then
+ v=bn_sub_words(rp,ap,np,nl)-carry;
+ /* if subtraction result is real, then
* trick unconditional memcpy below to perform in-place
* "refresh" instead of actual copy. */
- m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
- m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
- m1|=m2; /* (al!=ri) */
- m1|=(0-(size_t)v); /* (al!=ri || v) */
- m1&=~m2; /* (al!=ri || v) && !al>ri */
- nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1));
- }
+ m=(0-(size_t)v);
+ nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
- /* 'i<ri' is chosen to eliminate dependency on input data, even
- * though it results in redundant copy in al<ri case. */
- for (i=0,ap=nrp; i<ri; i++,ap++)
- rp[i]=*ap;
-#else
- if (bn_wexpand(ret,al) == NULL) return(0);
- ret->top=al;
- ret->neg=r->neg;
-
- rp=ret->d;
- ap=&(r->d[ri]);
- al-=4;
- for (i=0; i<al; i+=4)
- {
- BN_ULONG t1,t2,t3,t4;
-
- t1=ap[i+0];
- t2=ap[i+1];
- t3=ap[i+2];
- t4=ap[i+3];
- rp[i+0]=t1;
- rp[i+1]=t2;
- rp[i+2]=t3;
- rp[i+3]=t4;
- }
- al+=4;
- for (; i<al; i++)
- rp[i]=ap[i];
+ for (i=0,nl-=4; i<nl; i+=4)
+ {
+ BN_ULONG t1,t2,t3,t4;
+
+ t1=nrp[i+0];
+ t2=nrp[i+1];
+ t3=nrp[i+2]; ap[i+0]=0;
+ t4=nrp[i+3]; ap[i+1]=0;
+ rp[i+0]=t1; ap[i+2]=0;
+ rp[i+1]=t2; ap[i+3]=0;
+ rp[i+2]=t3;
+ rp[i+3]=t4;
+ }
+ for (nl+=4; i<nl; i++)
+ rp[i]=nrp[i], ap[i]=0;
+ }
+#else
- if (BN_ucmp(ret, &(mont->N)) >= 0)
- {
- if (!BN_usub(ret,ret,&(mont->N))) return(0);
- }
+ if (bn_sub_words (rp,ap,np,nl)-carry)
+ memcpy(rp,ap,nl*sizeof(BN_ULONG));
#endif
+ bn_correct_top(r);
+ bn_correct_top(ret);
bn_check_top(ret);
return(1);
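The rewritten BN_from_montgomery_word keeps the loop carry in a separate variable instead of rippling it through the vector with ++nrp[x]: after each bn_mul_add_words pass it folds v, the incoming carry and the top word together, then re-derives the carry from two comparisons, all without branching. The carry recovery in isolation; a sketch assuming 64-bit words.

    #include <stdint.h>

    /* Add v plus the incoming carry into *word and return the outgoing
     * carry, branch-free, as in BN_from_montgomery_word above. If the
     * sum lands below the old word it wrapped (carry 1); above it did
     * not (carry 0); if it lands exactly on the old word, v+carry was
     * 0 or 2^64 and the old carry is already the right answer. */
    static uint64_t add_carry_word(uint64_t *word, uint64_t v, uint64_t carry)
        {
        uint64_t old = *word;

        v = v + carry + old; /* may wrap mod 2^64 */
        carry |= (v != old);
        carry &= (v <= old);
        *word = v;
        return carry;
        }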
diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c
index c6de032..43caee4 100644
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c
@@ -319,6 +319,13 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
:(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
#define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
#define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
+# if defined(L_ENDIAN)
+# if defined(__arch64__)
+# define NIST_INT64 long
+# else
+# define NIST_INT64 long long
+# endif
+# endif
#else
#define bn_cp_64(to, n, from, m) \
{ \
@@ -330,13 +337,15 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
bn_32_set_0(to, (n)*2); \
bn_32_set_0(to, (n)*2+1); \
}
-#if BN_BITS2 == 32
#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
#define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0;
-#endif
+# if defined(_WIN32) && !defined(__GNUC__)
+# define NIST_INT64 __int64
+# elif defined(BN_LLONG)
+# define NIST_INT64 long long
+# endif
#endif /* BN_BITS2 != 64 */
-
#define nist_set_192(to, from, a1, a2, a3) \
{ \
bn_cp_64(to, 0, from, (a3) - 3) \
@@ -350,9 +359,11 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_192_TOP],
- buf[BN_NIST_192_TOP],
- c_d[BN_NIST_192_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_192_TOP];
+ unsigned int ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_192_TOP],
*res;
PTR_SIZE_INT mask;
static const BIGNUM _bignum_nist_p_192_sqr = {
@@ -385,15 +396,48 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[3*2-6];
+ acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[3*2-5];
+ acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32;
- nist_set_192(t_d, buf, 0, 3, 3);
+ acc += rp[2]; acc += bp[3*2-6];
+ acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[3*2-5];
+ acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[5] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_192_TOP];
+
+ nist_set_192(t_d, buf.bn, 0, 3, 3);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 4, 4, 0);
+ nist_set_192(t_d, buf.bn, 4, 4, 0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 5, 5, 5)
+ nist_set_192(t_d, buf.bn, 5, 5, 5)
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-
+ }
+#endif
if (carry > 0)
carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
else
@@ -435,8 +479,7 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_224_TOP],
- buf[BN_NIST_224_TOP],
+ BN_ULONG buf[BN_NIST_224_TOP],
c_d[BN_NIST_224_TOP],
*res;
PTR_SIZE_INT mask;
@@ -474,14 +517,54 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
#if BN_BITS2==64
/* copy upper 256 bits of 448 bit number ... */
- nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+ nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
/* ... and right shift by 32 to obtain upper 224 bits */
- nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+ nist_set_224(buf, c_d, 14, 13, 12, 11, 10, 9, 8);
/* truncate lower part to 224 bits too */
r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
#else
nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
#endif
+
+#if defined(NIST_INT64) && BN_BITS2!=64
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf;
+
+ acc = rp[0]; acc -= bp[7-7];
+ acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc -= bp[8-7];
+ acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc -= bp[9-7];
+ acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[7-7];
+ acc += bp[11-7];
+ acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32;
+
+ acc += rp[4]; acc += bp[8-7];
+ acc += bp[12-7];
+ acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[9-7];
+ acc += bp[13-7];
+ acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[10-7];
+ acc -= bp[13-7]; rp[6] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+# if BN_BITS2==64
+ rp[7] = carry;
+# endif
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_224_TOP];
+
nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
@@ -493,6 +576,8 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
#if BN_BITS2==64
carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
+#endif
+ }
#endif
u.f = bn_sub_words;
if (carry > 0)
@@ -548,9 +633,11 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *a_d = a->d, *r_d;
- BN_ULONG t_d[BN_NIST_256_TOP],
- buf[BN_NIST_256_TOP],
- c_d[BN_NIST_256_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_256_TOP];
+ unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_256_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -584,12 +671,87 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[8-8];
+ acc += bp[9-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[9-8];
+ acc += bp[10-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[10-8];
+ acc += bp[11-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[11-8];
+ acc += bp[11-8];
+ acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc -= bp[15-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc -= bp[9-8];
+ acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc -= bp[10-8];
+ acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[14-8];
+ acc += bp[13-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[8 -8];
+ acc -= bp[10-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8]; rp[7] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_256_TOP];
/*S1*/
- nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
+ nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
/*S2*/
- nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+ nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
/* left shift */
{
@@ -607,24 +769,26 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
}
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S3*/
- nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+ nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S4*/
- nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+ nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D1*/
- nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+ nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D2*/
- nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+ nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D3*/
- nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+ nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D4*/
- nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+ nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
@@ -672,9 +836,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_384_TOP],
- buf[BN_NIST_384_TOP],
- c_d[BN_NIST_384_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_384_TOP];
+ unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_384_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -709,10 +875,100 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[12-12];
+ acc += bp[21-12];
+ acc += bp[20-12];
+ acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[13-12];
+ acc += bp[22-12];
+ acc += bp[23-12];
+ acc -= bp[12-12];
+ acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[14-12];
+ acc += bp[23-12];
+ acc -= bp[13-12];
+ acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[15-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[21-12];
+ acc -= bp[14-12];
+ acc -= bp[22-12];
+ acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[21-12];
+ acc += bp[21-12];
+ acc += bp[16-12];
+ acc += bp[13-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[22-12];
+ acc -= bp[15-12];
+ acc -= bp[23-12];
+ acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[22-12];
+ acc += bp[22-12];
+ acc += bp[17-12];
+ acc += bp[14-12];
+ acc += bp[13-12];
+ acc += bp[21-12];
+ acc += bp[23-12];
+ acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[23-12];
+ acc += bp[23-12];
+ acc += bp[18-12];
+ acc += bp[15-12];
+ acc += bp[14-12];
+ acc += bp[22-12];
+ acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[19-12];
+ acc += bp[16-12];
+ acc += bp[15-12];
+ acc += bp[23-12];
+ acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[8]; acc += bp[20-12];
+ acc += bp[17-12];
+ acc += bp[16-12];
+ acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[9]; acc += bp[21-12];
+ acc += bp[18-12];
+ acc += bp[17-12];
+ acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[10]; acc += bp[22-12];
+ acc += bp[19-12];
+ acc += bp[18-12];
+ acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[11]; acc += bp[23-12];
+ acc += bp[20-12];
+ acc += bp[19-12];
+ acc -= bp[22-12]; rp[11] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_384_TOP];
/*S1*/
- nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
+ nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
/* left shift */
{
register BN_ULONG *ap,t,c;
@@ -729,29 +985,31 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
t_d, BN_NIST_256_TOP);
/*S2 */
- carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
+ carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
/*S3*/
- nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+ nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S4*/
- nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+ nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S5*/
- nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+ nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S6*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D1*/
- nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+ nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D2*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D3*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
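The NIST_INT64 branch above builds each result word by adding and subtracting a fixed selection of 32-bit input words inside a signed 64-bit accumulator; the low 32 bits are written back and the (possibly negative) remainder is carried into the next word, with the final carry fixed up afterwards just as in BN_nist_mod_224. A minimal standalone sketch of that carry-save pattern follows; add_sub_words and all other names are illustrative, not OpenSSL API.

#include <stdint.h>
#include <stdio.h>

static void add_sub_words(uint32_t *r, const uint32_t *add,
                          const uint32_t *sub, int n, int *carry)
{
    int64_t acc = 0;             /* plays the role of "NIST_INT64 acc" */
    int i;
    for (i = 0; i < n; i++) {
        acc += r[i];
        acc += add[i];           /* words the reduction folds in */
        acc -= sub[i];           /* words the reduction subtracts */
        r[i] = (uint32_t)acc;    /* keep the low 32 bits */
        acc >>= 32;              /* arithmetic shift carries a borrow too */
    }
    *carry = (int)acc;           /* final carry, fixed up by the caller */
}

int main(void)
{
    uint32_t r[2]   = { 0xffffffffu, 0 };
    uint32_t add[2] = { 1, 0 };
    uint32_t sub[2] = { 0, 0 };
    int carry;
    add_sub_words(r, add, sub, 2, &carry);
    printf("%08x %08x carry=%d\n", r[1], r[0], carry); /* 00000001 00000000 carry=0 */
    return 0;
}

The signed accumulator is what lets the subtractions ride along with the additions in a single pass.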
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index bebb466..1743b6a 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -357,3 +357,22 @@ int BN_print(BIO *bp, const BIGNUM *a)
return(ret);
}
#endif
+
+char *BN_options(void)
+ {
+ static int init=0;
+ static char data[16];
+
+ if (!init)
+ {
+ init++;
+#ifdef BN_LLONG
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
+#else
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
+#endif
+ }
+ return(data);
+ }
diff --git a/crypto/bn/bn_shift.c b/crypto/bn/bn_shift.c
index c4d301a..a6fca2c 100644
--- a/crypto/bn/bn_shift.c
+++ b/crypto/bn/bn_shift.c
@@ -99,7 +99,7 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a)
int BN_rshift1(BIGNUM *r, const BIGNUM *a)
{
BN_ULONG *ap,*rp,t,c;
- int i;
+ int i,j;
bn_check_top(r);
bn_check_top(a);
@@ -109,22 +109,25 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a)
BN_zero(r);
return(1);
}
+ i = a->top;
+ ap= a->d;
+ j = i-(ap[i-1]==1);
if (a != r)
{
- if (bn_wexpand(r,a->top) == NULL) return(0);
- r->top=a->top;
+ if (bn_wexpand(r,j) == NULL) return(0);
r->neg=a->neg;
}
- ap=a->d;
rp=r->d;
- c=0;
- for (i=a->top-1; i>=0; i--)
+ t=ap[--i];
+ c=(t&1)?BN_TBIT:0;
+ if (t>>=1) rp[i]=t;
+ while (i>0)
{
- t=ap[i];
+ t=ap[--i];
rp[i]=((t>>1)&BN_MASK2)|c;
c=(t&1)?BN_TBIT:0;
}
- bn_correct_top(r);
+ r->top=j;
bn_check_top(r);
return(1);
}
@@ -182,10 +185,11 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
BN_zero(r);
return(1);
}
+ i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2;
if (r != a)
{
r->neg=a->neg;
- if (bn_wexpand(r,a->top-nw+1) == NULL) return(0);
+ if (bn_wexpand(r,i) == NULL) return(0);
}
else
{
@@ -196,7 +200,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
f= &(a->d[nw]);
t=r->d;
j=a->top-nw;
- r->top=j;
+ r->top=i;
if (rb == 0)
{
@@ -212,9 +216,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
l= *(f++);
*(t++) =(tmp|(l<<lb))&BN_MASK2;
- tmp=(l>>rb)&BN_MASK2;
+ if ((l = (l>>rb)&BN_MASK2)) *(t) = l;
}
- bn_correct_top(r);
bn_check_top(r);
return(1);
}
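Both rewritten shift routines above compute the result's word count up front and pass bits between adjacent words through a carry variable, rather than calling bn_correct_top afterwards. A self-contained sketch of the BN_rshift1-style dataflow on plain 32-bit words (names are illustrative, not OpenSSL API):

#include <stdint.h>
#include <stdio.h>

static void rshift1(uint32_t *a, int n)
{
    uint32_t c = 0, t;
    int i;
    for (i = n - 1; i >= 0; i--) {
        t = a[i];
        a[i] = (t >> 1) | c;            /* bring in the word above's low bit */
        c = (t & 1) ? 0x80000000u : 0;  /* BN_TBIT's analogue */
    }
}

int main(void)
{
    uint32_t a[2] = { 0x00000001u, 0x00000001u };  /* 0x100000001 */
    rshift1(a, 2);
    printf("%08x %08x\n", a[1], a[0]);             /* 00000000 80000000 */
    return 0;
}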
diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c
index ee7b87c..de83a15 100644
--- a/crypto/bn/bn_word.c
+++ b/crypto/bn/bn_word.c
@@ -144,26 +144,17 @@ int BN_add_word(BIGNUM *a, BN_ULONG w)
a->neg=!(a->neg);
return(i);
}
- /* Only expand (and risk failing) if it's possibly necessary */
- if (((BN_ULONG)(a->d[a->top - 1] + 1) == 0) &&
- (bn_wexpand(a,a->top+1) == NULL))
- return(0);
- i=0;
- for (;;)
+ for (i=0;w!=0 && i<a->top;i++)
{
- if (i >= a->top)
- l=w;
- else
- l=(a->d[i]+w)&BN_MASK2;
- a->d[i]=l;
- if (w > l)
- w=1;
- else
- break;
- i++;
+ a->d[i] = l = (a->d[i]+w)&BN_MASK2;
+ w = (w>l)?1:0;
}
- if (i >= a->top)
+ if (w && i==a->top)
+ {
+ if (bn_wexpand(a,a->top+1) == NULL) return 0;
a->top++;
+ a->d[i]=w;
+ }
bn_check_top(a);
return(1);
}
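The new BN_add_word loop only ripples the carry while it is non-zero and only grows the array when the carry spills past the current top, instead of pre-expanding. A sketch of the same logic on a plain word array (add_word here is a hypothetical helper, not the OpenSSL function):

#include <stdint.h>
#include <stdio.h>

static int add_word(uint32_t *d, int top, uint32_t w)
{
    uint32_t l;
    int i;
    for (i = 0; w != 0 && i < top; i++) {
        d[i] = l = d[i] + w;
        w = (w > l) ? 1 : 0;     /* carry iff the addition wrapped */
    }
    if (w && i == top)
        d[top++] = w;            /* where BN_add_word calls bn_wexpand */
    return top;
}

int main(void)
{
    uint32_t d[3] = { 0xffffffffu, 0xffffffffu, 0 };
    int top = add_word(d, 2, 1);
    printf("top=%d %08x %08x %08x\n", top, d[2], d[1], d[0]);
    return 0;                    /* top=3 00000001 00000000 00000000 */
}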
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 0cd99c5..06f5954 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -262,7 +262,7 @@ int main(int argc, char *argv[])
message(out,"BN_mod_sqrt");
if (!test_sqrt(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#ifndef OPENSSL_NO_EC2M
message(out,"BN_GF2m_add");
if (!test_gf2m_add(out)) goto err;
(void)BIO_flush(out);
@@ -298,7 +298,7 @@ int main(int argc, char *argv[])
message(out,"BN_GF2m_mod_solve_quad");
if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#endif
BN_CTX_free(ctx);
BIO_free(out);
@@ -1061,7 +1061,7 @@ int test_exp(BIO *bp, BN_CTX *ctx)
BN_free(one);
return(1);
}
-
+#ifndef OPENSSL_NO_EC2M
int test_gf2m_add(BIO *bp)
{
BIGNUM a,b,c;
@@ -1636,7 +1636,7 @@ int test_gf2m_mod_solve_quad(BIO *bp,BN_CTX *ctx)
BN_free(e);
return ret;
}
-
+#endif
static int genprime_cb(int p, int n, BN_GENCB *arg)
{
char c='*';
diff --git a/crypto/buffer/buf_str.c b/crypto/buffer/buf_str.c
new file mode 100644
index 0000000..151f5ea
--- /dev/null
+++ b/crypto/buffer/buf_str.c
@@ -0,0 +1,119 @@
+/* crypto/buffer/buffer.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/buffer.h>
+
+char *BUF_strdup(const char *str)
+ {
+ if (str == NULL) return(NULL);
+ return BUF_strndup(str, strlen(str));
+ }
+
+char *BUF_strndup(const char *str, size_t siz)
+ {
+ char *ret;
+
+ if (str == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz+1);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ BUF_strlcpy(ret,str,siz+1);
+ return(ret);
+ }
+
+void *BUF_memdup(const void *data, size_t siz)
+ {
+ void *ret;
+
+ if (data == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ return memcpy(ret, data, siz);
+ }
+
+size_t BUF_strlcpy(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 1 && *src; size--)
+ {
+ *dst++ = *src++;
+ l++;
+ }
+ if (size)
+ *dst = '\0';
+ return l + strlen(src);
+ }
+
+size_t BUF_strlcat(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 0 && *dst; size--, dst++)
+ l++;
+ return l + BUF_strlcpy(dst, src, size);
+ }
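These follow the OpenBSD strlcpy/strlcat contract: the destination is always NUL-terminated when size is non-zero, and the return value is the length the complete string would have had, so a return value greater than or equal to the buffer size signals truncation. A small usage sketch, assuming <openssl/buffer.h> declares BUF_strlcpy and BUF_strlcat as in this release:

#include <stdio.h>
#include <openssl/buffer.h>

int main(void)
{
    char buf[8];
    size_t n = BUF_strlcpy(buf, "hello, world", sizeof(buf));
    if (n >= sizeof(buf))
        printf("truncated: kept \"%s\", needed %lu bytes\n",
               buf, (unsigned long)(n + 1));
    BUF_strlcat(buf, "!", sizeof(buf));  /* still bounded, still terminated */
    printf("buf = \"%s\"\n", buf);
    return 0;
}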
diff --git a/crypto/buffer/buffer.c b/crypto/buffer/buffer.c
index 620ea8d..d7aa79a 100644
--- a/crypto/buffer/buffer.c
+++ b/crypto/buffer/buffer.c
@@ -60,6 +60,11 @@
#include "cryptlib.h"
#include <openssl/buffer.h>
+/* LIMIT_BEFORE_EXPANSION is the maximum n such that (n+3)/3*4 < 2**31. That
+ * function is applied in several functions in this file and this limit ensures
+ * that the result fits in an int. */
+#define LIMIT_BEFORE_EXPANSION 0x5ffffffc
+
BUF_MEM *BUF_MEM_new(void)
{
BUF_MEM *ret;
@@ -105,6 +110,12 @@ int BUF_MEM_grow(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -142,6 +153,12 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW_CLEAN,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -162,64 +179,6 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
return(len);
}
-char *BUF_strdup(const char *str)
- {
- if (str == NULL) return(NULL);
- return BUF_strndup(str, strlen(str));
- }
-
-char *BUF_strndup(const char *str, size_t siz)
- {
- char *ret;
-
- if (str == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz+1);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- BUF_strlcpy(ret,str,siz+1);
- return(ret);
- }
-
-void *BUF_memdup(const void *data, size_t siz)
- {
- void *ret;
-
- if (data == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- return memcpy(ret, data, siz);
- }
-
-size_t BUF_strlcpy(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 1 && *src; size--)
- {
- *dst++ = *src++;
- l++;
- }
- if (size)
- *dst = '\0';
- return l + strlen(src);
- }
-
-size_t BUF_strlcat(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 0 && *dst; size--, dst++)
- l++;
- return l + BUF_strlcpy(dst, src, size);
- }
-
void BUF_reverse(unsigned char *out, unsigned char *in, size_t size)
{
size_t i;
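The constant checks out: 0x5ffffffc is the largest len for which (len+3)/3*4 stays below 2**31, and the very next value already reaches 2**31, which would overflow the int that OPENSSL_malloc takes. A standalone two-line verification (not part of the patch):

#include <stdio.h>

int main(void)
{
    unsigned long long n = 0x5ffffffcULL;
    printf("%llx -> %llx\n", n, (n + 3) / 3 * 4);          /* 7ffffffc, fits */
    printf("%llx -> %llx\n", n + 1, (n + 1 + 3) / 3 * 4);  /* 80000000, overflows int */
    return 0;
}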
diff --git a/crypto/cmac/cm_ameth.c b/crypto/cmac/cm_ameth.c
new file mode 100644
index 0000000..0b8e567
--- /dev/null
+++ b/crypto/cmac/cm_ameth.c
@@ -0,0 +1,97 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "asn1_locl.h"
+
+/* CMAC "ASN1" method. This is just here to indicate the
+ * maximum CMAC output length and to free up a CMAC
+ * key.
+ */
+
+static int cmac_size(const EVP_PKEY *pkey)
+ {
+ return EVP_MAX_BLOCK_LENGTH;
+ }
+
+static void cmac_key_free(EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr;
+ if (cmctx)
+ CMAC_CTX_free(cmctx);
+ }
+
+const EVP_PKEY_ASN1_METHOD cmac_asn1_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_CMAC,
+ 0,
+
+ "CMAC",
+ "OpenSSL CMAC method",
+
+ 0,0,0,0,
+
+ 0,0,0,
+
+ cmac_size,
+ 0,
+ 0,0,0,0,0,0,0,
+
+ cmac_key_free,
+ 0,
+ 0,0
+ };
+
diff --git a/crypto/cmac/cm_pmeth.c b/crypto/cmac/cm_pmeth.c
new file mode 100644
index 0000000..072228e
--- /dev/null
+++ b/crypto/cmac/cm_pmeth.c
@@ -0,0 +1,224 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "evp_locl.h"
+
+/* The context structure and "key" is simply a CMAC_CTX */
+
+static int pkey_cmac_init(EVP_PKEY_CTX *ctx)
+ {
+ ctx->data = CMAC_CTX_new();
+ if (!ctx->data)
+ return 0;
+ ctx->keygen_info_count = 0;
+ return 1;
+ }
+
+static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
+ {
+ if (!pkey_cmac_init(dst))
+ return 0;
+ if (!CMAC_CTX_copy(dst->data, src->data))
+ return 0;
+ return 1;
+ }
+
+static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
+ {
+ CMAC_CTX_free(ctx->data);
+ }
+
+static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmkey = CMAC_CTX_new();
+ CMAC_CTX *cmctx = ctx->data;
+ if (!cmkey)
+ return 0;
+ if (!CMAC_CTX_copy(cmkey, cmctx))
+ {
+ CMAC_CTX_free(cmkey);
+ return 0;
+ }
+ EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey);
+
+ return 1;
+ }
+
+static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
+ {
+ if (!CMAC_Update(ctx->pctx->data, data, count))
+ return 0;
+ return 1;
+ }
+
+static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx)
+ {
+ EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT);
+ mctx->update = int_update;
+ return 1;
+ }
+
+static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+ EVP_MD_CTX *mctx)
+ {
+ return CMAC_Final(ctx->data, sig, siglen);
+ }
+
+static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
+ {
+ CMAC_CTX *cmctx = ctx->data;
+ switch (type)
+ {
+
+ case EVP_PKEY_CTRL_SET_MAC_KEY:
+ if (!p2 || p1 < 0)
+ return 0;
+ if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_CIPHER:
+ if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_MD:
+ if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
+ (CMAC_CTX *)ctx->pkey->pkey.ptr))
+ return 0;
+ if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
+ return 0;
+ break;
+
+ default:
+ return -2;
+
+ }
+ return 1;
+ }
+
+static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx,
+ const char *type, const char *value)
+ {
+ if (!value)
+ {
+ return 0;
+ }
+ if (!strcmp(type, "key"))
+ {
+ void *p = (void *)value;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY,
+ strlen(p), p);
+ }
+ if (!strcmp(type, "cipher"))
+ {
+ const EVP_CIPHER *c;
+ c = EVP_get_cipherbyname(value);
+ if (!c)
+ return 0;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c);
+ }
+ if (!strcmp(type, "hexkey"))
+ {
+ unsigned char *key;
+ int r;
+ long keylen;
+ key = string_to_hex(value, &keylen);
+ if (!key)
+ return 0;
+ r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key);
+ OPENSSL_free(key);
+ return r;
+ }
+ return -2;
+ }
+
+const EVP_PKEY_METHOD cmac_pkey_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_FLAG_SIGCTX_CUSTOM,
+ pkey_cmac_init,
+ pkey_cmac_copy,
+ pkey_cmac_cleanup,
+
+ 0, 0,
+
+ 0,
+ pkey_cmac_keygen,
+
+ 0, 0,
+
+ 0, 0,
+
+ 0,0,
+
+ cmac_signctx_init,
+ cmac_signctx,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ pkey_cmac_ctrl,
+ pkey_cmac_ctrl_str
+
+ };
diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c
new file mode 100644
index 0000000..8b72b09
--- /dev/null
+++ b/crypto/cmac/cmac.c
@@ -0,0 +1,308 @@
+/* crypto/cmac/cmac.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cryptlib.h"
+#include <openssl/cmac.h>
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+struct CMAC_CTX_st
+ {
+ /* Cipher context to use */
+ EVP_CIPHER_CTX cctx;
+ /* Keys k1 and k2 */
+ unsigned char k1[EVP_MAX_BLOCK_LENGTH];
+ unsigned char k2[EVP_MAX_BLOCK_LENGTH];
+ /* Temporary block */
+ unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
+ /* Last (possibly partial) block */
+ unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
+ /* Number of bytes in last block: -1 means context not initialised */
+ int nlast_block;
+ };
+
+
+/* Make temporary keys K1 and K2 */
+
+static void make_kn(unsigned char *k1, unsigned char *l, int bl)
+ {
+ int i;
+ /* Shift block to left, including carry */
+ for (i = 0; i < bl; i++)
+ {
+ k1[i] = l[i] << 1;
+ if (i < bl - 1 && l[i + 1] & 0x80)
+ k1[i] |= 1;
+ }
+ /* If MSB set fixup with R */
+ if (l[0] & 0x80)
+ k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b;
+ }
+
+CMAC_CTX *CMAC_CTX_new(void)
+ {
+ CMAC_CTX *ctx;
+ ctx = OPENSSL_malloc(sizeof(CMAC_CTX));
+ if (!ctx)
+ return NULL;
+ EVP_CIPHER_CTX_init(&ctx->cctx);
+ ctx->nlast_block = -1;
+ return ctx;
+ }
+
+void CMAC_CTX_cleanup(CMAC_CTX *ctx)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ {
+ FIPS_cmac_ctx_cleanup(ctx);
+ return;
+ }
+#endif
+ EVP_CIPHER_CTX_cleanup(&ctx->cctx);
+ OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH);
+ ctx->nlast_block = -1;
+ }
+
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
+ {
+ return &ctx->cctx;
+ }
+
+void CMAC_CTX_free(CMAC_CTX *ctx)
+ {
+ CMAC_CTX_cleanup(ctx);
+ OPENSSL_free(ctx);
+ }
+
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
+ {
+ int bl;
+ if (in->nlast_block == -1)
+ return 0;
+ if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&in->cctx);
+ memcpy(out->k1, in->k1, bl);
+ memcpy(out->k2, in->k2, bl);
+ memcpy(out->tbl, in->tbl, bl);
+ memcpy(out->last_block, in->last_block, bl);
+ out->nlast_block = in->nlast_block;
+ return 1;
+ }
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl)
+ {
+ static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ /* If we have an ENGINE need to allow non FIPS */
+ if ((impl || ctx->cctx.engine)
+ && !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+
+ {
+ EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
+ return 0;
+ }
+ /* Other algorithm blocking will be done in FIPS_cmac_init,
+ * via FIPS_cipherinit().
+ */
+ if (!impl && !ctx->cctx.engine)
+ return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
+ }
+#endif
+ /* All zeros means restart */
+ if (!key && !cipher && !impl && keylen == 0)
+ {
+ /* Not initialised */
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx));
+ ctx->nlast_block = 0;
+ return 1;
+ }
+ /* Initialise context */
+ if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
+ return 0;
+ /* Non-NULL key means initialisation complete */
+ if (key)
+ {
+ int bl;
+ if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
+ return 0;
+ if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
+ return 0;
+ make_kn(ctx->k1, ctx->tbl, bl);
+ make_kn(ctx->k2, ctx->k1, bl);
+ OPENSSL_cleanse(ctx->tbl, bl);
+ /* Reset context again ready for first data block */
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ /* Zero tbl so resume works */
+ memset(ctx->tbl, 0, bl);
+ ctx->nlast_block = 0;
+ }
+ return 1;
+ }
+
+int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
+ {
+ const unsigned char *data = in;
+ size_t bl;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_update(ctx, in, dlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (dlen == 0)
+ return 1;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ /* Copy into partial block if we need to */
+ if (ctx->nlast_block > 0)
+ {
+ size_t nleft;
+ nleft = bl - ctx->nlast_block;
+ if (dlen < nleft)
+ nleft = dlen;
+ memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
+ dlen -= nleft;
+ ctx->nlast_block += nleft;
+ /* If no more to process return */
+ if (dlen == 0)
+ return 1;
+ data += nleft;
+ /* Else not final block so encrypt it */
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
+ return 0;
+ }
+ /* Encrypt all but one of the complete blocks left */
+ while(dlen > bl)
+ {
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
+ return 0;
+ dlen -= bl;
+ data += bl;
+ }
+ /* Copy any data left to last block buffer */
+ memcpy(ctx->last_block, data, dlen);
+ ctx->nlast_block = dlen;
+ return 1;
+
+ }
+
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
+ {
+ int i, bl, lb;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_final(ctx, out, poutlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ *poutlen = (size_t)bl;
+ if (!out)
+ return 1;
+ lb = ctx->nlast_block;
+ /* Is last block complete? */
+ if (lb == bl)
+ {
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k1[i];
+ }
+ else
+ {
+ ctx->last_block[lb] = 0x80;
+ if (bl - lb > 1)
+ memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k2[i];
+ }
+ if (!EVP_Cipher(&ctx->cctx, out, out, bl))
+ {
+ OPENSSL_cleanse(out, bl);
+ return 0;
+ }
+ return 1;
+ }
+
+int CMAC_resume(CMAC_CTX *ctx)
+ {
+ if (ctx->nlast_block == -1)
+ return 0;
+ /* The buffer "tbl" containes the last fully encrypted block
+ * which is the last IV (or all zeroes if no last encrypted block).
+ * The last block has not been modified since CMAC_final().
+ * So reinitliasing using the last decrypted block will allow
+ * CMAC to continue after calling CMAC_Final().
+ */
+ return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
+ }
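make_kn above is the GF(2^128) doubling step from NIST SP 800-38B: shift the block left one bit and, if the top bit fell off, XOR the low byte with the reduction constant 0x87 (0x1b for 64-bit blocks). A standalone sketch with a toy input; dbl is an illustrative name, not OpenSSL API:

#include <stdio.h>

static void dbl(unsigned char *out, const unsigned char *in, int bl)
{
    int i;
    for (i = 0; i < bl; i++) {
        out[i] = in[i] << 1;
        if (i < bl - 1 && (in[i + 1] & 0x80))
            out[i] |= 1;                         /* carry from the byte below */
    }
    if (in[0] & 0x80)
        out[bl - 1] ^= (bl == 16) ? 0x87 : 0x1b; /* reduce by the field polynomial */
}

int main(void)
{
    unsigned char l[16] = { 0x80 };  /* top bit set forces the 0x87 fixup */
    unsigned char k1[16];
    int i;
    dbl(k1, l, 16);
    for (i = 0; i < 16; i++)
        printf("%02x", k1[i]);       /* prints thirty zeros, then 87 */
    printf("\n");
    return 0;
}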
diff --git a/crypto/cmac/cmac.h b/crypto/cmac/cmac.h
new file mode 100644
index 0000000..712e92d
--- /dev/null
+++ b/crypto/cmac/cmac.h
@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMAC_H
+#define HEADER_CMAC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/evp.h>
+
+/* Opaque */
+typedef struct CMAC_CTX_st CMAC_CTX;
+
+CMAC_CTX *CMAC_CTX_new(void);
+void CMAC_CTX_cleanup(CMAC_CTX *ctx);
+void CMAC_CTX_free(CMAC_CTX *ctx);
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl);
+int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
+int CMAC_resume(CMAC_CTX *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
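For reference, the intended calling sequence of this API, computing an AES-128-CMAC tag; the key and message below are arbitrary placeholders, and the cipher handed to CMAC_Init is a block cipher in CBC mode per the CMAC convention:

#include <stdio.h>
#include <openssl/cmac.h>
#include <openssl/evp.h>

int main(void)
{
    static const unsigned char key[16] = "0123456789abcdef"; /* placeholder key */
    static const unsigned char msg[]   = "message to authenticate";
    unsigned char mac[EVP_MAX_BLOCK_LENGTH];
    size_t maclen, i;
    CMAC_CTX *ctx = CMAC_CTX_new();

    if (ctx == NULL)
        return 1;
    if (!CMAC_Init(ctx, key, sizeof(key), EVP_aes_128_cbc(), NULL)
        || !CMAC_Update(ctx, msg, sizeof(msg) - 1)
        || !CMAC_Final(ctx, mac, &maclen)) {
        CMAC_CTX_free(ctx);
        return 1;
    }
    for (i = 0; i < maclen; i++)
        printf("%02x", mac[i]);
    printf("\n");
    CMAC_CTX_free(ctx);
    return 0;
}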
diff --git a/crypto/comp/c_rle.c b/crypto/comp/c_rle.c
index 18bceae..47dfb67 100644
--- a/crypto/comp/c_rle.c
+++ b/crypto/comp/c_rle.c
@@ -30,7 +30,7 @@ static int rle_compress_block(COMP_CTX *ctx, unsigned char *out,
{
/* int i; */
- if (olen < (ilen+1))
+ if (ilen == 0 || olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
@@ -46,7 +46,7 @@ static int rle_expand_block(COMP_CTX *ctx, unsigned char *out,
{
int i;
- if (ilen == 0 || olen < (ilen-1))
+ if (olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
diff --git a/crypto/conf/conf_mall.c b/crypto/conf/conf_mall.c
index c6f4cb2..213890e 100644
--- a/crypto/conf/conf_mall.c
+++ b/crypto/conf/conf_mall.c
@@ -76,5 +76,6 @@ void OPENSSL_load_builtin_modules(void)
#ifndef OPENSSL_NO_ENGINE
ENGINE_add_conf_module();
#endif
+ EVP_add_alg_module();
}
diff --git a/crypto/cpt_err.c b/crypto/cpt_err.c
index 139b928..289005f 100644
--- a/crypto/cpt_err.c
+++ b/crypto/cpt_err.c
@@ -1,6 +1,6 @@
/* crypto/cpt_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
{ERR_FUNC(CRYPTO_F_CRYPTO_SET_EX_DATA), "CRYPTO_set_ex_data"},
{ERR_FUNC(CRYPTO_F_DEF_ADD_INDEX), "DEF_ADD_INDEX"},
{ERR_FUNC(CRYPTO_F_DEF_GET_CLASS), "DEF_GET_CLASS"},
+{ERR_FUNC(CRYPTO_F_FIPS_MODE_SET), "FIPS_mode_set"},
{ERR_FUNC(CRYPTO_F_INT_DUP_EX_DATA), "INT_DUP_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_FREE_EX_DATA), "INT_FREE_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_NEW_EX_DATA), "INT_NEW_EX_DATA"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
static ERR_STRING_DATA CRYPTO_str_reasons[]=
{
+{ERR_REASON(CRYPTO_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
{ERR_REASON(CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK),"no dynlock create callback"},
{0,NULL}
};
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index 24fe123..304c6b7 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -409,6 +409,10 @@ int (*CRYPTO_get_add_lock_callback(void))(int *num,int mount,int type,
void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
const char *file,int line))
{
+ /* Calling this here ensures initialisation before any threads
+ * are started.
+ */
+ OPENSSL_init();
locking_callback=func;
}
@@ -500,7 +504,7 @@ void CRYPTO_THREADID_current(CRYPTO_THREADID *id)
CRYPTO_THREADID_set_numeric(id, (unsigned long)find_thread(NULL));
#else
/* For everything else, default to using the address of 'errno' */
- CRYPTO_THREADID_set_pointer(id, &errno);
+ CRYPTO_THREADID_set_pointer(id, (void*)&errno);
#endif
}
@@ -661,28 +665,53 @@ const char *CRYPTO_get_lock_name(int type)
defined(__INTEL__) || \
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
-unsigned long OPENSSL_ia32cap_P=0;
-unsigned long *OPENSSL_ia32cap_loc(void) { return &OPENSSL_ia32cap_P; }
+unsigned int OPENSSL_ia32cap_P[2];
+unsigned long *OPENSSL_ia32cap_loc(void)
+{ if (sizeof(long)==4)
+ /*
+ * If 32-bit application pulls address of OPENSSL_ia32cap_P[0]
+ * clear second element to maintain the illusion that vector
+ * is 32-bit.
+ */
+ OPENSSL_ia32cap_P[1]=0;
+ return (unsigned long *)OPENSSL_ia32cap_P;
+}
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY)
#define OPENSSL_CPUID_SETUP
+#if defined(_WIN32)
+typedef unsigned __int64 IA32CAP;
+#else
+typedef unsigned long long IA32CAP;
+#endif
void OPENSSL_cpuid_setup(void)
{ static int trigger=0;
- unsigned long OPENSSL_ia32_cpuid(void);
+ IA32CAP OPENSSL_ia32_cpuid(void);
+ IA32CAP vec;
char *env;
if (trigger) return;
trigger=1;
- if ((env=getenv("OPENSSL_ia32cap")))
- OPENSSL_ia32cap_P = strtoul(env,NULL,0)|(1<<10);
+ if ((env=getenv("OPENSSL_ia32cap"))) {
+ int off = (env[0]=='~')?1:0;
+#if defined(_WIN32)
+ if (!sscanf(env+off,"%I64i",&vec)) vec = strtoul(env+off,NULL,0);
+#else
+ if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0);
+#endif
+ if (off) vec = OPENSSL_ia32_cpuid()&~vec;
+ }
else
- OPENSSL_ia32cap_P = OPENSSL_ia32_cpuid()|(1<<10);
+ vec = OPENSSL_ia32_cpuid();
+
/*
* |(1<<10) sets a reserved bit to signal that variable
* was initialized already... This is to avoid interference
* with cpuid snippets in ELF .init segment.
*/
+ OPENSSL_ia32cap_P[0] = (unsigned int)vec|(1<<10);
+ OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32);
}
#endif
@@ -896,3 +925,16 @@ void OpenSSLDie(const char *file,int line,const char *assertion)
}
void *OPENSSL_stderr(void) { return stderr; }
+
+int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+ {
+ size_t i;
+ const unsigned char *a = in_a;
+ const unsigned char *b = in_b;
+ unsigned char x = 0;
+
+ for (i = 0; i < len; i++)
+ x |= a[i] ^ b[i];
+
+ return x;
+ }
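The loop above ORs together the XOR of every byte pair, so its running time depends only on len, never on where the first mismatch is. A sketch of the intended use when comparing secrets such as MAC tags; tag_matches is a hypothetical helper:

#include <stddef.h>
#include <openssl/crypto.h>

/* memcmp may return at the first differing byte, leaking (via timing)
 * how long the matching prefix was; CRYPTO_memcmp does not. */
int tag_matches(const unsigned char *expected,
                const unsigned char *received, size_t len)
{
    return CRYPTO_memcmp(expected, received, len) == 0;
}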
diff --git a/crypto/cryptlib.h b/crypto/cryptlib.h
index fc249c5..d26f963 100644
--- a/crypto/cryptlib.h
+++ b/crypto/cryptlib.h
@@ -99,8 +99,8 @@ extern "C" {
#define HEX_SIZE(type) (sizeof(type)*2)
void OPENSSL_cpuid_setup(void);
-extern unsigned long OPENSSL_ia32cap_P;
-void OPENSSL_showfatal(const char *,...);
+extern unsigned int OPENSSL_ia32cap_P[];
+void OPENSSL_showfatal(const char *fmta,...);
void *OPENSSL_stderr(void);
extern int OPENSSL_NONPIC_relocated;
diff --git a/crypto/crypto.h b/crypto/crypto.h
index b0360ce..f92fc51 100644
--- a/crypto/crypto.h
+++ b/crypto/crypto.h
@@ -488,10 +488,10 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
long (**go)(void));
void *CRYPTO_malloc_locked(int num, const char *file, int line);
-void CRYPTO_free_locked(void *);
+void CRYPTO_free_locked(void *ptr);
void *CRYPTO_malloc(int num, const char *file, int line);
char *CRYPTO_strdup(const char *str, const char *file, int line);
-void CRYPTO_free(void *);
+void CRYPTO_free(void *ptr);
void *CRYPTO_realloc(void *addr,int num, const char *file, int line);
void *CRYPTO_realloc_clean(void *addr,int old_num,int num,const char *file,
int line);
@@ -547,6 +547,40 @@ unsigned long *OPENSSL_ia32cap_loc(void);
#define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
int OPENSSL_isservice(void);
+int FIPS_mode(void);
+int FIPS_mode_set(int r);
+
+void OPENSSL_init(void);
+
+#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
+
+#ifdef OPENSSL_FIPS
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c) \
+ { \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to digest " #alg " forbidden in FIPS mode!"); \
+ return private_##alg##_Init(c); \
+ } \
+ int private_##alg##_Init(cx##_CTX *c)
+
+#define fips_cipher_abort(alg) \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to cipher " #alg " forbidden in FIPS mode!")
+
+#else
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c)
+#define fips_cipher_abort(alg) while(0)
+#endif
+
+/* CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It
+ * takes an amount of time dependent on |len|, but independent of the contents
+ * of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a
+ * defined order as the return value when a != b is undefined, other than to be
+ * non-zero. */
+int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -562,11 +596,13 @@ void ERR_load_CRYPTO_strings(void);
#define CRYPTO_F_CRYPTO_SET_EX_DATA 102
#define CRYPTO_F_DEF_ADD_INDEX 104
#define CRYPTO_F_DEF_GET_CLASS 105
+#define CRYPTO_F_FIPS_MODE_SET 109
#define CRYPTO_F_INT_DUP_EX_DATA 106
#define CRYPTO_F_INT_FREE_EX_DATA 107
#define CRYPTO_F_INT_NEW_EX_DATA 108
/* Reason codes. */
+#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED 101
#define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK 100
#ifdef __cplusplus
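To make the fips_md_init_ctx macro concrete: a digest source file writes its init function as, e.g., fips_md_init_ctx(SHA1, SHA) { ...body... } (SHA1's context type being SHA_CTX), and with OPENSSL_FIPS defined that expands to roughly the following; shown for illustration only, with private_SHA1_Init being the renamed original init:

#include <openssl/crypto.h>
#include <openssl/sha.h>

int private_SHA1_Init(SHA_CTX *c);   /* the renamed original init */

int SHA1_Init(SHA_CTX *c)
{
    if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__,
        "Low level API call to digest SHA1 forbidden in FIPS mode!");
    return private_SHA1_Init(c);
}

Without OPENSSL_FIPS the macro collapses to a plain definition of SHA1_Init, so callers are unaffected either way.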
diff --git a/crypto/des/asm/crypt586.S b/crypto/des/asm/crypt586.S
new file mode 100644
index 0000000..fb321ba
--- /dev/null
+++ b/crypto/des/asm/crypt586.S
@@ -0,0 +1,879 @@
+.file "crypt586.s"
+.text
+.globl fcrypt_body
+.type fcrypt_body,@function
+.align 16
+fcrypt_body:
+.L_fcrypt_body_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ xorl %edi,%edi
+ xorl %esi,%esi
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl DES_SPtrans@GOT(%edx),%edx
+ pushl %edx
+ movl 28(%esp),%ebp
+ pushl $25
+.L001start:
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl (%ebp),%ebx
+ xorl %ebx,%eax
+ movl 4(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 8(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 12(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 16(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 20(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 24(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 28(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 32(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 36(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 40(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 44(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 48(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 52(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 56(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 60(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 64(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 68(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 72(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 76(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 80(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 84(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 88(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 92(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 96(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 100(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 104(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 108(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 112(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 116(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 120(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 124(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+ movl (%esp),%ebx
+ movl %edi,%eax
+ decl %ebx
+ movl %esi,%edi
+ movl %eax,%esi
+ movl %ebx,(%esp)
+ jnz .L001start
+
+
+ movl 28(%esp),%edx
+ rorl $1,%edi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0xaaaaaaaa,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $23,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $10,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x33333333,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $18,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ roll $12,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xf0f0f0f0,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %edi,4(%edx)
+ addl $8,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size fcrypt_body,.-.L_fcrypt_body_begin
diff --git a/crypto/des/asm/des-586.S b/crypto/des/asm/des-586.S
new file mode 100644
index 0000000..2fbd340
--- /dev/null
+++ b/crypto/des/asm/des-586.S
@@ -0,0 +1,1837 @@
+.file "des-586.s"
+.text
+.globl DES_SPtrans
+.type _x86_DES_encrypt,@function
+.align 16
+_x86_DES_encrypt:
+ pushl %ecx
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_encrypt,.-_x86_DES_encrypt
+.type _x86_DES_decrypt,@function
+.align 16
+_x86_DES_decrypt:
+ pushl %ecx
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_decrypt,.-_x86_DES_decrypt
+.globl DES_encrypt1
+.type DES_encrypt1,@function
+.align 16
+DES_encrypt1:
+.L_DES_encrypt1_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%esi
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%esi),%eax
+ movl 28(%esp),%ebx
+ movl 4(%esi),%edi
+
+
+ roll $4,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xf0f0f0f0,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $20,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $14,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x33333333,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $22,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x03fc03fc,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $9,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xaaaaaaaa,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $1,%edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L000pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L001decrypt
+ call _x86_DES_encrypt
+ jmp .L002done
+.L001decrypt:
+ call _x86_DES_decrypt
+.L002done:
+
+
+ movl 20(%esp),%edx
+ rorl $1,%esi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %esi,4(%edx)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt1,.-.L_DES_encrypt1_begin
+.globl DES_encrypt2
+.type DES_encrypt2,@function
+.align 16
+DES_encrypt2:
+.L_DES_encrypt2_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%eax
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%eax),%esi
+ movl 28(%esp),%ebx
+ roll $3,%esi
+ movl 4(%eax),%edi
+ roll $3,%edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L003pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L004decrypt
+ call _x86_DES_encrypt
+ jmp .L005done
+.L004decrypt:
+ call _x86_DES_decrypt
+.L005done:
+
+
+ rorl $3,%edi
+ movl 20(%esp),%eax
+ rorl $3,%esi
+ movl %edi,(%eax)
+ movl %esi,4(%eax)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt2,.-.L_DES_encrypt2_begin
+.globl DES_encrypt3
+.type DES_encrypt3,@function
+.align 16
+DES_encrypt3:
+.L_DES_encrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%eax
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%esi
+ movl $1,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_encrypt3,.-.L_DES_encrypt3_begin
+.globl DES_decrypt3
+.type DES_decrypt3,@function
+.align 16
+DES_decrypt3:
+.L_DES_decrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%esi
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%eax
+ movl $0,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_decrypt3,.-.L_DES_decrypt3_begin
+.globl DES_ncbc_encrypt
+.type DES_ncbc_encrypt,@function
+.align 16
+DES_ncbc_encrypt:
+.L_DES_ncbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ pushl %ecx
+
+ movl 52(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L006decrypt
+ andl $4294967288,%ebp
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ jz .L007encrypt_finish
+.L008encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L008encrypt_loop
+.L007encrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ call .L010PIC_point
+.L010PIC_point:
+ popl %edx
+ leal .L011cbc_enc_jmp_table-.L010PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L012ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L013ej6:
+ movb 5(%esi),%dh
+.L014ej5:
+ movb 4(%esi),%dl
+.L015ej4:
+ movl (%esi),%ecx
+ jmp .L016ejend
+.L017ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L018ej2:
+ movb 1(%esi),%ch
+.L019ej1:
+ movb (%esi),%cl
+.L016ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L009finish
+.L006decrypt:
+ andl $4294967288,%ebp
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ jz .L020decrypt_finish
+.L021decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,20(%esp)
+ movl %ebx,24(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L021decrypt_loop
+.L020decrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L022dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L023dj6:
+ movb %dh,5(%edi)
+.L024dj5:
+ movb %dl,4(%edi)
+.L025dj4:
+ movl %ecx,(%edi)
+ jmp .L026djend
+.L027dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L028dj2:
+	movb	%ch,1(%edi)
+.L029dj1:
+	movb	%cl,(%edi)
+.L026djend:
+ jmp .L009finish
+.L009finish:
+ movl 64(%esp),%ecx
+ addl $28,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L011cbc_enc_jmp_table:
+.long 0
+.long .L019ej1-.L010PIC_point
+.long .L018ej2-.L010PIC_point
+.long .L017ej3-.L010PIC_point
+.long .L015ej4-.L010PIC_point
+.long .L014ej5-.L010PIC_point
+.long .L013ej6-.L010PIC_point
+.long .L012ej7-.L010PIC_point
+.align 64
+.size DES_ncbc_encrypt,.-.L_DES_ncbc_encrypt_begin
+.globl DES_ede3_cbc_encrypt
+.type DES_ede3_cbc_encrypt,@function
+.align 16
+DES_ede3_cbc_encrypt:
+.L_DES_ede3_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 44(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 64(%esp),%ecx
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L030decrypt
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L031encrypt_finish
+.L032encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L032encrypt_loop
+.L031encrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ call .L034PIC_point
+.L034PIC_point:
+ popl %edx
+ leal .L035cbc_enc_jmp_table-.L034PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L036ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L037ej6:
+ movb 5(%esi),%dh
+.L038ej5:
+ movb 4(%esi),%dl
+.L039ej4:
+ movl (%esi),%ecx
+ jmp .L040ejend
+.L041ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L042ej2:
+ movb 1(%esi),%ch
+.L043ej1:
+ movb (%esi),%cl
+.L040ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L033finish
+.L030decrypt:
+ andl $4294967288,%ebp
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ jz .L044decrypt_finish
+.L045decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,24(%esp)
+ movl %ebx,28(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L045decrypt_loop
+.L044decrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L046dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L047dj6:
+ movb %dh,5(%edi)
+.L048dj5:
+ movb %dl,4(%edi)
+.L049dj4:
+ movl %ecx,(%edi)
+ jmp .L050djend
+.L051dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L052dj2:
+	movb	%ch,1(%edi)
+.L053dj1:
+	movb	%cl,(%edi)
+.L050djend:
+ jmp .L033finish
+.L033finish:
+ movl 76(%esp),%ecx
+ addl $32,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L035cbc_enc_jmp_table:
+.long 0
+.long .L043ej1-.L034PIC_point
+.long .L042ej2-.L034PIC_point
+.long .L041ej3-.L034PIC_point
+.long .L039ej4-.L034PIC_point
+.long .L038ej5-.L034PIC_point
+.long .L037ej6-.L034PIC_point
+.long .L036ej7-.L034PIC_point
+.align 64
+.size DES_ede3_cbc_encrypt,.-.L_DES_ede3_cbc_encrypt_begin
+.align 64
+DES_SPtrans:
+.long 34080768,524288,33554434,34080770
+.long 33554432,526338,524290,33554434
+.long 526338,34080768,34078720,2050
+.long 33556482,33554432,0,524290
+.long 524288,2,33556480,526336
+.long 34080770,34078720,2050,33556480
+.long 2,2048,526336,34078722
+.long 2048,33556482,34078722,0
+.long 0,34080770,33556480,524290
+.long 34080768,524288,2050,33556480
+.long 34078722,2048,526336,33554434
+.long 526338,2,33554434,34078720
+.long 34080770,526336,34078720,33556482
+.long 33554432,2050,524290,0
+.long 524288,33554432,33556482,34080768
+.long 2,34078722,2048,526338
+.long 1074823184,0,1081344,1074790400
+.long 1073741840,32784,1073774592,1081344
+.long 32768,1074790416,16,1073774592
+.long 1048592,1074823168,1074790400,16
+.long 1048576,1073774608,1074790416,32768
+.long 1081360,1073741824,0,1048592
+.long 1073774608,1081360,1074823168,1073741840
+.long 1073741824,1048576,32784,1074823184
+.long 1048592,1074823168,1073774592,1081360
+.long 1074823184,1048592,1073741840,0
+.long 1073741824,32784,1048576,1074790416
+.long 32768,1073741824,1081360,1073774608
+.long 1074823168,32768,0,1073741840
+.long 16,1074823184,1081344,1074790400
+.long 1074790416,1048576,32784,1073774592
+.long 1073774608,16,1074790400,1081344
+.long 67108865,67371264,256,67109121
+.long 262145,67108864,67109121,262400
+.long 67109120,262144,67371008,1
+.long 67371265,257,1,67371009
+.long 0,262145,67371264,256
+.long 257,67371265,262144,67108865
+.long 67371009,67109120,262401,67371008
+.long 262400,0,67108864,262401
+.long 67371264,256,1,262144
+.long 257,262145,67371008,67109121
+.long 0,67371264,262400,67371009
+.long 262145,67108864,67371265,1
+.long 262401,67108865,67108864,67371265
+.long 262144,67109120,67109121,262400
+.long 67109120,0,67371009,257
+.long 67108865,262401,256,67371008
+.long 4198408,268439552,8,272633864
+.long 0,272629760,268439560,4194312
+.long 272633856,268435464,268435456,4104
+.long 268435464,4198408,4194304,268435456
+.long 272629768,4198400,4096,8
+.long 4198400,268439560,272629760,4096
+.long 4104,0,4194312,272633856
+.long 268439552,272629768,272633864,4194304
+.long 272629768,4104,4194304,268435464
+.long 4198400,268439552,8,272629760
+.long 268439560,0,4096,4194312
+.long 0,272629768,272633856,4096
+.long 268435456,272633864,4198408,4194304
+.long 272633864,8,268439552,4198408
+.long 4194312,4198400,272629760,268439560
+.long 4104,268435456,268435464,272633856
+.long 134217728,65536,1024,134284320
+.long 134283296,134218752,66592,134283264
+.long 65536,32,134217760,66560
+.long 134218784,134283296,134284288,0
+.long 66560,134217728,65568,1056
+.long 134218752,66592,0,134217760
+.long 32,134218784,134284320,65568
+.long 134283264,1024,1056,134284288
+.long 134284288,134218784,65568,134283264
+.long 65536,32,134217760,134218752
+.long 134217728,66560,134284320,0
+.long 66592,134217728,1024,65568
+.long 134218784,1024,0,134284320
+.long 134283296,134284288,1056,65536
+.long 66560,134283296,134218752,1056
+.long 32,66592,134283264,134217760
+.long 2147483712,2097216,0,2149588992
+.long 2097216,8192,2147491904,2097152
+.long 8256,2149589056,2105344,2147483648
+.long 2147491840,2147483712,2149580800,2105408
+.long 2097152,2147491904,2149580864,0
+.long 8192,64,2149588992,2149580864
+.long 2149589056,2149580800,2147483648,8256
+.long 64,2105344,2105408,2147491840
+.long 8256,2147483648,2147491840,2105408
+.long 2149588992,2097216,0,2147491840
+.long 2147483648,8192,2149580864,2097152
+.long 2097216,2149589056,2105344,64
+.long 2149589056,2105344,2097152,2147491904
+.long 2147483712,2149580800,2105408,0
+.long 8192,2147483712,2147491904,2149588992
+.long 2149580800,8256,64,2149580864
+.long 16384,512,16777728,16777220
+.long 16794116,16388,16896,0
+.long 16777216,16777732,516,16793600
+.long 4,16794112,16793600,516
+.long 16777732,16384,16388,16794116
+.long 0,16777728,16777220,16896
+.long 16793604,16900,16794112,4
+.long 16900,16793604,512,16777216
+.long 16900,16793600,16793604,516
+.long 16384,512,16777216,16793604
+.long 16777732,16900,16896,0
+.long 512,16777220,4,16777728
+.long 0,16777732,16777728,16896
+.long 516,16384,16794116,16777216
+.long 16794112,4,16388,16794116
+.long 16777220,16794112,16793600,16388
+.long 545259648,545390592,131200,0
+.long 537001984,8388736,545259520,545390720
+.long 128,536870912,8519680,131200
+.long 8519808,537002112,536871040,545259520
+.long 131072,8519808,8388736,537001984
+.long 545390720,536871040,0,8519680
+.long 536870912,8388608,537002112,545259648
+.long 8388608,131072,545390592,128
+.long 8388608,131072,536871040,545390720
+.long 131200,536870912,0,8519680
+.long 545259648,537002112,537001984,8388736
+.long 545390592,128,8388736,537001984
+.long 545390720,8388608,545259520,536871040
+.long 8519680,131200,537002112,545259520
+.long 128,545390592,8519808,0
+.long 536870912,545259648,131072,8519808
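For orientation: the tables above are DES_SPtrans, and with them des-586.S is complete; the file implements the public entry points declared in crypto/des/des.h. A minimal sketch of driving DES_ncbc_encrypt from C follows; it is illustrative only (the key and plaintext bytes are invented, not taken from this import):

#include <string.h>
#include <openssl/des.h>

/* CBC-encrypt one block, then decrypt it again, through the
 * DES_ncbc_encrypt entry point implemented in des-586.S.
 * Returns 1 on a successful round trip. */
static int des_cbc_roundtrip(void)
{
    DES_cblock key = {0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef};
    DES_cblock iv_enc, iv_dec;
    DES_key_schedule ks;
    unsigned char pt[8] = "ABCDEFG";    /* 7 chars + NUL = 8 bytes */
    unsigned char ct[8], out[8];

    DES_set_odd_parity(&key);
    if (DES_set_key_checked(&key, &ks) != 0)
        return 0;                       /* weak key or bad parity */

    memset(&iv_enc, 0, sizeof(iv_enc));
    memset(&iv_dec, 0, sizeof(iv_dec));

    DES_ncbc_encrypt(pt, ct, sizeof(pt), &ks, &iv_enc, DES_ENCRYPT);
    DES_ncbc_encrypt(ct, out, sizeof(ct), &ks, &iv_dec, DES_DECRYPT);

    return memcmp(pt, out, sizeof(pt)) == 0;
}

Note that DES_ncbc_encrypt updates the IV buffer in place (the "n" in ncbc), which is why separate iv_enc and iv_dec copies are kept above.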
diff --git a/crypto/des/des.h b/crypto/des/des.h
index 92b6663..1eaedcb 100644
--- a/crypto/des/des.h
+++ b/crypto/des/des.h
@@ -224,6 +224,9 @@ int DES_set_key(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule);
void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#ifdef OPENSSL_FIPS
+void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#endif
void DES_string_to_key(const char *str,DES_cblock *key);
void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2);
void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length,
diff --git a/crypto/des/set_key.c b/crypto/des/set_key.c
index 3004cc3..da4d62e 100644
--- a/crypto/des/set_key.c
+++ b/crypto/des/set_key.c
@@ -63,6 +63,7 @@
* 1.1 added norm_expand_bits
* 1.0 First working version
*/
+#include <openssl/crypto.h>
#include "des_locl.h"
OPENSSL_IMPLEMENT_GLOBAL(int,DES_check_key,0) /* defaults to false */
@@ -335,6 +336,13 @@ int DES_set_key_checked(const_DES_cblock *key, DES_key_schedule *schedule)
}
void DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(DES);
+ private_DES_set_key_unchecked(key, schedule);
+ }
+void private_DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#endif
{
static const int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0};
register DES_LONG c,d,t,s,t2;
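The hunk above relies on a slightly unusual preprocessor pattern: with OPENSSL_FIPS defined, DES_set_key_unchecked compiles as a small gate that calls fips_cipher_abort() and forwards to private_DES_set_key_unchecked, and the original function body that follows becomes the private_ worker; without OPENSSL_FIPS, the same braces compile directly as DES_set_key_unchecked. A compilable miniature of the pattern, with hypothetical names (MY_FIPS, set_thing) standing in for the real ones:

#include <stdio.h>

#ifdef MY_FIPS                         /* stands in for OPENSSL_FIPS */
static void private_set_thing(int v);
#endif

/* With MY_FIPS defined this is only a gate in front of the private_
 * worker; without it, the braces at the bottom are the public body. */
void set_thing(int v)
#ifdef MY_FIPS
    {
    /* FIPS-mode checks would run here */
    private_set_thing(v);
    }
static void private_set_thing(int v)
#endif
    {
    printf("worker got %d\n", v);
    }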
diff --git a/crypto/des/str2key.c b/crypto/des/str2key.c
index 9c2054b..1077f99 100644
--- a/crypto/des/str2key.c
+++ b/crypto/des/str2key.c
@@ -56,8 +56,8 @@
* [including the GNU Public Licence.]
*/
-#include "des_locl.h"
#include <openssl/crypto.h>
+#include "des_locl.h"
void DES_string_to_key(const char *str, DES_cblock *key)
{
diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h
index 849309a..ea59e61 100644
--- a/crypto/dh/dh.h
+++ b/crypto/dh/dh.h
@@ -86,6 +86,21 @@
* be used for all exponents.
*/
+/* If this flag is set the DH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DH_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DH_FLAG_NON_FIPS_ALLOW 0x0400
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -230,6 +245,9 @@ void ERR_load_DH_strings(void);
#define DH_F_COMPUTE_KEY 102
#define DH_F_DHPARAMS_PRINT_FP 101
#define DH_F_DH_BUILTIN_GENPARAMS 106
+#define DH_F_DH_COMPUTE_KEY 114
+#define DH_F_DH_GENERATE_KEY 115
+#define DH_F_DH_GENERATE_PARAMETERS_EX 116
#define DH_F_DH_NEW_METHOD 105
#define DH_F_DH_PARAM_DECODE 107
#define DH_F_DH_PRIV_DECODE 110
@@ -249,7 +267,9 @@ void ERR_load_DH_strings(void);
#define DH_R_DECODE_ERROR 104
#define DH_R_INVALID_PUBKEY 102
#define DH_R_KEYS_NOT_SET 108
+#define DH_R_KEY_SIZE_TOO_SMALL 110
#define DH_R_MODULUS_TOO_LARGE 103
+#define DH_R_NON_FIPS_METHOD 111
#define DH_R_NO_PARAMETERS_SET 107
#define DH_R_NO_PRIVATE_VALUE 100
#define DH_R_PARAMETER_ENCODING_ERROR 105
diff --git a/crypto/dh/dh_ameth.c b/crypto/dh/dh_ameth.c
index 377caf9..02ec2d4 100644
--- a/crypto/dh/dh_ameth.c
+++ b/crypto/dh/dh_ameth.c
@@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth =
dh_copy_parameters,
dh_cmp_parameters,
dh_param_print,
+ 0,
int_dh_free,
0
diff --git a/crypto/dh/dh_err.c b/crypto/dh/dh_err.c
index d5cf0c2..56d3df7 100644
--- a/crypto/dh/dh_err.c
+++ b/crypto/dh/dh_err.c
@@ -1,6 +1,6 @@
/* crypto/dh/dh_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -73,6 +73,9 @@ static ERR_STRING_DATA DH_str_functs[]=
{ERR_FUNC(DH_F_COMPUTE_KEY), "COMPUTE_KEY"},
{ERR_FUNC(DH_F_DHPARAMS_PRINT_FP), "DHparams_print_fp"},
{ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS), "DH_BUILTIN_GENPARAMS"},
+{ERR_FUNC(DH_F_DH_COMPUTE_KEY), "DH_compute_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_KEY), "DH_generate_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX), "DH_generate_parameters_ex"},
{ERR_FUNC(DH_F_DH_NEW_METHOD), "DH_new_method"},
{ERR_FUNC(DH_F_DH_PARAM_DECODE), "DH_PARAM_DECODE"},
{ERR_FUNC(DH_F_DH_PRIV_DECODE), "DH_PRIV_DECODE"},
@@ -95,7 +98,9 @@ static ERR_STRING_DATA DH_str_reasons[]=
{ERR_REASON(DH_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(DH_R_INVALID_PUBKEY) ,"invalid public key"},
{ERR_REASON(DH_R_KEYS_NOT_SET) ,"keys not set"},
+{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL) ,"key size too small"},
{ERR_REASON(DH_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(DH_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
diff --git a/crypto/dh/dh_gen.c b/crypto/dh/dh_gen.c
index cfd5b11..7b1fe9c 100644
--- a/crypto/dh/dh_gen.c
+++ b/crypto/dh/dh_gen.c
@@ -66,12 +66,29 @@
#include <openssl/bn.h>
#include <openssl/dh.h>
+#ifdef OPENSSL_FIPS
+#include
+#endif
+
static int dh_builtin_genparams(DH *ret, int prime_len, int generator, BN_GENCB *cb);
int DH_generate_parameters_ex(DH *ret, int prime_len, int generator, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(ret->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_PARAMETERS_EX, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->generate_params)
return ret->meth->generate_params(ret, prime_len, generator, cb);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_generate_parameters_ex(ret, prime_len,
+ generator, cb);
+#endif
return dh_builtin_genparams(ret, prime_len, generator, cb);
}
diff --git a/crypto/dh/dh_key.c b/crypto/dh/dh_key.c
index e7db440..89a74db 100644
--- a/crypto/dh/dh_key.c
+++ b/crypto/dh/dh_key.c
@@ -73,11 +73,27 @@ static int dh_finish(DH *dh);
int DH_generate_key(DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->generate_key(dh);
}
int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_COMPUTE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->compute_key(key, pub_key, dh);
}
@@ -138,8 +154,21 @@ static int generate_key(DH *dh)
if (generate_new_key)
{
- l = dh->length ? dh->length : BN_num_bits(dh->p)-1; /* secret exponent length */
- if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ if (dh->q)
+ {
+ do
+ {
+ if (!BN_rand_range(priv_key, dh->q))
+ goto err;
+ }
+ while (BN_is_zero(priv_key) || BN_is_one(priv_key));
+ }
+ else
+ {
+ /* secret exponent length */
+ l = dh->length ? dh->length : BN_num_bits(dh->p)-1;
+ if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ }
}
{
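Besides the FIPS gates, note the generate_key change above: when dh->q is present the private value is now drawn with BN_rand_range over [0, q), rejecting 0 and 1, instead of being a fixed-length random number. A hedged usage sketch of the gated public API; dh_demo and its self-agreement shortcut are illustrative, not part of the patch:

#include <openssl/dh.h>

/* Generate a key pair and derive a shared secret. In FIPS mode the
 * two gated calls below now fail with DH_R_NON_FIPS_METHOD unless the
 * DH method is FIPS-capable or DH_FLAG_NON_FIPS_ALLOW is set. */
static int dh_demo(DH *tmpl)
{
    unsigned char secret[1024];
    int len = -1;
    DH *dh = DHparams_dup(tmpl);       /* duplicate the p, g parameters */

    if (dh == NULL)
        return -1;
    if (DH_generate_key(dh) && DH_size(dh) <= (int)sizeof(secret))
        /* real code would pass the peer's public key here */
        len = DH_compute_key(secret, dh->pub_key, dh);
    DH_free(dh);
    return len;    /* shared-secret length, or -1 on failure */
}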
diff --git a/crypto/dh/dh_lib.c b/crypto/dh/dh_lib.c
index 7aef080..00218f2 100644
--- a/crypto/dh/dh_lib.c
+++ b/crypto/dh/dh_lib.c
@@ -64,6 +64,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DH_version[]="Diffie-Hellman" OPENSSL_VERSION_PTEXT;
static const DH_METHOD *default_DH_method = NULL;
@@ -76,7 +80,16 @@ void DH_set_default_method(const DH_METHOD *meth)
const DH_METHOD *DH_get_default_method(void)
{
if(!default_DH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_openssl();
+ else
+ return DH_OpenSSL();
+#else
default_DH_method = DH_OpenSSL();
+#endif
+ }
return default_DH_method;
}
@@ -156,7 +169,7 @@ DH *DH_new_method(ENGINE *engine)
ret->counter = NULL;
ret->method_mont_p=NULL;
ret->references = 1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DH_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DH, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
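One consequence of the flags change above: DH_FLAG_NON_FIPS_ALLOW is masked out of the method flags at construction time, so the opt-out can only come from the application, per key. An illustrative (hypothetical) opt-in helper:

#include <openssl/dh.h>

/* Explicitly permit a non-FIPS DH method on this one key while in
 * FIPS mode; the application then owns compliance for its use. */
static void dh_allow_non_fips(DH *dh)
{
    dh->flags |= DH_FLAG_NON_FIPS_ALLOW;
}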
diff --git a/crypto/dsa/dsa.h b/crypto/dsa/dsa.h
index ac50a5c..a6f6d0b 100644
--- a/crypto/dsa/dsa.h
+++ b/crypto/dsa/dsa.h
@@ -97,6 +97,21 @@
* be used for all exponents.
*/
+/* If this flag is set the DSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DSA_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DSA_FLAG_NON_FIPS_ALLOW 0x0400
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -272,6 +287,8 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSAPARAMS_PRINT_FP 101
#define DSA_F_DSA_DO_SIGN 112
#define DSA_F_DSA_DO_VERIFY 113
+#define DSA_F_DSA_GENERATE_KEY 124
+#define DSA_F_DSA_GENERATE_PARAMETERS_EX 123
#define DSA_F_DSA_NEW_METHOD 103
#define DSA_F_DSA_PARAM_DECODE 119
#define DSA_F_DSA_PRINT_FP 105
@@ -282,6 +299,7 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSA_SIGN 106
#define DSA_F_DSA_SIGN_SETUP 107
#define DSA_F_DSA_SIG_NEW 109
+#define DSA_F_DSA_SIG_PRINT 125
#define DSA_F_DSA_VERIFY 108
#define DSA_F_I2D_DSA_SIG 111
#define DSA_F_OLD_DSA_PRIV_DECODE 122
@@ -298,6 +316,8 @@ void ERR_load_DSA_strings(void);
#define DSA_R_INVALID_DIGEST_TYPE 106
#define DSA_R_MISSING_PARAMETERS 101
#define DSA_R_MODULUS_TOO_LARGE 103
+#define DSA_R_NEED_NEW_SETUP_VALUES 110
+#define DSA_R_NON_FIPS_DSA_METHOD 111
#define DSA_R_NO_PARAMETERS_SET 107
#define DSA_R_PARAMETER_ENCODING_ERROR 105
diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c
index 6413aae..376156e 100644
--- a/crypto/dsa/dsa_ameth.c
+++ b/crypto/dsa/dsa_ameth.c
@@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder)
return i2d_DSAPrivateKey(pkey->pkey.dsa, pder);
}
+static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+ const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx)
+ {
+ DSA_SIG *dsa_sig;
+ const unsigned char *p;
+ if (!sig)
+ {
+		if (BIO_puts(bp, "<No Signature>\n") <= 0)
+ return 0;
+ else
+ return 1;
+ }
+ p = sig->data;
+ dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length);
+ if (dsa_sig)
+ {
+ int rv = 0;
+ size_t buf_len = 0;
+ unsigned char *m=NULL;
+ update_buflen(dsa_sig->r, &buf_len);
+ update_buflen(dsa_sig->s, &buf_len);
+ m = OPENSSL_malloc(buf_len+10);
+ if (m == NULL)
+ {
+ DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (BIO_write(bp, "\n", 1) != 1)
+ goto err;
+
+ if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent))
+ goto err;
+ if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent))
+ goto err;
+ rv = 1;
+ err:
+ if (m)
+ OPENSSL_free(m);
+ DSA_SIG_free(dsa_sig);
+ return rv;
+ }
+ return X509_signature_dump(bp, sig, indent);
+ }
+
static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
switch (op)
@@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] =
dsa_copy_parameters,
dsa_cmp_parameters,
dsa_param_print,
+ dsa_sig_print,
int_dsa_free,
dsa_pkey_ctrl,
diff --git a/crypto/dsa/dsa_asn1.c b/crypto/dsa/dsa_asn1.c
index c37460b..6058534 100644
--- a/crypto/dsa/dsa_asn1.c
+++ b/crypto/dsa/dsa_asn1.c
@@ -61,6 +61,7 @@
#include <openssl/dsa.h>
#include <openssl/asn1.h>
#include <openssl/asn1t.h>
+#include <openssl/rand.h>
/* Override the default new methods */
static int sig_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -87,7 +88,7 @@ ASN1_SEQUENCE_cb(DSA_SIG, sig_cb) = {
ASN1_SIMPLE(DSA_SIG, s, CBIGNUM)
} ASN1_SEQUENCE_END_cb(DSA_SIG, DSA_SIG)
-IMPLEMENT_ASN1_FUNCTIONS_const(DSA_SIG)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DSA_SIG, DSA_SIG, DSA_SIG)
/* Override the default free and new methods */
static int dsa_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -148,3 +149,40 @@ DSA *DSAparams_dup(DSA *dsa)
{
return ASN1_item_dup(ASN1_ITEM_rptr(DSAparams), dsa);
}
+
+int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
+ unsigned int *siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ RAND_seed(dgst, dlen);
+ s=DSA_do_sign(dgst,dlen,dsa);
+ if (s == NULL)
+ {
+ *siglen=0;
+ return(0);
+ }
+ *siglen=i2d_DSA_SIG(s,&sig);
+ DSA_SIG_free(s);
+ return(1);
+ }
+
+/* data has already been hashed (probably with SHA or SHA-1). */
+/* returns
+ * 1: correct signature
+ * 0: incorrect signature
+ * -1: error
+ */
+int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
+ const unsigned char *sigbuf, int siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ int ret=-1;
+
+ s = DSA_SIG_new();
+ if (s == NULL) return(ret);
+ if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
+ ret=DSA_do_verify(dgst,dgst_len,s,dsa);
+err:
+ DSA_SIG_free(s);
+ return(ret);
+ }
diff --git a/crypto/dsa/dsa_err.c b/crypto/dsa/dsa_err.c
index bba984e..00545b7 100644
--- a/crypto/dsa/dsa_err.c
+++ b/crypto/dsa/dsa_err.c
@@ -1,6 +1,6 @@
/* crypto/dsa/dsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,8 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP), "DSAparams_print_fp"},
{ERR_FUNC(DSA_F_DSA_DO_SIGN), "DSA_do_sign"},
{ERR_FUNC(DSA_F_DSA_DO_VERIFY), "DSA_do_verify"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_KEY), "DSA_generate_key"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_PARAMETERS_EX), "DSA_generate_parameters_ex"},
{ERR_FUNC(DSA_F_DSA_NEW_METHOD), "DSA_new_method"},
{ERR_FUNC(DSA_F_DSA_PARAM_DECODE), "DSA_PARAM_DECODE"},
{ERR_FUNC(DSA_F_DSA_PRINT_FP), "DSA_print_fp"},
@@ -86,6 +88,7 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSA_SIGN), "DSA_sign"},
{ERR_FUNC(DSA_F_DSA_SIGN_SETUP), "DSA_sign_setup"},
{ERR_FUNC(DSA_F_DSA_SIG_NEW), "DSA_SIG_new"},
+{ERR_FUNC(DSA_F_DSA_SIG_PRINT), "DSA_SIG_PRINT"},
{ERR_FUNC(DSA_F_DSA_VERIFY), "DSA_verify"},
{ERR_FUNC(DSA_F_I2D_DSA_SIG), "i2d_DSA_SIG"},
{ERR_FUNC(DSA_F_OLD_DSA_PRIV_DECODE), "OLD_DSA_PRIV_DECODE"},
@@ -105,6 +108,8 @@ static ERR_STRING_DATA DSA_str_reasons[]=
{ERR_REASON(DSA_R_INVALID_DIGEST_TYPE) ,"invalid digest type"},
{ERR_REASON(DSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(DSA_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES) ,"need new setup values"},
+{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD) ,"non fips dsa method"},
{ERR_REASON(DSA_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
{0,NULL}
diff --git a/crypto/dsa/dsa_gen.c b/crypto/dsa/dsa_gen.c
index cb0b453..c398761 100644
--- a/crypto/dsa/dsa_gen.c
+++ b/crypto/dsa/dsa_gen.c
@@ -81,13 +81,33 @@
#include <openssl/sha.h>
#include "dsa_locl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
int DSA_generate_parameters_ex(DSA *ret, int bits,
const unsigned char *seed_in, int seed_len,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(ret->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_PARAMETERS_EX, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->dsa_paramgen)
return ret->meth->dsa_paramgen(ret, bits, seed_in, seed_len,
counter_ret, h_ret, cb);
+#ifdef OPENSSL_FIPS
+ else if (FIPS_mode())
+ {
+ return FIPS_dsa_generate_parameters_ex(ret, bits,
+ seed_in, seed_len,
+ counter_ret, h_ret, cb);
+ }
+#endif
else
{
const EVP_MD *evpmd;
@@ -105,12 +125,13 @@ int DSA_generate_parameters_ex(DSA *ret, int bits,
}
return dsa_builtin_paramgen(ret, bits, qbits, evpmd,
- seed_in, seed_len, counter_ret, h_ret, cb);
+ seed_in, seed_len, NULL, counter_ret, h_ret, cb);
}
}
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
int ok=0;
@@ -201,8 +222,10 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
}
/* step 2 */
- EVP_Digest(seed, qsize, md, NULL, evpmd, NULL);
- EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL);
+ if (!EVP_Digest(seed, qsize, md, NULL, evpmd, NULL))
+ goto err;
+ if (!EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL))
+ goto err;
for (i = 0; i < qsize; i++)
md[i]^=buf2[i];
@@ -251,7 +274,9 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
break;
}
- EVP_Digest(buf, qsize, md ,NULL, evpmd, NULL);
+ if (!EVP_Digest(buf, qsize, md ,NULL, evpmd,
+ NULL))
+ goto err;
/* step 8 */
if (!BN_bin2bn(md, qsize, r0))
@@ -332,6 +357,8 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
}
if (counter_ret != NULL) *counter_ret=counter;
if (h_ret != NULL) *h_ret=h;
+ if (seed_out)
+ memcpy(seed_out, seed, qsize);
}
if(ctx)
{
diff --git a/crypto/dsa/dsa_key.c b/crypto/dsa/dsa_key.c
index c4aa86b..9cf669b 100644
--- a/crypto/dsa/dsa_key.c
+++ b/crypto/dsa/dsa_key.c
@@ -64,12 +64,28 @@
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
static int dsa_builtin_keygen(DSA *dsa);
int DSA_generate_key(DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_KEY, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(dsa->meth->dsa_keygen)
return dsa->meth->dsa_keygen(dsa);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_generate_key(dsa);
+#endif
return dsa_builtin_keygen(dsa);
}
diff --git a/crypto/dsa/dsa_lib.c b/crypto/dsa/dsa_lib.c
index e9b7590..96d8d0c 100644
--- a/crypto/dsa/dsa_lib.c
+++ b/crypto/dsa/dsa_lib.c
@@ -70,6 +70,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DSA_version[]="DSA" OPENSSL_VERSION_PTEXT;
static const DSA_METHOD *default_DSA_method = NULL;
@@ -82,7 +86,16 @@ void DSA_set_default_method(const DSA_METHOD *meth)
const DSA_METHOD *DSA_get_default_method(void)
{
if(!default_DSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_openssl();
+ else
+ return DSA_OpenSSL();
+#else
default_DSA_method = DSA_OpenSSL();
+#endif
+ }
return default_DSA_method;
}
@@ -163,7 +176,7 @@ DSA *DSA_new_method(ENGINE *engine)
ret->method_mont_p=NULL;
ret->references=1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DSA_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DSA, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
@@ -276,7 +289,8 @@ void *DSA_get_ex_data(DSA *d, int idx)
DH *DSA_dup_DH(const DSA *r)
{
/* DSA has p, q, g, optional pub_key, optional priv_key.
- * DH has p, optional length, g, optional pub_key, optional priv_key.
+ * DH has p, optional length, g, optional pub_key, optional priv_key,
+ * optional q.
*/
DH *ret = NULL;
@@ -290,7 +304,11 @@ DH *DSA_dup_DH(const DSA *r)
if ((ret->p = BN_dup(r->p)) == NULL)
goto err;
if (r->q != NULL)
+ {
ret->length = BN_num_bits(r->q);
+ if ((ret->q = BN_dup(r->q)) == NULL)
+ goto err;
+ }
if (r->g != NULL)
if ((ret->g = BN_dup(r->g)) == NULL)
goto err;
diff --git a/crypto/dsa/dsa_locl.h b/crypto/dsa/dsa_locl.h
index 2b8cfee..21e2e45 100644
--- a/crypto/dsa/dsa_locl.h
+++ b/crypto/dsa/dsa_locl.h
@@ -56,4 +56,5 @@
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb);
diff --git a/crypto/dsa/dsa_ossl.c b/crypto/dsa/dsa_ossl.c
index a3ddd7d..b3d78e5 100644
--- a/crypto/dsa/dsa_ossl.c
+++ b/crypto/dsa/dsa_ossl.c
@@ -136,6 +136,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
BN_CTX *ctx=NULL;
int reason=ERR_R_BN_LIB;
DSA_SIG *ret=NULL;
+ int noredo = 0;
BN_init(&m);
BN_init(&xr);
@@ -150,7 +151,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
if (s == NULL) goto err;
ctx=BN_CTX_new();
if (ctx == NULL) goto err;
-
+redo:
if ((dsa->kinv == NULL) || (dsa->r == NULL))
{
if (!DSA_sign_setup(dsa,ctx,&kinv,&r)) goto err;
@@ -161,6 +162,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
dsa->kinv=NULL;
r=dsa->r;
dsa->r=NULL;
+ noredo = 1;
}
@@ -181,6 +183,18 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
ret=DSA_SIG_new();
if (ret == NULL) goto err;
+ /* Redo if r or s is zero as required by FIPS 186-3: this is
+ * very unlikely.
+ */
+ if (BN_is_zero(r) || BN_is_zero(s))
+ {
+ if (noredo)
+ {
+ reason = DSA_R_NEED_NEW_SETUP_VALUES;
+ goto err;
+ }
+ goto redo;
+ }
ret->r = r;
ret->s = s;
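Usage sketch (not part of the patch): the redo logic above cannot repeat the nonce when precomputed values from DSA_sign_setup() were consumed, so a zero r or s then fails with DSA_R_NEED_NEW_SETUP_VALUES. A hedged sketch of that precomputed path; the helper name is hypothetical and the direct use of dsa->kinv/dsa->r assumes the public 1.0.x struct layout.

    #include <openssl/bn.h>
    #include <openssl/dsa.h>

    static DSA_SIG *sign_with_setup(DSA *dsa, const unsigned char *dgst,
                                    int dlen)
        {
        BN_CTX *ctx = BN_CTX_new();

        if (ctx == NULL)
            return NULL;
        /* Stash kinv and r on the DSA object; dsa_do_sign() consumes
         * them and sets noredo, so it cannot retry on r == 0 or s == 0. */
        if (!DSA_sign_setup(dsa, ctx, &dsa->kinv, &dsa->r))
            {
            BN_CTX_free(ctx);
            return NULL;
            }
        BN_CTX_free(ctx);
        return DSA_do_sign(dgst, dlen, dsa);
        }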
diff --git a/crypto/dsa/dsa_pmeth.c b/crypto/dsa/dsa_pmeth.c
index e2df54f..715d8d6 100644
--- a/crypto/dsa/dsa_pmeth.c
+++ b/crypto/dsa/dsa_pmeth.c
@@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
EVP_MD_type((const EVP_MD *)p2) != NID_dsa &&
EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
- EVP_MD_type((const EVP_MD *)p2) != NID_sha256)
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha512)
{
DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE);
return 0;
@@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
if (!dsa)
return 0;
ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd,
- NULL, 0, NULL, NULL, pcb);
+ NULL, 0, NULL, NULL, NULL, pcb);
if (ret)
EVP_PKEY_assign_DSA(pkey, dsa);
else
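Usage sketch (not part of the patch): parameter generation through the EVP layer, whose digest check above now also admits SHA-384 and SHA-512. The helper name is hypothetical and error details are trimmed.

    #include <openssl/evp.h>
    #include <openssl/dsa.h>

    static EVP_PKEY *gen_dsa_params_evp(int nbits)
        {
        EVP_PKEY *params = NULL;
        EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new_id(EVP_PKEY_DSA, NULL);

        if (pctx == NULL)
            return NULL;
        if (EVP_PKEY_paramgen_init(pctx) <= 0
            || EVP_PKEY_CTX_set_dsa_paramgen_bits(pctx, nbits) <= 0
            || EVP_PKEY_paramgen(pctx, &params) <= 0)
            params = NULL;
        EVP_PKEY_CTX_free(pctx);
        return params;
        }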
diff --git a/crypto/dsa/dsa_sign.c b/crypto/dsa/dsa_sign.c
index 17555e5..c3cc364 100644
--- a/crypto/dsa/dsa_sign.c
+++ b/crypto/dsa/dsa_sign.c
@@ -61,30 +61,54 @@
#include "cryptlib.h"
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#include <openssl/bn.h>
DSA_SIG * DSA_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_SIGN, DSA_R_NON_FIPS_DSA_METHOD);
+ return NULL;
+ }
+#endif
return dsa->meth->dsa_do_sign(dgst, dlen, dsa);
}
-int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
- unsigned int *siglen, DSA *dsa)
+int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
{
- DSA_SIG *s;
- RAND_seed(dgst, dlen);
- s=DSA_do_sign(dgst,dlen,dsa);
- if (s == NULL)
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
{
- *siglen=0;
- return(0);
+ DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
}
- *siglen=i2d_DSA_SIG(s,&sig);
- DSA_SIG_free(s);
- return(1);
+#endif
+ return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
}
-int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
+DSA_SIG *DSA_SIG_new(void)
{
- return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
+ DSA_SIG *sig;
+ sig = OPENSSL_malloc(sizeof(DSA_SIG));
+ if (!sig)
+ return NULL;
+ sig->r = NULL;
+ sig->s = NULL;
+ return sig;
+ }
+
+void DSA_SIG_free(DSA_SIG *sig)
+ {
+ if (sig)
+ {
+ if (sig->r)
+ BN_free(sig->r);
+ if (sig->s)
+ BN_free(sig->s);
+ OPENSSL_free(sig);
+ }
}
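Usage sketch (not part of the patch): signing with the DSA_SIG helpers now defined in this file and DER-encoding the result; the helper name is hypothetical.

    #include <openssl/dsa.h>

    /* Returns a malloc'd DER-encoded signature via *der, or 0 on error;
     * i2d_DSA_SIG() allocates the buffer when *der is NULL. */
    static int sign_digest(DSA *dsa, const unsigned char *dgst, int dlen,
                           unsigned char **der, int *derlen)
        {
        DSA_SIG *sig = DSA_do_sign(dgst, dlen, dsa);

        if (sig == NULL)
            return 0;
        *der = NULL;
        *derlen = i2d_DSA_SIG(sig, der);
        DSA_SIG_free(sig);
        return *derlen > 0;
        }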
diff --git a/crypto/dsa/dsa_vrf.c b/crypto/dsa/dsa_vrf.c
index 226a75f..674cb5f 100644
--- a/crypto/dsa/dsa_vrf.c
+++ b/crypto/dsa/dsa_vrf.c
@@ -64,26 +64,13 @@
int DSA_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_VERIFY, DSA_R_NON_FIPS_DSA_METHOD);
+ return -1;
+ }
+#endif
return dsa->meth->dsa_do_verify(dgst, dgst_len, sig, dsa);
}
-
-/* data has already been hashed (probably with SHA or SHA-1). */
-/* returns
- * 1: correct signature
- * 0: incorrect signature
- * -1: error
- */
-int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
- const unsigned char *sigbuf, int siglen, DSA *dsa)
- {
- DSA_SIG *s;
- int ret=-1;
-
- s = DSA_SIG_new();
- if (s == NULL) return(ret);
- if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
- ret=DSA_do_verify(dgst,dgst_len,s,dsa);
-err:
- DSA_SIG_free(s);
- return(ret);
- }
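Usage sketch (not part of the patch): DSA_do_verify() is tri-state, and the FIPS gate above adds a new way to reach the -1 error branch, so callers must treat only 1 as success. The helper name is hypothetical.

    #include <openssl/dsa.h>

    static int check_dsa_sig(DSA *dsa, const unsigned char *dgst, int dlen,
                             DSA_SIG *sig)
        {
        int rc = DSA_do_verify(dgst, dlen, sig, dsa);

        if (rc < 0)
            return -1; /* error, e.g. non-FIPS method in FIPS mode */
        return rc;     /* 1 = valid signature, 0 = invalid */
        }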
diff --git a/crypto/dso/dso_dlfcn.c b/crypto/dso/dso_dlfcn.c
index c2bc617..5f22548 100644
--- a/crypto/dso/dso_dlfcn.c
+++ b/crypto/dso/dso_dlfcn.c
@@ -86,7 +86,8 @@ DSO_METHOD *DSO_METHOD_dlfcn(void)
# if defined(_AIX) || defined(__CYGWIN__) || \
defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
(defined(__osf__) && !defined(RTLD_NEXT)) || \
- (defined(__OpenBSD__) && !defined(RTLD_SELF))
+ (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \
+ defined(__ANDROID__)
# undef HAVE_DLINFO
# endif
#endif
diff --git a/crypto/dso/dso_lib.c b/crypto/dso/dso_lib.c
index 8a15b79..7801529 100644
--- a/crypto/dso/dso_lib.c
+++ b/crypto/dso/dso_lib.c
@@ -237,11 +237,19 @@ DSO *DSO_load(DSO *dso, const char *filename, DSO_METHOD *meth, int flags)
if(ret->meth->dso_load == NULL)
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_UNSUPPORTED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
if(!ret->meth->dso_load(ret))
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_LOAD_FAILED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
/* Load succeeded */
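Usage sketch (not part of the patch): DSO_load() with a NULL first argument allocates the DSO itself, and the patched failure paths clear ret->filename so a half-loaded DSO is not mistaken for a loaded one. The helper name is hypothetical.

    #include <openssl/dso.h>

    static DSO *load_module(const char *name)
        {
        /* NULL dso and meth: allocate a fresh DSO with the default
         * method; returns NULL on failure. */
        return DSO_load(NULL, name, NULL, 0);
        }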
diff --git a/crypto/ec/ec.h b/crypto/ec/ec.h
index ee70781..dfe8710 100644
--- a/crypto/ec/ec.h
+++ b/crypto/ec/ec.h
@@ -151,7 +151,24 @@ const EC_METHOD *EC_GFp_mont_method(void);
*/
const EC_METHOD *EC_GFp_nist_method(void);
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/** Returns 64-bit optimized methods for nistp224
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp224_method(void);
+
+/** Returns 64-bit optimized methods for nistp256
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp256_method(void);
+
+/** Returns 64-bit optimized methods for nistp521
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp521_method(void);
+#endif
+#ifndef OPENSSL_NO_EC2M
/********************************************************************/
/* EC_METHOD for curves over GF(2^m) */
/********************************************************************/
@@ -161,6 +178,8 @@ const EC_METHOD *EC_GFp_nist_method(void);
*/
const EC_METHOD *EC_GF2m_simple_method(void);
+#endif
+
/********************************************************************/
/* EC_GROUP functions */
@@ -255,10 +274,10 @@ int EC_GROUP_get_curve_name(const EC_GROUP *group);
void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag);
int EC_GROUP_get_asn1_flag(const EC_GROUP *group);
-void EC_GROUP_set_point_conversion_form(EC_GROUP *, point_conversion_form_t);
+void EC_GROUP_set_point_conversion_form(EC_GROUP *group, point_conversion_form_t form);
point_conversion_form_t EC_GROUP_get_point_conversion_form(const EC_GROUP *);
-unsigned char *EC_GROUP_get0_seed(const EC_GROUP *);
+unsigned char *EC_GROUP_get0_seed(const EC_GROUP *x);
size_t EC_GROUP_get_seed_len(const EC_GROUP *);
size_t EC_GROUP_set_seed(EC_GROUP *, const unsigned char *, size_t len);
@@ -282,6 +301,7 @@ int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, co
*/
int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
/** Sets the parameter of a ec over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b
* \param group EC_GROUP object
* \param p BIGNUM with the polynomial defining the underlying field
@@ -301,7 +321,7 @@ int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, c
* \return 1 on success and 0 if an error occured
*/
int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Returns the number of bits needed to represent a field element
* \param group EC_GROUP object
* \return number of bits needed to represent a field element
@@ -342,7 +362,7 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx);
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Creates a new EC_GROUP object with the specified parameters defined
* over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b)
* \param p BIGNUM with the polynomial defining the underlying field
@@ -352,7 +372,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Creates a EC_GROUP object with a curve specified by a NID
* \param nid NID of the OID of the curve name
* \return newly created EC_GROUP object with specified curve or NULL
@@ -481,7 +501,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Sets the affine coordinates of a EC_POINT over GF2m
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -514,7 +534,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#endif
/** Encodes a EC_POINT object to a octet string
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -606,8 +626,8 @@ int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_CTX *c
*/
int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
-int EC_POINT_make_affine(const EC_GROUP *, EC_POINT *, BN_CTX *);
-int EC_POINTs_make_affine(const EC_GROUP *, size_t num, EC_POINT *[], BN_CTX *);
+int EC_POINT_make_affine(const EC_GROUP *group, EC_POINT *point, BN_CTX *ctx);
+int EC_POINTs_make_affine(const EC_GROUP *group, size_t num, EC_POINT *points[], BN_CTX *ctx);
/** Computes r = generator * n sum_{i=0}^num p[i] * m[i]
* \param group underlying EC_GROUP object
@@ -653,9 +673,11 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group);
/* EC_GROUP_get_basis_type() returns the NID of the basis type
* used to represent the field elements */
int EC_GROUP_get_basis_type(const EC_GROUP *);
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1,
unsigned int *k2, unsigned int *k3);
+#endif
#define OPENSSL_EC_NAMED_CURVE 0x001
@@ -689,11 +711,21 @@ typedef struct ec_key_st EC_KEY;
#define EC_PKEY_NO_PARAMETERS 0x001
#define EC_PKEY_NO_PUBKEY 0x002
+/* some values for the flags field */
+#define EC_FLAG_NON_FIPS_ALLOW 0x1
+#define EC_FLAG_FIPS_CHECKED 0x2
+
/** Creates a new EC_KEY object.
* \return EC_KEY object or NULL if an error occurred.
*/
EC_KEY *EC_KEY_new(void);
+int EC_KEY_get_flags(const EC_KEY *key);
+
+void EC_KEY_set_flags(EC_KEY *key, int flags);
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags);
+
/** Creates a new EC_KEY object using a named curve as underlying
* EC_GROUP object.
* \param nid NID of the named curve.
@@ -768,16 +800,24 @@ const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key);
int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub);
unsigned EC_KEY_get_enc_flags(const EC_KEY *key);
-void EC_KEY_set_enc_flags(EC_KEY *, unsigned int);
-point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *);
-void EC_KEY_set_conv_form(EC_KEY *, point_conversion_form_t);
+void EC_KEY_set_enc_flags(EC_KEY *eckey, unsigned int flags);
+point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key);
+void EC_KEY_set_conv_form(EC_KEY *eckey, point_conversion_form_t cform);
/* functions to set/get method specific data */
-void *EC_KEY_get_key_method_data(EC_KEY *,
+void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
-void EC_KEY_insert_key_method_data(EC_KEY *, void *data,
+/** Sets the key method data of an EC_KEY object, if none has yet been set.
+ * \param key EC_KEY object
+ * \param data opaque data to install.
+ * \param dup_func a function that duplicates |data|.
+ * \param free_func a function that frees |data|.
+ * \param clear_free_func a function that wipes and frees |data|.
+ * \return the previously set data pointer, or NULL if |data| was inserted.
+ */
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
/* wrapper functions for the underlying EC_GROUP object */
-void EC_KEY_set_asn1_flag(EC_KEY *, int);
+void EC_KEY_set_asn1_flag(EC_KEY *eckey, int asn1_flag);
/** Creates a table of pre-computed multiples of the generator to
* accelerate further EC_KEY operations.
@@ -799,6 +839,15 @@ int EC_KEY_generate_key(EC_KEY *key);
*/
int EC_KEY_check_key(const EC_KEY *key);
+/** Sets a public key from affine coordinates, performing
+ * necessary NIST PKV tests.
+ * \param key the EC_KEY object
+ * \param x public key x coordinate
+ * \param y public key y coordinate
+ * \return 1 on success and 0 otherwise.
+ */
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+
/********************************************************************/
/* de- and encoding functions for SEC1 ECPrivateKey */
@@ -926,6 +975,7 @@ void ERR_load_EC_strings(void);
/* Error codes for the EC functions. */
/* Function codes. */
+#define EC_F_BN_TO_FELEM 224
#define EC_F_COMPUTE_WNAF 143
#define EC_F_D2I_ECPARAMETERS 144
#define EC_F_D2I_ECPKPARAMETERS 145
@@ -968,6 +1018,15 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_GFP_MONT_FIELD_SQR 132
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE 189
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP 135
+#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE 225
+#define EC_F_EC_GFP_NISTP224_POINTS_MUL 228
+#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226
+#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE 230
+#define EC_F_EC_GFP_NISTP256_POINTS_MUL 231
+#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232
+#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE 233
+#define EC_F_EC_GFP_NISTP521_POINTS_MUL 234
+#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235
#define EC_F_EC_GFP_NIST_FIELD_MUL 200
#define EC_F_EC_GFP_NIST_FIELD_SQR 201
#define EC_F_EC_GFP_NIST_GROUP_SET_CURVE 202
@@ -1010,6 +1069,7 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_KEY_NEW 182
#define EC_F_EC_KEY_PRINT 180
#define EC_F_EC_KEY_PRINT_FP 181
+#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES 229
#define EC_F_EC_POINTS_MAKE_AFFINE 136
#define EC_F_EC_POINT_ADD 112
#define EC_F_EC_POINT_CMP 113
@@ -1040,6 +1100,9 @@ void ERR_load_EC_strings(void);
#define EC_F_I2D_ECPKPARAMETERS 191
#define EC_F_I2D_ECPRIVATEKEY 192
#define EC_F_I2O_ECPUBLICKEY 151
+#define EC_F_NISTP224_PRE_COMP_NEW 227
+#define EC_F_NISTP256_PRE_COMP_NEW 236
+#define EC_F_NISTP521_PRE_COMP_NEW 237
#define EC_F_O2I_ECPUBLICKEY 152
#define EC_F_OLD_EC_PRIV_DECODE 222
#define EC_F_PKEY_EC_CTRL 197
@@ -1052,12 +1115,15 @@ void ERR_load_EC_strings(void);
/* Reason codes. */
#define EC_R_ASN1_ERROR 115
#define EC_R_ASN1_UNKNOWN_FIELD 116
+#define EC_R_BIGNUM_OUT_OF_RANGE 144
#define EC_R_BUFFER_TOO_SMALL 100
+#define EC_R_COORDINATES_OUT_OF_RANGE 146
#define EC_R_D2I_ECPKPARAMETERS_FAILURE 117
#define EC_R_DECODE_ERROR 142
#define EC_R_DISCRIMINANT_IS_ZERO 118
#define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE 119
#define EC_R_FIELD_TOO_LARGE 143
+#define EC_R_GF2M_NOT_SUPPORTED 147
#define EC_R_GROUP2PKPARAMETERS_FAILURE 120
#define EC_R_I2D_ECPKPARAMETERS_FAILURE 121
#define EC_R_INCOMPATIBLE_OBJECTS 101
@@ -1092,6 +1158,7 @@ void ERR_load_EC_strings(void);
#define EC_R_UNKNOWN_GROUP 129
#define EC_R_UNKNOWN_ORDER 114
#define EC_R_UNSUPPORTED_FIELD 131
+#define EC_R_WRONG_CURVE_PARAMETERS 145
#define EC_R_WRONG_ORDER 130
#ifdef __cplusplus
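Usage sketch (not part of the patch): exercising the new EC_KEY flag accessors declared above on a P-256 key, which this patch can back with the optimized nistp256 method. The helper name is hypothetical.

    #include <openssl/ec.h>
    #include <openssl/obj_mac.h>

    static EC_KEY *make_p256_key(void)
        {
        EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);

        if (key == NULL)
            return NULL;
        /* Assumption: the caller explicitly opts out of FIPS enforcement. */
        EC_KEY_set_flags(key, EC_FLAG_NON_FIPS_ALLOW);
        if (!EC_KEY_generate_key(key))
            {
            EC_KEY_free(key);
            return NULL;
            }
        return key;
        }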
diff --git a/crypto/ec/ec2_mult.c b/crypto/ec/ec2_mult.c
index e12b9b2..26f4a78 100644
--- a/crypto/ec/ec2_mult.c
+++ b/crypto/ec/ec2_mult.c
@@ -71,6 +71,8 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
/* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective
* coordinates.
@@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group)
{
return ec_wNAF_have_precompute_mult(group);
}
+
+#endif
diff --git a/crypto/ec/ec2_oct.c b/crypto/ec/ec2_oct.c
new file mode 100644
index 0000000..f1d75e5
--- /dev/null
+++ b/crypto/ec/ec2_oct.c
@@ -0,0 +1,407 @@
+/* crypto/ec/ec2_oct.c */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
+ * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
+ * to the OpenSSL project.
+ *
+ * The ECC Code is licensed pursuant to the OpenSSL open source
+ * license provided below.
+ *
+ * The software is originally written by Sheueling Chang Shantz and
+ * Douglas Stebila of Sun Microsystems Laboratories.
+ *
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <openssl/err.h>
+
+#include "ec_lcl.h"
+
+#ifndef OPENSSL_NO_EC2M
+
+/* Calculates and sets the affine coordinates of an EC_POINT from the given
+ * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
+ * Note that the simple implementation only uses affine coordinates.
+ *
+ * The method is from the following publication:
+ *
+ * Harper, Menezes, Vanstone:
+ * "Public-Key Cryptosystems with Very Small Key Lengths",
+ * EUROCRYPT '92, Springer-Verlag LNCS 658,
+ * published February 1993
+ *
+ * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
+ * the same method, but claim no priority date earlier than July 29, 1994
+ * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
+ */
+int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp, *x, *y, *z;
+ int ret = 0, z0;
+
+ /* clear error queue */
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0) ? 1 : 0;
+
+ BN_CTX_start(ctx);
+ tmp = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ z = BN_CTX_get(ctx);
+ if (z == NULL) goto err;
+
+ if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
+ if (BN_is_zero(x))
+ {
+ if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
+ }
+ else
+ {
+ if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
+ if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
+ if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
+ if (!BN_GF2m_add(tmp, x, tmp)) goto err;
+ if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+ z0 = (BN_is_odd(z)) ? 1 : 0;
+ if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
+ if (z0 != y_bit)
+ {
+ if (!BN_GF2m_add(y, y, x)) goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
+
+/* Converts an EC_POINT to an octet string.
+ * If buf is NULL, the encoded length will be returned.
+ * If the length len of buf is smaller than required an error will be returned.
+ */
+size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ buf[0] = form;
+ if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (BN_is_odd(yxi)) buf[0]++;
+ }
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+/* Converts an octet string representation to an EC_POINT.
+ * Note that the simple implementation only uses affine coordinates.
+ */
+int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (y_bit != BN_is_odd(yxi))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+#endif
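Usage sketch (not part of the patch): the buf == NULL length-query convention implemented by ec_GF2m_simple_point2oct() above, driven through the public wrapper; the helper name is hypothetical.

    #include <openssl/ec.h>
    #include <openssl/crypto.h>

    static unsigned char *encode_point(const EC_GROUP *g, const EC_POINT *p,
                                       size_t *outlen, BN_CTX *ctx)
        {
        unsigned char *buf;
        size_t len = EC_POINT_point2oct(g, p, POINT_CONVERSION_COMPRESSED,
                                        NULL, 0, ctx); /* query length */

        if (len == 0 || (buf = OPENSSL_malloc(len)) == NULL)
            return NULL;
        if (EC_POINT_point2oct(g, p, POINT_CONVERSION_COMPRESSED,
                               buf, len, ctx) != len)
            {
            OPENSSL_free(buf);
            return NULL;
            }
        *outlen = len;
        return buf;
        }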
diff --git a/crypto/ec/ec2_smpl.c b/crypto/ec/ec2_smpl.c
index af94458..e0e59c7 100644
--- a/crypto/ec/ec2_smpl.c
+++ b/crypto/ec/ec2_smpl.c
@@ -71,10 +71,20 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GF2m_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gf2m_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_characteristic_two_field,
ec_GF2m_simple_group_init,
ec_GF2m_simple_group_finish,
@@ -93,9 +103,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* get_Jprojective_coordinates_GFp */,
ec_GF2m_simple_point_set_affine_coordinates,
ec_GF2m_simple_point_get_affine_coordinates,
- ec_GF2m_simple_set_compressed_coordinates,
- ec_GF2m_simple_point2oct,
- ec_GF2m_simple_oct2point,
+ 0,0,0,
ec_GF2m_simple_add,
ec_GF2m_simple_dbl,
ec_GF2m_simple_invert,
@@ -118,6 +126,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -405,340 +414,6 @@ int ec_GF2m_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_
return ret;
}
-
-/* Calculates and sets the affine coordinates of an EC_POINT from the given
- * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
- * Note that the simple implementation only uses affine coordinates.
- *
- * The method is from the following publication:
- *
- * Harper, Menezes, Vanstone:
- * "Public-Key Cryptosystems with Very Small Key Lengths",
- * EUROCRYPT '92, Springer-Verlag LNCS 658,
- * published February 1993
- *
- * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
- * the same method, but claim no priority date earlier than July 29, 1994
- * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
- */
-int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp, *x, *y, *z;
- int ret = 0, z0;
-
- /* clear error queue */
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0) ? 1 : 0;
-
- BN_CTX_start(ctx);
- tmp = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- z = BN_CTX_get(ctx);
- if (z == NULL) goto err;
-
- if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
- if (BN_is_zero(x))
- {
- if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
- }
- else
- {
- if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
- if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
- if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
- if (!BN_GF2m_add(tmp, x, tmp)) goto err;
- if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
- z0 = (BN_is_odd(z)) ? 1 : 0;
- if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
- if (z0 != y_bit)
- {
- if (!BN_GF2m_add(y, y, x)) goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-/* Converts an EC_POINT to an octet string.
- * If buf is NULL, the encoded length will be returned.
- * If the length len of buf is smaller than required an error will be returned.
- */
-size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y, *yxi;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- buf[0] = form;
- if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (BN_is_odd(yxi)) buf[0]++;
- }
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-/* Converts an octet string representation to an EC_POINT.
- * Note that the simple implementation only uses affine coordinates.
- */
-int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y, *yxi;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (y_bit != BN_is_odd(yxi))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
/* Computes a + b and stores the result in r. r could be a or b, a could be b.
* Uses algorithm A.10.2 of IEEE P1363.
*/
@@ -887,7 +562,7 @@ int ec_GF2m_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_
field_sqr = group->meth->field_sqr;
/* only support affine coordinates */
- if (!point->Z_is_one) goto err;
+ if (!point->Z_is_one) return -1;
if (ctx == NULL)
{
@@ -1040,3 +715,5 @@ int ec_GF2m_simple_field_div(const EC_GROUP *group, BIGNUM *r, const BIGNUM *a,
{
return BN_GF2m_mod_div(r, a, b, &group->field, ctx);
}
+
+#endif
diff --git a/crypto/ec/ec_ameth.c b/crypto/ec/ec_ameth.c
index c00f7d7..83909c1 100644
--- a/crypto/ec/ec_ameth.c
+++ b/crypto/ec/ec_ameth.c
@@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth =
ec_copy_parameters,
ec_cmp_parameters,
eckey_param_print,
+ 0,
int_ec_free,
ec_pkey_ctrl,
diff --git a/crypto/ec/ec_asn1.c b/crypto/ec/ec_asn1.c
index ae55539..175eec5 100644
--- a/crypto/ec/ec_asn1.c
+++ b/crypto/ec/ec_asn1.c
@@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group)
/* everything else is currently not supported */
return 0;
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
{
if (group == NULL)
@@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
return 1;
}
-
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
unsigned int *k2, unsigned int *k3)
{
@@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
return 1;
}
-
+#endif
/* some structures needed for the asn1 encoding */
@@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
else /* nid == NID_X9_62_characteristic_two_field */
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
int field_type;
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
}
+#endif
ok = 1;
@@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* nid == NID_X9_62_characteristic_two_field */
{
if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL))
@@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
-
+#endif
len_1 = (size_t)BN_num_bytes(tmp_1);
len_2 = (size_t)BN_num_bytes(tmp_2);
@@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* get the field parameters */
tmp = OBJ_obj2nid(params->fieldID->fieldType);
-
if (tmp == NID_X9_62_characteristic_two_field)
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* create the EC_GROUP structure */
ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL);
}
+#endif
else if (tmp == NID_X9_62_prime_field)
{
/* we have a curve over a prime field */
@@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len)
if ((group = ec_asn1_pkparameters2group(params)) == NULL)
{
ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE);
+ ECPKPARAMETERS_free(params);
return NULL;
}
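Usage sketch (not part of the patch): with the leak fix above, a failed d2i_ECPKParameters() no longer strands the intermediate ECPKPARAMETERS, so callers only manage the returned group. The helper name is hypothetical.

    #include <openssl/ec.h>

    static EC_GROUP *parse_ec_params(const unsigned char *der, long len)
        {
        const unsigned char *p = der; /* d2i advances this pointer */
        return d2i_ECPKParameters(NULL, &p, len);
        }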
diff --git a/crypto/ec/ec_curve.c b/crypto/ec/ec_curve.c
index 23274e4..c72fb26 100644
--- a/crypto/ec/ec_curve.c
+++ b/crypto/ec/ec_curve.c
@@ -3,7 +3,7 @@
* Written by Nils Larsch for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -72,6 +72,7 @@
#include "ec_lcl.h"
#include <openssl/err.h>
#include <openssl/obj_mac.h>
+#include <openssl/opensslconf.h>
typedef struct {
int field_type, /* either NID_X9_62_prime_field or
@@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; }
0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D }
};
+#ifndef OPENSSL_NO_EC2M
+
/* characteristic two curves */
static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; }
_EC_SECG_CHAR2_113R1 = {
@@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; }
{ 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */
0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD,
- 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
+ 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
0x07,
0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */
@@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; }
0xBA,0xFC,0xA7,0x5E }
};
+#endif
+
typedef struct _ec_list_element_st {
int nid;
const EC_CURVE_DATA *data;
+ const EC_METHOD *(*meth)(void);
const char *comment;
} ec_list_element;
static const ec_list_element curve_list[] = {
- /* prime field curves */
+ /* prime field curves */
/* secg curves */
- { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"},
- { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"},
- { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"},
- { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
+ { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" },
+ { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
/* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */
- { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"},
- { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"},
- { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"},
- { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"},
+ { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" },
+ { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" },
+#else
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" },
+#endif
+ { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" },
/* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */
- { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"},
- { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"},
+ { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" },
+#else
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" },
+#endif
/* X9.62 curves */
- { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"},
- { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"},
+ { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" },
+#else
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" },
+#endif
+#ifndef OPENSSL_NO_EC2M
/* characteristic two field curves */
/* NIST/SECG curves */
- { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"},
- { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"},
- { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"},
- { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" },
- { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"},
- { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" },
- { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"},
- { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"},
- { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"},
- { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" },
- { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" },
+ { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" },
+ { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" },
+ { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" },
+ { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" },
+ { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" },
/* X9.62 curves */
- { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"},
- { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"},
- { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"},
- { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"},
- { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"},
- { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"},
- { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"},
+ { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" },
+ { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" },
+ { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" },
+ { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" },
+ { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" },
+ { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" },
+ { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" },
/* the WAP/WTLS curves
* [unlike SECG, spec has its own OIDs for curves from X9.62] */
- { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" },
- { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"},
+ { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+#endif
+ { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
+ { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+#endif
+ { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
/* IPSec curves */
- { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
- { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
+ { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+ { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+#endif
};
#define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element))
-static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
+static EC_GROUP *ec_group_new_from_data(const ec_list_element curve)
{
EC_GROUP *group=NULL;
EC_POINT *P=NULL;
BN_CTX *ctx=NULL;
- BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
+ BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
int ok=0;
int seed_len,param_len;
+ const EC_METHOD *meth;
+ const EC_CURVE_DATA *data;
const unsigned char *params;
if ((ctx = BN_CTX_new()) == NULL)
@@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
+ data = curve.data;
seed_len = data->seed_len;
param_len = data->param_len;
- params = (const unsigned char *)(data+1); /* skip header */
- params += seed_len; /* skip seed */
+ params = (const unsigned char *)(data+1); /* skip header */
+ params += seed_len; /* skip seed */
if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL))
|| !(a = BN_bin2bn(params+1*param_len, param_len, NULL))
@@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
- if (data->field_type == NID_X9_62_prime_field)
+ if (curve.meth != 0)
+ {
+ meth = curve.meth();
+ if (((group = EC_GROUP_new(meth)) == NULL) ||
+ (!(group->meth->group_set_curve(group, p, a, b, ctx))))
+ {
+ ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
+ goto err;
+ }
+ }
+ else if (data->field_type == NID_X9_62_prime_field)
{
if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL)
{
@@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* field_type == NID_X9_62_characteristic_two_field */
{
if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL)
@@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#endif
if ((P = EC_POINT_new(group)) == NULL)
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
}
-
+
if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL))
|| !(y = BN_bin2bn(params+4*param_len, param_len, NULL)))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB);
goto err;
}
- if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx))
+ if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
@@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid)
	for (i=0; i<curve_list_length; i++)
diff --git a/crypto/ec/ec_cvt.c b/crypto/ec/ec_cvt.c
--- a/crypto/ec/ec_cvt.c
+++ b/crypto/ec/ec_cvt.c
+#if defined(OPENSSL_BN_ASM_MONT)
+	/*
+	 * The generic Montgomery prime method was observed to deliver
+	 * better performance than the NIST method, even for NIST primes,
+	 * on a range of platforms, so prefer it when assembler Montgomery
+	 * support is available.
+	 */
+ meth = EC_GFp_mont_method();
+#else
meth = EC_GFp_nist_method();
+#endif
ret = EC_GROUP_new(meth);
if (ret == NULL)
@@ -122,7 +147,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
-
+#ifndef OPENSSL_NO_EC2M
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
const EC_METHOD *meth;
@@ -142,3 +167,4 @@ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
+#endif
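
For orientation, a caller-side sketch (not part of the patch; the NIST P-224 constants are copied in hex purely for illustration) showing that the method switch above is transparent to applications: the same EC_GROUP_new_curve_GFp call lands on EC_GFp_mont_method() when OPENSSL_BN_ASM_MONT is defined and on EC_GFp_nist_method() otherwise.

#include <stdio.h>
#include <openssl/bn.h>
#include <openssl/ec.h>

int main(void)
	{
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *p = NULL, *a = NULL, *b = NULL;
	EC_GROUP *group;

	/* NIST P-224: y^2 = x^3 + a*x + b over GF(p), with a = p - 3 */
	BN_hex2bn(&p, "ffffffffffffffffffffffffffffffff000000000000000000000001");
	BN_hex2bn(&a, "fffffffffffffffffffffffffffffffefffffffffffffffffffffffe");
	BN_hex2bn(&b, "b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4");

	/* the method (Montgomery vs. NIST) is chosen internally */
	group = EC_GROUP_new_curve_GFp(p, a, b, ctx);
	printf("group %screated\n", group != NULL ? "" : "NOT ");

	EC_GROUP_free(group);
	BN_free(p); BN_free(a); BN_free(b);
	BN_CTX_free(ctx);
	return 0;
	}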
diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c
index 84b4833..0d19398 100644
--- a/crypto/ec/ec_err.c
+++ b/crypto/ec/ec_err.c
@@ -1,6 +1,6 @@
/* crypto/ec/ec_err.c */
/* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA EC_str_functs[]=
{
+{ERR_FUNC(EC_F_BN_TO_FELEM), "BN_TO_FELEM"},
{ERR_FUNC(EC_F_COMPUTE_WNAF), "COMPUTE_WNAF"},
{ERR_FUNC(EC_F_D2I_ECPARAMETERS), "d2i_ECParameters"},
{ERR_FUNC(EC_F_D2I_ECPKPARAMETERS), "d2i_ECPKParameters"},
@@ -112,6 +113,15 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_GFP_MONT_FIELD_SQR), "ec_GFp_mont_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE), "ec_GFp_mont_group_set_curve"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP), "EC_GFP_MONT_GROUP_SET_CURVE_GFP"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE), "ec_GFp_nistp224_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINTS_MUL), "ec_GFp_nistp224_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp224_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE), "ec_GFp_nistp256_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINTS_MUL), "ec_GFp_nistp256_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp256_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE), "ec_GFp_nistp521_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINTS_MUL), "ec_GFp_nistp521_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp521_point_get_affine_coordinates"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_MUL), "ec_GFp_nist_field_mul"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_SQR), "ec_GFp_nist_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_NIST_GROUP_SET_CURVE), "ec_GFp_nist_group_set_curve"},
@@ -154,6 +164,7 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_KEY_NEW), "EC_KEY_new"},
{ERR_FUNC(EC_F_EC_KEY_PRINT), "EC_KEY_print"},
{ERR_FUNC(EC_F_EC_KEY_PRINT_FP), "EC_KEY_print_fp"},
+{ERR_FUNC(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES), "EC_KEY_set_public_key_affine_coordinates"},
{ERR_FUNC(EC_F_EC_POINTS_MAKE_AFFINE), "EC_POINTs_make_affine"},
{ERR_FUNC(EC_F_EC_POINT_ADD), "EC_POINT_add"},
{ERR_FUNC(EC_F_EC_POINT_CMP), "EC_POINT_cmp"},
@@ -184,6 +195,9 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_I2D_ECPKPARAMETERS), "i2d_ECPKParameters"},
{ERR_FUNC(EC_F_I2D_ECPRIVATEKEY), "i2d_ECPrivateKey"},
{ERR_FUNC(EC_F_I2O_ECPUBLICKEY), "i2o_ECPublicKey"},
+{ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW), "NISTP224_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW), "NISTP256_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW), "NISTP521_PRE_COMP_NEW"},
{ERR_FUNC(EC_F_O2I_ECPUBLICKEY), "o2i_ECPublicKey"},
{ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE), "OLD_EC_PRIV_DECODE"},
{ERR_FUNC(EC_F_PKEY_EC_CTRL), "PKEY_EC_CTRL"},
@@ -199,12 +213,15 @@ static ERR_STRING_DATA EC_str_reasons[]=
{
{ERR_REASON(EC_R_ASN1_ERROR) ,"asn1 error"},
{ERR_REASON(EC_R_ASN1_UNKNOWN_FIELD) ,"asn1 unknown field"},
+{ERR_REASON(EC_R_BIGNUM_OUT_OF_RANGE) ,"bignum out of range"},
{ERR_REASON(EC_R_BUFFER_TOO_SMALL) ,"buffer too small"},
+{ERR_REASON(EC_R_COORDINATES_OUT_OF_RANGE),"coordinates out of range"},
{ERR_REASON(EC_R_D2I_ECPKPARAMETERS_FAILURE),"d2i ecpkparameters failure"},
{ERR_REASON(EC_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(EC_R_DISCRIMINANT_IS_ZERO) ,"discriminant is zero"},
{ERR_REASON(EC_R_EC_GROUP_NEW_BY_NAME_FAILURE),"ec group new by name failure"},
{ERR_REASON(EC_R_FIELD_TOO_LARGE) ,"field too large"},
+{ERR_REASON(EC_R_GF2M_NOT_SUPPORTED) ,"gf2m not supported"},
{ERR_REASON(EC_R_GROUP2PKPARAMETERS_FAILURE),"group2pkparameters failure"},
{ERR_REASON(EC_R_I2D_ECPKPARAMETERS_FAILURE),"i2d ecpkparameters failure"},
{ERR_REASON(EC_R_INCOMPATIBLE_OBJECTS) ,"incompatible objects"},
@@ -239,6 +256,7 @@ static ERR_STRING_DATA EC_str_reasons[]=
{ERR_REASON(EC_R_UNKNOWN_GROUP) ,"unknown group"},
{ERR_REASON(EC_R_UNKNOWN_ORDER) ,"unknown order"},
{ERR_REASON(EC_R_UNSUPPORTED_FIELD) ,"unsupported field"},
+{ERR_REASON(EC_R_WRONG_CURVE_PARAMETERS) ,"wrong curve parameters"},
{ERR_REASON(EC_R_WRONG_ORDER) ,"wrong order"},
{0,NULL}
};
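
The new function and reason strings above only surface once the error tables are loaded. A minimal sketch, using only standard 1.0.x APIs, of how an EC failure prints with these strings:

#include <stdio.h>
#include <openssl/ec.h>
#include <openssl/err.h>

int main(void)
	{
	unsigned long e;

	ERR_load_crypto_strings();	/* loads the EC tables above */

	/* provoke an EC error: NID 0 names no built-in curve */
	if (EC_GROUP_new_by_curve_name(0) == NULL)
		while ((e = ERR_get_error()) != 0)
			printf("%s\n", ERR_error_string(e, NULL));

	ERR_free_strings();
	return 0;
	}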
diff --git a/crypto/ec/ec_key.c b/crypto/ec/ec_key.c
index 522802c..7fa2475 100644
--- a/crypto/ec/ec_key.c
+++ b/crypto/ec/ec_key.c
@@ -64,7 +64,9 @@
#include <string.h>
#include "ec_lcl.h"
#include <openssl/err.h>
-#include <string.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
EC_KEY *EC_KEY_new(void)
{
@@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void)
}
ret->version = 1;
+ ret->flags = 0;
ret->group = NULL;
ret->pub_key = NULL;
ret->priv_key= NULL;
@@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src)
dest->enc_flag = src->enc_flag;
dest->conv_form = src->conv_form;
dest->version = src->version;
+ dest->flags = src->flags;
return dest;
}
@@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey)
BIGNUM *priv_key = NULL, *order = NULL;
EC_POINT *pub_key = NULL;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ec_key_generate_key(eckey);
+#endif
+
if (!eckey || !eckey->group)
{
ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER);
@@ -371,6 +380,82 @@ int EC_KEY_check_key(const EC_KEY *eckey)
return(ok);
}
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
+ {
+ BN_CTX *ctx = NULL;
+ BIGNUM *tx, *ty;
+ EC_POINT *point = NULL;
+ int ok = 0, tmp_nid, is_char_two = 0;
+
+ if (!key || !key->group || !x || !y)
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ ERR_R_PASSED_NULL_PARAMETER);
+ return 0;
+ }
+ ctx = BN_CTX_new();
+ if (!ctx)
+ goto err;
+
+ point = EC_POINT_new(key->group);
+
+ if (!point)
+ goto err;
+
+ tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group));
+
+ if (tmp_nid == NID_X9_62_characteristic_two_field)
+ is_char_two = 1;
+
+ tx = BN_CTX_get(ctx);
+ ty = BN_CTX_get(ctx);
+#ifndef OPENSSL_NO_EC2M
+ if (is_char_two)
+ {
+ if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+ else
+#endif
+ {
+ if (!EC_POINT_set_affine_coordinates_GFp(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GFp(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+	/* Check if retrieved coordinates match originals: if not, the
+	 * values are out of range.
+	 */
+ if (BN_cmp(x, tx) || BN_cmp(y, ty))
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
+ if (!EC_KEY_set_public_key(key, point))
+ goto err;
+
+ if (EC_KEY_check_key(key) == 0)
+ goto err;
+
+ ok = 1;
+
+ err:
+ if (ctx)
+ BN_CTX_free(ctx);
+ if (point)
+ EC_POINT_free(point);
+ return ok;
+
+ }
+
const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key)
{
return key->group;
@@ -435,18 +520,27 @@ void EC_KEY_set_conv_form(EC_KEY *key, point_conversion_form_t cform)
void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
- return EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ void *ret;
+
+ CRYPTO_r_lock(CRYPTO_LOCK_EC);
+ ret = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ CRYPTO_r_unlock(CRYPTO_LOCK_EC);
+
+ return ret;
}
-void EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
EC_EXTRA_DATA *ex_data;
+
CRYPTO_w_lock(CRYPTO_LOCK_EC);
ex_data = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
if (ex_data == NULL)
EC_EX_DATA_set_data(&key->method_data, data, dup_func, free_func, clear_free_func);
CRYPTO_w_unlock(CRYPTO_LOCK_EC);
+
+ return ex_data;
}
void EC_KEY_set_asn1_flag(EC_KEY *key, int flag)
@@ -461,3 +555,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx)
return 0;
return EC_GROUP_precompute_mult(key->group, ctx);
}
+
+int EC_KEY_get_flags(const EC_KEY *key)
+ {
+ return key->flags;
+ }
+
+void EC_KEY_set_flags(EC_KEY *key, int flags)
+ {
+ key->flags |= flags;
+ }
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags)
+ {
+ key->flags &= ~flags;
+ }
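
A short usage sketch for the new EC_KEY_set_public_key_affine_coordinates entry point (the hex inputs are hypothetical placeholders): it rejects coordinates that are out of range or off the curve, which is the validation motivating the EC_R_COORDINATES_OUT_OF_RANGE reason code added above.

#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* Returns 1 iff (x_hex, y_hex) is a valid public point on P-256. */
int load_peer_point(const char *x_hex, const char *y_hex)
	{
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
	BIGNUM *x = NULL, *y = NULL;
	int ok = 0;

	if (key == NULL)
		return 0;
	if (BN_hex2bn(&x, x_hex) && BN_hex2bn(&y, y_hex))
		/* round-trips the coordinates and runs EC_KEY_check_key */
		ok = EC_KEY_set_public_key_affine_coordinates(key, x, y);

	BN_free(x);
	BN_free(y);
	EC_KEY_free(key);
	return ok;
	}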
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index 3e2c34b..da7967d 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -3,7 +3,7 @@
* Originally written by Bodo Moeller for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -82,10 +82,15 @@
# endif
#endif
+/* Use default functions for point2oct, oct2point and compressed coordinates */
+#define EC_FLAGS_DEFAULT_OCT 0x1
+
/* Structure details are not part of the exported interface,
* so all this may change in future versions. */
struct ec_method_st {
+ /* Various method flags */
+ int flags;
/* used by EC_METHOD_get_field_type: */
int field_type; /* a NID */
@@ -244,6 +249,7 @@ struct ec_key_st {
point_conversion_form_t conv_form;
int references;
+ int flags;
EC_EXTRA_DATA *method_data;
} /* EC_KEY */;
@@ -391,3 +397,50 @@ int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ec2_mult.c */
+int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
+ size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* method functions in ecp_nistp224.c */
+int ec_GFp_nistp224_group_init(EC_GROUP *group);
+int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp224_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp256.c */
+int ec_GFp_nistp256_group_init(EC_GROUP *group);
+int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp256_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp521.c */
+int ec_GFp_nistp521_group_init(EC_GROUP *group);
+int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp521_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group);
+
+/* utility functions in ecp_nistputil.c */
+void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
+ size_t felem_size, void *tmp_felems,
+ void (*felem_one)(void *out),
+ int (*felem_is_zero)(const void *in),
+ void (*felem_assign)(void *out, const void *in),
+ void (*felem_square)(void *out, const void *in),
+ void (*felem_mul)(void *out, const void *in1, const void *in2),
+ void (*felem_inv)(void *out, const void *in),
+ void (*felem_contract)(void *out, const void *in));
+void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in);
+#endif
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index dd7da0f..25247b5 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -425,7 +425,7 @@ int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
if (group->meth->group_set_curve == 0)
@@ -446,7 +446,7 @@ int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM
}
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#endif
int EC_GROUP_get_degree(const EC_GROUP *group)
{
@@ -856,7 +856,7 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx)
{
@@ -872,7 +872,7 @@ int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
}
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#endif
int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
@@ -890,7 +890,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *p
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
{
@@ -906,75 +906,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *
}
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
-
-int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->point2oct == 0)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point2oct(group, point, form, buf, len, ctx);
- }
-
-
-int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->oct2point == 0)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->oct2point(group, point, buf, len, ctx);
- }
-
+#endif
int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
diff --git a/crypto/ec/ec_oct.c b/crypto/ec/ec_oct.c
new file mode 100644
index 0000000..fd9db07
--- /dev/null
+++ b/crypto/ec/ec_oct.c
@@ -0,0 +1,199 @@
+/* crypto/ec/ec_oct.c */
+/*
+ * Originally written by Bodo Moeller for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Binary polynomial ECC support in OpenSSL originally developed by
+ * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project.
+ */
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/opensslv.h>
+
+#include "ec_lcl.h"
+
+int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+#endif
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+
+#ifndef OPENSSL_NO_EC2M
+int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+#endif
+
+size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->point2oct == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_point2oct(group, point,
+ form, buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_point2oct(group, point,
+ form, buf, len, ctx);
+#endif
+ }
+
+ return group->meth->point2oct(group, point, form, buf, len, ctx);
+ }
+
+
+int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->oct2point == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_oct2point(group, point,
+ buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_oct2point(group, point,
+ buf, len, ctx);
+#endif
+ }
+ return group->meth->oct2point(group, point, buf, len, ctx);
+ }
+
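
Callers are unaffected by the move of the point2oct/oct2point wrappers into ec_oct.c: with EC_FLAGS_DEFAULT_OCT set on the built-in prime-field methods, the generic ec_GFp_simple_* implementations are dispatched automatically. A round-trip sketch (assumes a build with EC enabled and the P-256 curve available):

#include <stdio.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

int main(void)
	{
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
	const EC_POINT *gen = EC_GROUP_get0_generator(group);
	EC_POINT *copy = EC_POINT_new(group);
	unsigned char buf[1 + 2*32];	/* big enough for any P-256 form */
	size_t len;

	/* serialize the generator, then parse it back */
	len = EC_POINT_point2oct(group, gen, POINT_CONVERSION_COMPRESSED,
		buf, sizeof buf, NULL);
	if (len == 0 || !EC_POINT_oct2point(group, copy, buf, len, NULL))
		return 1;
	printf("round trip ok, %u octets\n", (unsigned)len);

	EC_POINT_free(copy);
	EC_GROUP_free(group);
	return 0;
	}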
diff --git a/crypto/ec/ec_pmeth.c b/crypto/ec/ec_pmeth.c
index f433076..66ee397 100644
--- a/crypto/ec/ec_pmeth.c
+++ b/crypto/ec/ec_pmeth.c
@@ -188,7 +188,7 @@ static int pkey_ec_derive(EVP_PKEY_CTX *ctx, unsigned char *key, size_t *keylen)
pubkey = EC_KEY_get0_public_key(ctx->peerkey->pkey.ec);
- /* NB: unlike PKS#3 DH, if *outlen is less than maximum size this is
+ /* NB: unlike PKCS#3 DH, if *outlen is less than maximum size this is
* not an error, the result is truncated.
*/
@@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
case EVP_PKEY_CTRL_MD:
if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
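
A sketch of the behaviour documented by the corrected PKCS#3 comment: requesting fewer bytes than the field size from EVP_PKEY_derive is not an error, the shared secret is simply truncated. The two EVP_PKEY arguments are assumed to be P-256 keys prepared elsewhere.

#include <openssl/evp.h>

/* 'mine' holds our private key, 'peer' the peer's public key. */
int derive_16_bytes(EVP_PKEY *mine, EVP_PKEY *peer, unsigned char out[16])
	{
	EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(mine, NULL);
	size_t outlen = 16;	/* below the 32-byte field size: no error */
	int ok = 0;

	if (ctx != NULL
		&& EVP_PKEY_derive_init(ctx) > 0
		&& EVP_PKEY_derive_set_peer(ctx, peer) > 0
		&& EVP_PKEY_derive(ctx, out, &outlen) > 0)
		ok = 1;	/* out now holds the first 16 octets of the secret */

	EVP_PKEY_CTX_free(ctx);
	return ok;
	}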
diff --git a/crypto/ec/eck_prn.c b/crypto/ec/eck_prn.c
index 7d3e175..06de8f3 100644
--- a/crypto/ec/eck_prn.c
+++ b/crypto/ec/eck_prn.c
@@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
reason = ERR_R_MALLOC_FAILURE;
goto err;
}
-
+#ifndef OPENSSL_NO_EC2M
if (is_char_two)
{
if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx))
@@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
}
}
else /* prime field */
+#endif
{
if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx))
{
diff --git a/crypto/ec/ecp_mont.c b/crypto/ec/ecp_mont.c
index 9fc4a46..f04f132 100644
--- a/crypto/ec/ecp_mont.c
+++ b/crypto/ec/ecp_mont.c
@@ -63,12 +63,20 @@
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_mont_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_mont_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_mont_group_init,
ec_GFp_mont_group_finish,
@@ -87,9 +95,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -109,6 +115,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_mont_field_set_to_one };
return &ret;
+#endif
}
diff --git a/crypto/ec/ecp_nist.c b/crypto/ec/ecp_nist.c
index 2a5682e..aad2d5f 100644
--- a/crypto/ec/ecp_nist.c
+++ b/crypto/ec/ecp_nist.c
@@ -67,9 +67,17 @@
#include <openssl/obj_mac.h>
#include "ec_lcl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GFp_nist_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_nist_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
int ec_GFp_nist_group_copy(EC_GROUP *dest, const EC_GROUP *src)
diff --git a/crypto/ec/ecp_oct.c b/crypto/ec/ecp_oct.c
new file mode 100644
index 0000000..374a0ee
--- /dev/null
+++ b/crypto/ec/ecp_oct.c
@@ -0,0 +1,433 @@
+/* crypto/ec/ecp_oct.c */
+/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * for the OpenSSL project.
+ * Includes code written by Bodo Moeller for the OpenSSL project.
+*/
+/* ====================================================================
+ * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Portions of this software developed by SUN MICROSYSTEMS, INC.,
+ * and contributed to the OpenSSL project.
+ */
+
+#include <openssl/err.h>
+#include <openssl/symhacks.h>
+
+#include "ec_lcl.h"
+
+int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp1, *tmp2, *x, *y;
+ int ret = 0;
+
+ /* clear error queue*/
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0);
+
+ BN_CTX_start(ctx);
+ tmp1 = BN_CTX_get(ctx);
+ tmp2 = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ /* Recover y. We have a Weierstrass equation
+ * y^2 = x^3 + a*x + b,
+ * so y is one of the square roots of x^3 + a*x + b.
+ */
+
+ /* tmp1 := x^3 */
+ if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
+ if (group->meth->field_decode == 0)
+ {
+ /* field_{sqr,mul} work on standard representation */
+ if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
+ if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
+ if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
+ }
+
+ /* tmp1 := tmp1 + a*x */
+ if (group->a_is_minus3)
+ {
+ if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
+ if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
+ if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
+ if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
+ }
+ else
+ {
+ /* field_mul works on standard representation */
+ if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
+ }
+
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+
+ /* tmp1 := tmp1 + b */
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
+ }
+
+ if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ if (y_bit != BN_is_odd(y))
+ {
+ if (BN_is_zero(y))
+ {
+ int kron;
+
+ kron = BN_kronecker(x, &group->field, ctx);
+ if (kron == -2) goto err;
+
+ if (kron == 1)
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
+ else
+ /* BN_mod_sqrt() should have caught this error (not a square) */
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ goto err;
+ }
+ if (!BN_usub(y, &group->field, y)) goto err;
+ }
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
+
+size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = BN_num_bytes(&group->field);
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
+ buf[0] = form + 1;
+ else
+ buf[0] = form;
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = BN_num_bytes(&group->field);
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
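
ec_GFp_simple_set_compressed_coordinates above recovers y as a square root of x^3 + a*x + b and negates it modulo p if its parity disagrees with y_bit. A caller-level sketch of that recovery through the public wrapper, using the P-256 generator as sample data:

#include <stdio.h>
#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

int main(void)
	{
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
	const EC_POINT *gen = EC_GROUP_get0_generator(group);
	EC_POINT *pt = EC_POINT_new(group);
	BIGNUM *x = BN_new(), *y = BN_new();

	/* keep only x and the parity of y, then rebuild the full point */
	if (!EC_POINT_get_affine_coordinates_GFp(group, gen, x, y, NULL))
		return 1;
	if (!EC_POINT_set_compressed_coordinates_GFp(group, pt, x,
		BN_is_odd(y), NULL))
		return 1;
	printf("%s\n", EC_POINT_cmp(group, pt, gen, NULL) == 0
		? "recovered generator" : "mismatch");

	BN_free(x); BN_free(y);
	EC_POINT_free(pt);
	EC_GROUP_free(group);
	return 0;
	}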
diff --git a/crypto/ec/ecp_smpl.c b/crypto/ec/ecp_smpl.c
index 66a92e2..7cbb321 100644
--- a/crypto/ec/ecp_smpl.c
+++ b/crypto/ec/ecp_smpl.c
@@ -65,11 +65,19 @@
#include <openssl/err.h>
#include <openssl/symhacks.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -633,372 +640,6 @@ int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_P
return ret;
}
-
-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp1, *tmp2, *x, *y;
- int ret = 0;
-
- /* clear error queue*/
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0);
-
- BN_CTX_start(ctx);
- tmp1 = BN_CTX_get(ctx);
- tmp2 = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- /* Recover y. We have a Weierstrass equation
- * y^2 = x^3 + a*x + b,
- * so y is one of the square roots of x^3 + a*x + b.
- */
-
- /* tmp1 := x^3 */
- if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
- if (group->meth->field_decode == 0)
- {
- /* field_{sqr,mul} work on standard representation */
- if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
- if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
- }
- else
- {
- if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
- if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
- }
-
- /* tmp1 := tmp1 + a*x */
- if (group->a_is_minus3)
- {
- if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
- if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
- if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
- if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
- }
- else
- {
- /* field_mul works on standard representation */
- if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
- }
-
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
-
- /* tmp1 := tmp1 + b */
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
- }
-
- if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
-
- if (y_bit != BN_is_odd(y))
- {
- if (BN_is_zero(y))
- {
- int kron;
-
- kron = BN_kronecker(x, &group->field, ctx);
- if (kron == -2) goto err;
-
- if (kron == 1)
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
- else
- /* BN_mod_sqrt() should have cought this error (not a square) */
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- goto err;
- }
- if (!BN_usub(y, &group->field, y)) goto err;
- }
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = BN_num_bytes(&group->field);
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
- buf[0] = form + 1;
- else
- buf[0] = form;
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = BN_num_bytes(&group->field);
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *, BN_CTX *);
diff --git a/crypto/ec/ectest.c b/crypto/ec/ectest.c
index 7509cb9..102eaa9 100644
--- a/crypto/ec/ectest.c
+++ b/crypto/ec/ectest.c
@@ -94,6 +94,7 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
#include <openssl/objects.h>
#include <openssl/rand.h>
#include <openssl/bn.h>
+#include <openssl/opensslconf.h>
#if defined(_MSC_VER) && defined(_MIPS_) && (_MSC_VER/100==12)
/* suppress "too big too optimize" warning */
@@ -107,10 +108,6 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
EXIT(1); \
} while (0)
-void prime_field_tests(void);
-void char2_field_tests(void);
-void internal_curve_test(void);
-
#define TIMING_BASE_PT 0
#define TIMING_RAND_PT 1
#define TIMING_SIMUL 2
@@ -195,8 +192,51 @@ static void timings(EC_GROUP *group, int type, BN_CTX *ctx)
}
#endif
-void prime_field_tests()
- {
+/* test multiplication with group order, long and negative scalars */
+static void group_order_tests(EC_GROUP *group)
+ {
+ BIGNUM *n1, *n2, *order;
+ EC_POINT *P = EC_POINT_new(group);
+ EC_POINT *Q = EC_POINT_new(group);
+ BN_CTX *ctx = BN_CTX_new();
+
+ n1 = BN_new(); n2 = BN_new(); order = BN_new();
+ fprintf(stdout, "verify group order ...");
+ fflush(stdout);
+ if (!EC_GROUP_get_order(group, order, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, ".");
+ fflush(stdout);
+ if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, " ok\n");
+ fprintf(stdout, "long/negative scalar tests ... ");
+ if (!BN_one(n1)) ABORT;
+ /* n1 = 1 - order */
+ if (!BN_sub(n1, n1, order)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n1, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = 1 + order */
+ if (!BN_add(n2, order, BN_value_one())) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = (1 - order) * (1 + order) */
+ if (!BN_mul(n2, n1, n2, ctx)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ fprintf(stdout, "ok\n");
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ BN_free(n1);
+ BN_free(n2);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
+
+static void prime_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -321,21 +361,21 @@ void prime_field_tests()
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "Generator as octect string, compressed form:\n ");
+ fprintf(stdout, "Generator as octet string, compressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_UNCOMPRESSED, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, uncompressed form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, uncompressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_HYBRID, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, hybrid form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, hybrid form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
if (!EC_POINT_get_Jprojective_coordinates_GFp(group, R, x, y, z, ctx)) ABORT;
@@ -381,17 +421,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 160) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_160 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_160, group)) ABORT;
@@ -425,17 +455,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 192) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_192 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_192, group)) ABORT;
@@ -469,17 +489,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 224) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_224 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_224, group)) ABORT;
@@ -514,17 +524,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 256) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_256 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_256, group)) ABORT;
@@ -563,18 +563,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 384) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_384 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_384, group)) ABORT;
@@ -619,18 +609,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 521) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_521 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_521, group)) ABORT;
@@ -659,6 +639,7 @@ void prime_field_tests()
points[2] = Q;
points[3] = Q;
+ if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
if (!BN_add(y, z, BN_value_one())) ABORT;
if (BN_is_odd(y)) ABORT;
if (!BN_rshift1(y, y)) ABORT;
@@ -792,22 +773,14 @@ void prime_field_tests()
fprintf(stdout, "verify degree ..."); \
if (EC_GROUP_get_degree(group) != _degree) ABORT; \
fprintf(stdout, " ok\n"); \
- fprintf(stdout, "verify group order ..."); \
- fflush(stdout); \
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, "."); \
- fflush(stdout); \
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, " ok\n"); \
+ group_order_tests(group); \
if (!(_variable = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT; \
- if (!EC_GROUP_copy(_variable, group)) ABORT;
+ if (!EC_GROUP_copy(_variable, group)) ABORT; \
-void char2_field_tests()
- {
+#ifndef OPENSSL_NO_EC2M
+
+static void char2_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -1239,8 +1212,9 @@ void char2_field_tests()
if (C2_B571) EC_GROUP_free(C2_B571);
}
+#endif
-void internal_curve_test(void)
+static void internal_curve_test(void)
{
EC_builtin_curve *curves = NULL;
size_t crv_len = 0, n = 0;
@@ -1287,13 +1261,189 @@ void internal_curve_test(void)
EC_GROUP_free(group);
}
if (ok)
- fprintf(stdout, " ok\n");
+ fprintf(stdout, " ok\n\n");
else
- fprintf(stdout, " failed\n");
+ {
+ fprintf(stdout, " failed\n\n");
+ ABORT;
+ }
OPENSSL_free(curves);
return;
}
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* nistp_test_params contains magic numbers for testing our optimized
+ * implementations of several NIST curves with characteristic > 3. */
+struct nistp_test_params
+ {
+ const EC_METHOD* (*meth) ();
+ int degree;
+ /* Qx, Qy and D are taken from
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/ECDSA_Prime.pdf
+	 * Otherwise, values are standard curve parameters from FIPS 186-3 */
+ const char *p, *a, *b, *Qx, *Qy, *Gx, *Gy, *order, *d;
+ };
+
+static const struct nistp_test_params nistp_tests_params[] =
+ {
+ {
+ /* P-224 */
+ EC_GFp_nistp224_method,
+ 224,
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000000000000000001", /* p */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFE", /* a */
+ "B4050A850C04B3ABF54132565044B0B7D7BFD8BA270B39432355FFB4", /* b */
+ "E84FB0B8E7000CB657D7973CF6B42ED78B301674276DF744AF130B3E", /* Qx */
+ "4376675C6FC5612C21A0FF2D2A89D2987DF7A2BC52183B5982298555", /* Qy */
+ "B70E0CBD6BB4BF7F321390B94A03C1D356C21122343280D6115C1D21", /* Gx */
+ "BD376388B5F723FB4C22DFE6CD4375A05A07476444D5819985007E34", /* Gy */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFF16A2E0B8F03E13DD29455C5C2A3D", /* order */
+ "3F0C488E987C80BE0FEE521F8D90BE6034EC69AE11CA72AA777481E8", /* d */
+ },
+ {
+ /* P-256 */
+ EC_GFp_nistp256_method,
+ 256,
+ "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", /* p */
+ "ffffffff00000001000000000000000000000000fffffffffffffffffffffffc", /* a */
+ "5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", /* b */
+ "b7e08afdfe94bad3f1dc8c734798ba1c62b3a0ad1e9ea2a38201cd0889bc7a19", /* Qx */
+ "3603f747959dbf7a4bb226e41928729063adc7ae43529e61b563bbc606cc5e09", /* Qy */
+ "6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", /* Gx */
+ "4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", /* Gy */
+ "ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", /* order */
+ "c477f9f65c22cce20657faa5b2d1d8122336f851a508a1ed04e479c34985bf96", /* d */
+ },
+ {
+ /* P-521 */
+ EC_GFp_nistp521_method,
+ 521,
+ "1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", /* p */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffc", /* a */
+ "051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00", /* b */
+ "0098e91eef9a68452822309c52fab453f5f117c1da8ed796b255e9ab8f6410cca16e59df403a6bdc6ca467a37056b1e54b3005d8ac030decfeb68df18b171885d5c4", /* Qx */
+ "0164350c321aecfc1cca1ba4364c9b15656150b4b78d6a48d7d28e7f31985ef17be8554376b72900712c4b83ad668327231526e313f5f092999a4632fd50d946bc2e", /* Qy */
+ "c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66", /* Gx */
+ "11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650", /* Gy */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409", /* order */
+ "0100085f47b8e1b8b11b7eb33028c0b2888e304bfc98501955b45bba1478dc184eeedf09b86a5f7c21994406072787205e69a63709fe35aa93ba333514b24f961722", /* d */
+ },
+ };
+
+void nistp_single_test(const struct nistp_test_params *test)
+ {
+ BN_CTX *ctx;
+ BIGNUM *p, *a, *b, *x, *y, *n, *m, *order;
+ EC_GROUP *NISTP;
+ EC_POINT *G, *P, *Q, *Q_CHECK;
+
+ fprintf(stdout, "\nNIST curve P-%d (optimised implementation):\n", test->degree);
+ ctx = BN_CTX_new();
+ p = BN_new();
+ a = BN_new();
+ b = BN_new();
+ x = BN_new(); y = BN_new();
+ m = BN_new(); n = BN_new(); order = BN_new();
+
+ NISTP = EC_GROUP_new(test->meth());
+ if(!NISTP) ABORT;
+ if (!BN_hex2bn(&p, test->p)) ABORT;
+ if (1 != BN_is_prime_ex(p, BN_prime_checks, ctx, NULL)) ABORT;
+ if (!BN_hex2bn(&a, test->a)) ABORT;
+ if (!BN_hex2bn(&b, test->b)) ABORT;
+ if (!EC_GROUP_set_curve_GFp(NISTP, p, a, b, ctx)) ABORT;
+ G = EC_POINT_new(NISTP);
+ P = EC_POINT_new(NISTP);
+ Q = EC_POINT_new(NISTP);
+ Q_CHECK = EC_POINT_new(NISTP);
+ if(!BN_hex2bn(&x, test->Qx)) ABORT;
+ if(!BN_hex2bn(&y, test->Qy)) ABORT;
+ if(!EC_POINT_set_affine_coordinates_GFp(NISTP, Q_CHECK, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&x, test->Gx)) ABORT;
+ if (!BN_hex2bn(&y, test->Gy)) ABORT;
+ if (!EC_POINT_set_affine_coordinates_GFp(NISTP, G, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&order, test->order)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+
+ fprintf(stdout, "verify degree ... ");
+ if (EC_GROUP_get_degree(NISTP) != test->degree) ABORT;
+ fprintf(stdout, "ok\n");
+
+ fprintf(stdout, "NIST test vectors ... ");
+ if (!BN_hex2bn(&n, test->d)) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* set generator to P = 2*G, where G is the standard generator */
+ if (!EC_POINT_dbl(NISTP, P, G, ctx)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, P, order, BN_value_one())) ABORT;
+ /* set the scalar to m=n/2, where n is the NIST test scalar */
+ if (!BN_rshift(m, n, 1)) ABORT;
+
+ /* test the non-standard generator */
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* now repeat all tests with precomputation */
+ if (!EC_GROUP_precompute_mult(NISTP, ctx)) ABORT;
+
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* reset generator */
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ fprintf(stdout, "ok\n");
+ group_order_tests(NISTP);
+#if 0
+ timings(NISTP, TIMING_BASE_PT, ctx);
+ timings(NISTP, TIMING_RAND_PT, ctx);
+#endif
+ EC_GROUP_free(NISTP);
+ EC_POINT_free(G);
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ EC_POINT_free(Q_CHECK);
+ BN_free(n);
+ BN_free(m);
+ BN_free(p);
+ BN_free(a);
+ BN_free(b);
+ BN_free(x);
+ BN_free(y);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
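+
+/* Note on the pairing of checks above: EC_POINT_mul(group, Q, n, NULL,
+ * NULL, ctx) computes n*generator and exercises the fixed-point code
+ * path, while EC_POINT_mul(group, Q, NULL, P, n, ctx) computes n*P for
+ * an arbitrary point P. The optimised nistp methods implement these
+ * paths separately, so every scenario (standard generator, doubled
+ * generator, with and without precomputation) runs both. */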
+
+void nistp_tests()
+ {
+ unsigned i;
+
+ for (i = 0; i < sizeof(nistp_tests_params) / sizeof(struct nistp_test_params); i++)
+ {
+ nistp_single_test(&nistp_tests_params[i]);
+ }
+ }
+#endif
+
static const char rnd_seed[] = "string to make the random number generator think it has entropy";
int main(int argc, char *argv[])
@@ -1317,7 +1467,12 @@ int main(int argc, char *argv[])
prime_field_tests();
puts("");
+#ifndef OPENSSL_NO_EC2M
char2_field_tests();
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ nistp_tests();
+#endif
/* test the internal curves */
internal_curve_test();
diff --git a/crypto/ecdh/ecdh.h b/crypto/ecdh/ecdh.h
index b4b58ee..8887102 100644
--- a/crypto/ecdh/ecdh.h
+++ b/crypto/ecdh/ecdh.h
@@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void);
/* Error codes for the ECDH functions. */
/* Function codes. */
+#define ECDH_F_ECDH_CHECK 102
#define ECDH_F_ECDH_COMPUTE_KEY 100
#define ECDH_F_ECDH_DATA_NEW_METHOD 101
/* Reason codes. */
#define ECDH_R_KDF_FAILED 102
+#define ECDH_R_NON_FIPS_METHOD 103
#define ECDH_R_NO_PRIVATE_VALUE 100
#define ECDH_R_POINT_ARITHMETIC_FAILURE 101
diff --git a/crypto/ecdh/ecdhtest.c b/crypto/ecdh/ecdhtest.c
index 212a87e..823d7ba 100644
--- a/crypto/ecdh/ecdhtest.c
+++ b/crypto/ecdh/ecdhtest.c
@@ -158,11 +158,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 1=");
BN_print(out,a->priv_key);
@@ -183,11 +185,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 2=");
@@ -324,6 +328,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_X9_62_prime256v1, "NIST Prime-Curve P-256", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp384r1, "NIST Prime-Curve P-384", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp521r1, "NIST Prime-Curve P-521", ctx, out)) goto err;
+#ifndef OPENSSL_NO_EC2M
/* NIST BINARY CURVES TESTS */
if (!test_ecdh_curve(NID_sect163k1, "NIST Binary-Curve K-163", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect163r2, "NIST Binary-Curve B-163", ctx, out)) goto err;
@@ -335,6 +340,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_sect409r1, "NIST Binary-Curve B-409", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571k1, "NIST Binary-Curve K-571", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571r1, "NIST Binary-Curve B-571", ctx, out)) goto err;
+#endif
ret = 0;
diff --git a/crypto/ecdh/ech_err.c b/crypto/ecdh/ech_err.c
index 6f4b0c9..3bd2473 100644
--- a/crypto/ecdh/ech_err.c
+++ b/crypto/ecdh/ech_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdh/ech_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDH_str_functs[]=
{
+{ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"},
{ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"},
{ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"},
{0,NULL}
@@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]=
static ERR_STRING_DATA ECDH_str_reasons[]=
{
{ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"},
+{ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"},
{0,NULL}
diff --git a/crypto/ecdh/ech_key.c b/crypto/ecdh/ech_key.c
index f44da92..2988899 100644
--- a/crypto/ecdh/ech_key.c
+++ b/crypto/ecdh/ech_key.c
@@ -68,9 +68,6 @@
*/
#include "ech_locl.h"
-#ifndef OPENSSL_NO_ENGINE
-#include <openssl/engine.h>
-#endif
int ECDH_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
EC_KEY *eckey,
diff --git a/crypto/ecdh/ech_lib.c b/crypto/ecdh/ech_lib.c
index 4d8ea03..0644431 100644
--- a/crypto/ecdh/ech_lib.c
+++ b/crypto/ecdh/ech_lib.c
@@ -73,6 +73,9 @@
#include <openssl/engine.h>
#endif
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT;
@@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth)
const ECDH_METHOD *ECDH_get_default_method(void)
{
if(!default_ECDH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdh_openssl();
+ else
+ return ECDH_OpenSSL();
+#else
default_ECDH_method = ECDH_OpenSSL();
+#endif
+ }
return default_ECDH_method;
}
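/* A subtlety in the change above (an observation, not from the patch
 * comments): when OPENSSL_FIPS is defined the method is selected on
 * every call and never cached in default_ECDH_method, presumably so
 * that entering or leaving FIPS mode at runtime is reflected by the
 * next call. */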
@@ -210,11 +222,26 @@ ECDH_DATA *ecdh_check(EC_KEY *key)
ecdh_data = (ECDH_DATA *)ecdh_data_new();
if (ecdh_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
- ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
+ ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdh_data_free(ecdh_data);
+ ecdh_data = (ECDH_DATA *)data;
+ }
}
else
ecdh_data = (ECDH_DATA *)data;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdh_data;
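The lost-race handling above is a general lazy-initialization pattern:
allocate, try to install, and defer to whoever got there first.
EC_KEY_insert_key_method_data plays the compare-and-swap role; below is a
minimal standalone sketch of the same shape using C11 atomics (an
illustration only, not OpenSSL API):

#include <stdatomic.h>
#include <stdlib.h>

typedef struct data_st { int x; } DATA;

static DATA *get_or_install(_Atomic(DATA *) *slot)
	{
	DATA *cur = atomic_load(slot);
	DATA *ours, *expected = NULL;

	if (cur != NULL)
		return cur;	/* fast path: already installed */
	ours = calloc(1, sizeof(*ours));
	if (ours == NULL)
		return NULL;
	if (atomic_compare_exchange_strong(slot, &expected, ours))
		return ours;	/* we won the race */
	free(ours);		/* another thread won; use its data */
	return expected;
	}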
diff --git a/crypto/ecdh/ech_locl.h b/crypto/ecdh/ech_locl.h
index f658526..f6cad6a 100644
--- a/crypto/ecdh/ech_locl.h
+++ b/crypto/ecdh/ech_locl.h
@@ -75,6 +75,14 @@ struct ecdh_method
char *app_data;
};
+/* If this flag is set the ECDH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDH_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdh_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
diff --git a/crypto/ecdh/ech_ossl.c b/crypto/ecdh/ech_ossl.c
index 2a40ff1..4a30628 100644
--- a/crypto/ecdh/ech_ossl.c
+++ b/crypto/ecdh/ech_ossl.c
@@ -157,6 +157,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group, tmp, x, y, ctx))
@@ -165,6 +166,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#endif
buflen = (EC_GROUP_get_degree(group) + 7)/8;
len = BN_num_bytes(x);
diff --git a/crypto/ecdsa/ecdsa.h b/crypto/ecdsa/ecdsa.h
index e61c539..7fb5254 100644
--- a/crypto/ecdsa/ecdsa.h
+++ b/crypto/ecdsa/ecdsa.h
@@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void);
/* Error codes for the ECDSA functions. */
/* Function codes. */
+#define ECDSA_F_ECDSA_CHECK 104
#define ECDSA_F_ECDSA_DATA_NEW_METHOD 100
#define ECDSA_F_ECDSA_DO_SIGN 101
#define ECDSA_F_ECDSA_DO_VERIFY 102
@@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void);
#define ECDSA_R_ERR_EC_LIB 102
#define ECDSA_R_MISSING_PARAMETERS 103
#define ECDSA_R_NEED_NEW_SETUP_VALUES 106
+#define ECDSA_R_NON_FIPS_METHOD 107
#define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104
#define ECDSA_R_SIGNATURE_MALLOC_FAILED 105
diff --git a/crypto/ecdsa/ecdsatest.c b/crypto/ecdsa/ecdsatest.c
index 26a4a9e..537bb30 100644
--- a/crypto/ecdsa/ecdsatest.c
+++ b/crypto/ecdsa/ecdsatest.c
@@ -262,6 +262,7 @@ int x9_62_tests(BIO *out)
"3238135532097973577080787768312505059318910517550078427819"
"78505179448783"))
goto x962_err;
+#ifndef OPENSSL_NO_EC2M
if (!x9_62_test_internal(out, NID_X9_62_c2tnb191v1,
"87194383164871543355722284926904419997237591535066528048",
"308992691965804947361541664549085895292153777025772063598"))
@@ -272,7 +273,7 @@ int x9_62_tests(BIO *out)
"1970303740007316867383349976549972270528498040721988191026"
"49413465737174"))
goto x962_err;
-
+#endif
ret = 1;
x962_err:
if (!restore_rand())
@@ -286,9 +287,13 @@ int test_builtin(BIO *out)
size_t crv_len = 0, n = 0;
EC_KEY *eckey = NULL, *wrong_eckey = NULL;
EC_GROUP *group;
+ ECDSA_SIG *ecdsa_sig = NULL;
unsigned char digest[20], wrong_digest[20];
- unsigned char *signature = NULL;
- unsigned int sig_len;
+ unsigned char *signature = NULL;
+ const unsigned char *sig_ptr;
+ unsigned char *sig_ptr2;
+ unsigned char *raw_buf = NULL;
+ unsigned int sig_len, degree, r_len, s_len, bn_len, buf_len;
int nid, ret = 0;
/* fill digest values with some random data */
@@ -338,7 +343,8 @@ int test_builtin(BIO *out)
if (EC_KEY_set_group(eckey, group) == 0)
goto builtin_err;
EC_GROUP_free(group);
- if (EC_GROUP_get_degree(EC_KEY_get0_group(eckey)) < 160)
+ degree = EC_GROUP_get_degree(EC_KEY_get0_group(eckey));
+ if (degree < 160)
/* drop the curve */
{
EC_KEY_free(eckey);
@@ -414,26 +420,89 @@ int test_builtin(BIO *out)
}
BIO_printf(out, ".");
(void)BIO_flush(out);
- /* modify a single byte of the signature */
- offset = signature[10] % sig_len;
- dirt = signature[11];
- signature[offset] ^= dirt ? dirt : 1;
+ /* wrong length */
+ if (ECDSA_verify(0, digest, 20, signature, sig_len - 1,
+ eckey) == 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ BIO_printf(out, ".");
+ (void)BIO_flush(out);
+
+ /* Modify a single byte of the signature: to ensure we don't
+ * garble the ASN1 structure, we read the raw signature and
+ * modify a byte in one of the bignums directly. */
+ sig_ptr = signature;
+ if ((ecdsa_sig = d2i_ECDSA_SIG(NULL, &sig_ptr, sig_len)) == NULL)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+
+ /* Store the two BIGNUMs in raw_buf. */
+ r_len = BN_num_bytes(ecdsa_sig->r);
+ s_len = BN_num_bytes(ecdsa_sig->s);
+ bn_len = (degree + 7) / 8;
+ if ((r_len > bn_len) || (s_len > bn_len))
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ buf_len = 2 * bn_len;
+ if ((raw_buf = OPENSSL_malloc(buf_len)) == NULL)
+ goto builtin_err;
+ /* Pad the bignums with leading zeroes. */
+ memset(raw_buf, 0, buf_len);
+ BN_bn2bin(ecdsa_sig->r, raw_buf + bn_len - r_len);
+ BN_bn2bin(ecdsa_sig->s, raw_buf + buf_len - s_len);
+
+ /* Modify a single byte in the buffer. */
+ offset = raw_buf[10] % buf_len;
+ dirt = raw_buf[11] ? raw_buf[11] : 1;
+ raw_buf[offset] ^= dirt;
+ /* Now read the BIGNUMs back in from raw_buf. */
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) == 1)
{
BIO_printf(out, " failed\n");
goto builtin_err;
}
+ /* Sanity check: undo the modification and verify signature. */
+ raw_buf[offset] ^= dirt;
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
+ if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) != 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
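+	/* Why the test does not just flip a byte of the DER blob: an ECDSA
+	 * signature is SEQUENCE { INTEGER r, INTEGER s }, so a random flip
+	 * can land on a tag or length octet and make d2i_ECDSA_SIG() reject
+	 * the encoding outright -- a parse failure, not the "well-formed
+	 * but wrong signature" case being tested. Flipping inside r or s
+	 * and re-encoding with i2d_ECDSA_SIG() keeps the ASN.1 intact. */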
BIO_printf(out, ".");
(void)BIO_flush(out);
BIO_printf(out, " ok\n");
/* cleanup */
+ /* clean bogus errors */
+ ERR_clear_error();
OPENSSL_free(signature);
signature = NULL;
EC_KEY_free(eckey);
eckey = NULL;
EC_KEY_free(wrong_eckey);
wrong_eckey = NULL;
+ ECDSA_SIG_free(ecdsa_sig);
+ ecdsa_sig = NULL;
+ OPENSSL_free(raw_buf);
+ raw_buf = NULL;
}
ret = 1;
@@ -442,8 +511,12 @@ int test_builtin(BIO *out)
EC_KEY_free(eckey);
if (wrong_eckey)
EC_KEY_free(wrong_eckey);
+ if (ecdsa_sig)
+ ECDSA_SIG_free(ecdsa_sig);
if (signature)
OPENSSL_free(signature);
+ if (raw_buf)
+ OPENSSL_free(raw_buf);
if (curves)
OPENSSL_free(curves);
diff --git a/crypto/ecdsa/ecs_err.c b/crypto/ecdsa/ecs_err.c
index 98e38d5..81542e6 100644
--- a/crypto/ecdsa/ecs_err.c
+++ b/crypto/ecdsa/ecs_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdsa/ecs_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDSA_str_functs[]=
{
+{ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"},
{ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]=
{ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"},
{ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"},
+{ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"},
{ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"},
{0,NULL}
diff --git a/crypto/ecdsa/ecs_lib.c b/crypto/ecdsa/ecs_lib.c
index 2ebae3a..814a6bf 100644
--- a/crypto/ecdsa/ecs_lib.c
+++ b/crypto/ecdsa/ecs_lib.c
@@ -60,6 +60,9 @@
#endif
#include <openssl/err.h>
#include <openssl/bn.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT;
@@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth)
const ECDSA_METHOD *ECDSA_get_default_method(void)
{
if(!default_ECDSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdsa_openssl();
+ else
+ return ECDSA_OpenSSL();
+#else
default_ECDSA_method = ECDSA_OpenSSL();
+#endif
+ }
return default_ECDSA_method;
}
@@ -188,12 +200,26 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key)
ecdsa_data = (ECDSA_DATA *)ecdsa_data_new();
if (ecdsa_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
- ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
+ ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdsa_data_free(ecdsa_data);
+ ecdsa_data = (ECDSA_DATA *)data;
+ }
}
else
ecdsa_data = (ECDSA_DATA *)data;
-
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdsa_data;
}
diff --git a/crypto/ecdsa/ecs_locl.h b/crypto/ecdsa/ecs_locl.h
index 3a69a84..cb3be13 100644
--- a/crypto/ecdsa/ecs_locl.h
+++ b/crypto/ecdsa/ecs_locl.h
@@ -82,6 +82,14 @@ struct ecdsa_method
char *app_data;
};
+/* If this flag is set the ECDSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDSA_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdsa_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
diff --git a/crypto/ecdsa/ecs_ossl.c b/crypto/ecdsa/ecs_ossl.c
index 1bbf328..7725935 100644
--- a/crypto/ecdsa/ecs_ossl.c
+++ b/crypto/ecdsa/ecs_ossl.c
@@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#endif
if (!BN_nnmod(r, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
@@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
-
+#endif
if (!BN_nnmod(u1, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
diff --git a/crypto/engine/eng_all.c b/crypto/engine/eng_all.c
index 22c1204..6093376 100644
--- a/crypto/engine/eng_all.c
+++ b/crypto/engine/eng_all.c
@@ -61,6 +61,8 @@
void ENGINE_load_builtin_engines(void)
{
+ /* Some ENGINEs need this */
+ OPENSSL_cpuid_setup();
#if 0
/* There's no longer any need for an "openssl" ENGINE unless, one day,
* it is the *only* way for standard builtin implementations to be
@@ -70,6 +72,12 @@ void ENGINE_load_builtin_engines(void)
#endif
#if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
ENGINE_load_cryptodev();
+#endif
+#ifndef OPENSSL_NO_RSAX
+ ENGINE_load_rsax();
+#endif
+#ifndef OPENSSL_NO_RDRAND
+ ENGINE_load_rdrand();
#endif
ENGINE_load_dynamic();
#ifndef OPENSSL_NO_STATIC_ENGINE
@@ -112,6 +120,7 @@ void ENGINE_load_builtin_engines(void)
ENGINE_load_capi();
#endif
#endif
+ ENGINE_register_all_complete();
}
#if defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)
diff --git a/crypto/engine/eng_cryptodev.c b/crypto/engine/eng_cryptodev.c
index 52f4ca3..5a715ac 100644
--- a/crypto/engine/eng_cryptodev.c
+++ b/crypto/engine/eng_cryptodev.c
@@ -79,8 +79,6 @@ struct dev_crypto_state {
unsigned char digest_res[HASH_MAX_LEN];
char *mac_data;
int mac_len;
-
- int copy;
#endif
};
@@ -200,6 +198,7 @@ get_dev_crypto(void)
if ((fd = open_dev_crypto()) == -1)
return (-1);
+#ifndef CRIOGET_NOT_NEEDED
if (ioctl(fd, CRIOGET, &retfd) == -1)
return (-1);
@@ -208,9 +207,19 @@ get_dev_crypto(void)
close(retfd);
return (-1);
}
+#else
+ retfd = fd;
+#endif
return (retfd);
}
+static void put_dev_crypto(int fd)
+{
+#ifndef CRIOGET_NOT_NEEDED
+ close(fd);
+#endif
+}
+
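+/* Background on CRIOGET_NOT_NEEDED (context, not from the patch itself):
+ * BSD-style /dev/crypto expects callers to clone a per-session fd with
+ * the CRIOGET ioctl, whereas cryptodev-linux style drivers have no
+ * CRIOGET and use the opened device fd directly. The get_dev_crypto()/
+ * put_dev_crypto() pair keeps open/close symmetric in both modes. */
+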
/* Caching version for asym operations */
static int
get_asym_dev_crypto(void)
@@ -252,7 +261,7 @@ get_cryptodev_ciphers(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = ciphers[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -291,7 +300,7 @@ get_cryptodev_digests(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = digests[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -436,7 +445,7 @@ cryptodev_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
sess->cipher = cipher;
if (ioctl(state->d_fd, CIOCGSESSION, sess) == -1) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (0);
}
@@ -473,7 +482,7 @@ cryptodev_cleanup(EVP_CIPHER_CTX *ctx)
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -686,7 +695,7 @@ static int cryptodev_digest_init(EVP_MD_CTX *ctx)
sess->mac = digest;
if (ioctl(state->d_fd, CIOCGSESSION, sess) < 0) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
printf("cryptodev_digest_init: Open session failed\n");
return (0);
@@ -758,14 +767,12 @@ static int cryptodev_digest_final(EVP_MD_CTX *ctx, unsigned char *md)
if (! (ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) ) {
/* if application doesn't support one buffer */
memset(&cryp, 0, sizeof(cryp));
-
cryp.ses = sess->ses;
cryp.flags = 0;
cryp.len = state->mac_len;
cryp.src = state->mac_data;
cryp.dst = NULL;
cryp.mac = (caddr_t)md;
-
if (ioctl(state->d_fd, CIOCCRYPT, &cryp) < 0) {
printf("cryptodev_digest_final: digest failed\n");
return (0);
@@ -786,6 +793,9 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
struct dev_crypto_state *state = ctx->md_data;
struct session_op *sess = &state->d_sess;
+ if (state == NULL)
+ return 0;
+
if (state->d_fd < 0) {
printf("cryptodev_digest_cleanup: illegal input\n");
return (0);
@@ -797,16 +807,13 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
state->mac_len = 0;
}
- if (state->copy)
- return 1;
-
if (ioctl(state->d_fd, CIOCFSESSION, &sess->ses) < 0) {
printf("cryptodev_digest_cleanup: failed to close session\n");
ret = 0;
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -816,15 +823,39 @@ static int cryptodev_digest_copy(EVP_MD_CTX *to,const EVP_MD_CTX *from)
{
struct dev_crypto_state *fstate = from->md_data;
struct dev_crypto_state *dstate = to->md_data;
+ struct session_op *sess;
+ int digest;
- memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+ if (dstate == NULL || fstate == NULL)
+ return 1;
- if (fstate->mac_len != 0) {
- dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
- memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+
+ sess = &dstate->d_sess;
+
+ digest = digest_nid_to_cryptodev(to->digest->type);
+
+ sess->mackey = dstate->dummy_mac_key;
+ sess->mackeylen = digest_key_length(to->digest->type);
+ sess->mac = digest;
+
+ dstate->d_fd = get_dev_crypto();
+
+ if (ioctl(dstate->d_fd, CIOCGSESSION, sess) < 0) {
+ put_dev_crypto(dstate->d_fd);
+ dstate->d_fd = -1;
+ printf("cryptodev_digest_init: Open session failed\n");
+ return (0);
}
- dstate->copy = 1;
+ if (fstate->mac_len != 0) {
+ if (fstate->mac_data != NULL)
+ {
+ dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
+ memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ dstate->mac_len = fstate->mac_len;
+ }
+ }
return 1;
}
@@ -1347,11 +1378,11 @@ ENGINE_load_cryptodev(void)
* find out what asymmetric crypto algorithms we support
*/
if (ioctl(fd, CIOCASYMFEAT, &cryptodev_asymfeat) == -1) {
- close(fd);
+ put_dev_crypto(fd);
ENGINE_free(engine);
return;
}
- close(fd);
+ put_dev_crypto(fd);
if (!ENGINE_set_id(engine, "cryptodev") ||
!ENGINE_set_name(engine, "BSD cryptodev engine") ||
diff --git a/crypto/engine/eng_dyn.c b/crypto/engine/eng_dyn.c
index 807da7a..8fb8634 100644
--- a/crypto/engine/eng_dyn.c
+++ b/crypto/engine/eng_dyn.c
@@ -408,7 +408,7 @@ static int int_load(dynamic_data_ctx *ctx)
int num, loop;
/* Unless told not to, try a direct load */
if((ctx->dir_load != 2) && (DSO_load(ctx->dynamic_dso,
- ctx->DYNAMIC_LIBNAME, NULL, 0)) != NULL)
+ ctx->DYNAMIC_LIBNAME, NULL, 0) != NULL))
return 1;
/* If we're not allowed to use 'dirs' or we have none, fail */
if(!ctx->dir_load || (num = sk_OPENSSL_STRING_num(ctx->dirs)) < 1)
@@ -423,6 +423,9 @@ static int int_load(dynamic_data_ctx *ctx)
{
/* Found what we're looking for */
OPENSSL_free(merge);
+ /* Previous failed loop iterations, if any, will have resulted in
+ * errors. Clear them out before returning success. */
+ ERR_clear_error();
return 1;
}
OPENSSL_free(merge);
diff --git a/crypto/engine/eng_fat.c b/crypto/engine/eng_fat.c
index db66e62..789b8d5 100644
--- a/crypto/engine/eng_fat.c
+++ b/crypto/engine/eng_fat.c
@@ -176,6 +176,7 @@ int ENGINE_register_all_complete(void)
ENGINE *e;
for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
- ENGINE_register_complete(e);
+ if (!(e->flags & ENGINE_FLAGS_NO_REGISTER_ALL))
+ ENGINE_register_complete(e);
return 1;
}
diff --git a/crypto/engine/engine.h b/crypto/engine/engine.h
index 943aeae..f8be497 100644
--- a/crypto/engine/engine.h
+++ b/crypto/engine/engine.h
@@ -141,6 +141,13 @@ extern "C" {
* the existing ENGINE's structural reference count. */
#define ENGINE_FLAGS_BY_ID_COPY (int)0x0004
+/* This flag is for an ENGINE that does not want its methods registered as
+ * part of ENGINE_register_all_complete(), for example if the methods are
+ * not usable as default methods.
+ */
+
+#define ENGINE_FLAGS_NO_REGISTER_ALL (int)0x0008
+
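+/* Usage sketch (an engine author opting out of blanket registration;
+ * illustrative, not part of this change):
+ *
+ *     ENGINE *e = ENGINE_new();
+ *     ENGINE_set_flags(e, ENGINE_FLAGS_NO_REGISTER_ALL);
+ *
+ * ENGINE_register_all_complete() will then skip e, per the eng_fat.c
+ * hunk above. */
+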
/* ENGINEs can support their own command types, and these flags are used in
* ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each
* command expects. Currently only numeric and string input is supported. If a
@@ -344,6 +351,8 @@ void ENGINE_load_gost(void);
#endif
#endif
void ENGINE_load_cryptodev(void);
+void ENGINE_load_rsax(void);
+void ENGINE_load_rdrand(void);
void ENGINE_load_builtin_engines(void);
/* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation
diff --git a/crypto/engine/tb_asnmth.c b/crypto/engine/tb_asnmth.c
new file mode 100644
index 0000000..7509033
--- /dev/null
+++ b/crypto/engine/tb_asnmth.c
@@ -0,0 +1,246 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include "asn1_locl.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_asn1_meth_engine(), the
+ * function that is used by EVP to hook in pkey_asn1_meth code and cache
+ * defaults (etc), will display brief debugging summaries to stderr with the
+ * 'nid'. */
+/* #define ENGINE_PKEY_ASN1_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_asn1_meth_table = NULL;
+
+void ENGINE_unregister_pkey_asn1_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_asn1_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_asn1_meths(void)
+ {
+ engine_table_cleanup(&pkey_asn1_meth_table);
+ }
+
+int ENGINE_register_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_asn1_meths(void)
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_asn1_meths(e);
+ }
+
+int ENGINE_set_default_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (ie. try to get a functional reference from the tabled structural
+ * references) for a given pkey_asn1_meth 'nid' */
+ENGINE *ENGINE_get_pkey_asn1_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_asn1_meth_table, nid);
+ }
+
+/* Obtains a pkey_asn1_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_ASN1_METHOD *ret;
+ ENGINE_PKEY_ASN1_METHS_PTR fn = ENGINE_get_pkey_asn1_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_ASN1_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_asn1_meth callback from an ENGINE structure */
+ENGINE_PKEY_ASN1_METHS_PTR ENGINE_get_pkey_asn1_meths(const ENGINE *e)
+ {
+ return e->pkey_asn1_meths;
+ }
+
+/* Sets the pkey_asn1_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_asn1_meths(ENGINE *e, ENGINE_PKEY_ASN1_METHS_PTR f)
+ {
+ e->pkey_asn1_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_ASN1_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_asn1_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_ASN1_METHOD *pkm;
+ if (e->pkey_asn1_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_asn1_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_asn1_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_asn1_free(pkm);
+ }
+ }
+ }
+ }
+
+/* Find a method based on a string. This does a linear search through
+ * all implemented algorithms. This is OK in practice because only
+ * a small number of algorithms are likely to be implemented in an engine
+ * and it is not used for speed-critical operations.
+ */
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth_str(ENGINE *e,
+ const char *str, int len)
+ {
+ int i, nidcount;
+ const int *nids;
+ EVP_PKEY_ASN1_METHOD *ameth;
+ if (!e->pkey_asn1_meths)
+ return NULL;
+ if (len == -1)
+ len = strlen(str);
+ nidcount = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ for (i = 0; i < nidcount; i++)
+ {
+ e->pkey_asn1_meths(e, &ameth, NULL, nids[i]);
+ if (((int)strlen(ameth->pem_str) == len) &&
+ !strncasecmp(ameth->pem_str, str, len))
+ return ameth;
+ }
+ return NULL;
+ }
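+
+/* Usage sketch (an engine-provided algorithm looked up by its PEM
+ * string; "MYALG" is a hypothetical pem_str, illustration only):
+ *
+ *     const EVP_PKEY_ASN1_METHOD *ameth =
+ *         ENGINE_get_pkey_asn1_meth_str(e, "MYALG", -1);
+ *     if (ameth != NULL)
+ *         ... use ameth ...
+ */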
+
+typedef struct
+ {
+ ENGINE *e;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ const char *str;
+ int len;
+ } ENGINE_FIND_STR;
+
+static void look_str_cb(int nid, STACK_OF(ENGINE) *sk, ENGINE *def, void *arg)
+ {
+ ENGINE_FIND_STR *lk = arg;
+ int i;
+ if (lk->ameth)
+ return;
+ for (i = 0; i < sk_ENGINE_num(sk); i++)
+ {
+ ENGINE *e = sk_ENGINE_value(sk, i);
+ EVP_PKEY_ASN1_METHOD *ameth;
+ e->pkey_asn1_meths(e, &ameth, NULL, nid);
+ if (((int)strlen(ameth->pem_str) == lk->len) &&
+ !strncasecmp(ameth->pem_str, lk->str, lk->len))
+ {
+ lk->e = e;
+ lk->ameth = ameth;
+ return;
+ }
+ }
+ }
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_pkey_asn1_find_str(ENGINE **pe,
+ const char *str, int len)
+ {
+ ENGINE_FIND_STR fstr;
+ fstr.e = NULL;
+ fstr.ameth = NULL;
+ fstr.str = str;
+ fstr.len = len;
+ CRYPTO_w_lock(CRYPTO_LOCK_ENGINE);
+ engine_table_doall(pkey_asn1_meth_table, look_str_cb, &fstr);
+ /* If found obtain a structural reference to engine */
+ if (fstr.e)
+ {
+ fstr.e->struct_ref++;
+ engine_ref_debug(fstr.e, 0, 1)
+ }
+ *pe = fstr.e;
+ CRYPTO_w_unlock(CRYPTO_LOCK_ENGINE);
+ return fstr.ameth;
+ }
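Since ENGINE_pkey_asn1_find_str() bumps struct_ref on the engine it
returns, the caller owns a structural reference and must release it. A
minimal usage sketch ("MYALG" is again a hypothetical pem_str):

	ENGINE *e = NULL;
	const EVP_PKEY_ASN1_METHOD *ameth;

	ameth = ENGINE_pkey_asn1_find_str(&e, "MYALG", -1);
	if (ameth != NULL)
		{
		/* ... use ameth ... */
		}
	if (e != NULL)
		ENGINE_free(e);	/* release the structural reference */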
diff --git a/crypto/engine/tb_pkmeth.c b/crypto/engine/tb_pkmeth.c
new file mode 100644
index 0000000..1cdb967
--- /dev/null
+++ b/crypto/engine/tb_pkmeth.c
@@ -0,0 +1,167 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_meth_engine(), the function
+ * that is used by EVP to hook in pkey_meth code and cache defaults (etc), will
+ * display brief debugging summaries to stderr with the 'nid'. */
+/* #define ENGINE_PKEY_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_meth_table = NULL;
+
+void ENGINE_unregister_pkey_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_meths(void)
+ {
+ engine_table_cleanup(&pkey_meth_table);
+ }
+
+int ENGINE_register_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_meths()
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_meths(e);
+ }
+
+int ENGINE_set_default_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (ie. try to get a functional reference from the tabled structural
+ * references) for a given pkey_meth 'nid' */
+ENGINE *ENGINE_get_pkey_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_meth_table, nid);
+ }
+
+/* Obtains a pkey_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_METHOD *ENGINE_get_pkey_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_METHOD *ret;
+ ENGINE_PKEY_METHS_PTR fn = ENGINE_get_pkey_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_meth callback from an ENGINE structure */
+ENGINE_PKEY_METHS_PTR ENGINE_get_pkey_meths(const ENGINE *e)
+ {
+ return e->pkey_meths;
+ }
+
+/* Sets the pkey_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_meths(ENGINE *e, ENGINE_PKEY_METHS_PTR f)
+ {
+ e->pkey_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_METHOD *pkm;
+ if (e->pkey_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_meth_free(pkm);
+ }
+ }
+ }
+ }
diff --git a/crypto/err/err.c b/crypto/err/err.c
index 69713a6..fcdb244 100644
--- a/crypto/err/err.c
+++ b/crypto/err/err.c
@@ -1066,6 +1066,13 @@ void ERR_set_error_data(char *data, int flags)
void ERR_add_error_data(int num, ...)
{
va_list args;
+ va_start(args, num);
+ ERR_add_error_vdata(num, args);
+ va_end(args);
+ }
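+
+/* With the split, any varargs front-end can reuse the formatting logic
+ * (sketch; my_add_error is a hypothetical wrapper):
+ *
+ *     void my_add_error(int num, ...)
+ *         {
+ *         va_list args;
+ *         va_start(args, num);
+ *         ERR_add_error_vdata(num, args);
+ *         va_end(args);
+ *         }
+ */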
+
+void ERR_add_error_vdata(int num, va_list args)
+ {
int i,n,s;
char *str,*p,*a;
@@ -1074,7 +1081,6 @@ void ERR_add_error_data(int num, ...)
if (str == NULL) return;
str[0]='\0';
- va_start(args, num);
n=0;
for (i=0; i<num; i++)
diff --git a/crypto/err/err_all.c b/crypto/err/err_all.c
--- a/crypto/err/err_all.c
+++ b/crypto/err/err_all.c
#include <openssl/bio.h>
+#ifndef OPENSSL_NO_COMP
#include <openssl/comp.h>
+#endif
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
@@ -95,6 +97,9 @@
#include <openssl/ui.h>
#include <openssl/ocsp.h>
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include <openssl/ts.h>
#ifndef OPENSSL_NO_CMS
#include <openssl/cms.h>
@@ -102,7 +107,6 @@
#ifndef OPENSSL_NO_JPAKE
#include <openssl/jpake.h>
#endif
-#include <openssl/comp.h>
void ERR_load_crypto_strings(void)
{
@@ -126,7 +130,9 @@ void ERR_load_crypto_strings(void)
ERR_load_ASN1_strings();
ERR_load_CONF_strings();
ERR_load_CRYPTO_strings();
+#ifndef OPENSSL_NO_COMP
ERR_load_COMP_strings();
+#endif
#ifndef OPENSSL_NO_EC
ERR_load_EC_strings();
#endif
@@ -149,12 +155,14 @@ void ERR_load_crypto_strings(void)
#endif
ERR_load_OCSP_strings();
ERR_load_UI_strings();
+#ifdef OPENSSL_FIPS
+ ERR_load_FIPS_strings();
+#endif
#ifndef OPENSSL_NO_CMS
ERR_load_CMS_strings();
#endif
#ifndef OPENSSL_NO_JPAKE
ERR_load_JPAKE_strings();
#endif
- ERR_load_COMP_strings();
#endif
}
diff --git a/crypto/evp/bio_md.c b/crypto/evp/bio_md.c
index 9841e32..144fdfd 100644
--- a/crypto/evp/bio_md.c
+++ b/crypto/evp/bio_md.c
@@ -153,8 +153,12 @@ static int md_write(BIO *b, const char *in, int inl)
{
if (ret > 0)
{
- EVP_DigestUpdate(ctx,(const unsigned char *)in,
- (unsigned int)ret);
+ if (!EVP_DigestUpdate(ctx,(const unsigned char *)in,
+ (unsigned int)ret))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}
if(b->next_bio != NULL)
@@ -220,7 +224,8 @@ static long md_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_DUP:
dbio=ptr;
dctx=dbio->ptr;
- EVP_MD_CTX_copy_ex(dctx,ctx);
+ if (!EVP_MD_CTX_copy_ex(dctx,ctx))
+ return 0;
b->init=1;
break;
default:
diff --git a/crypto/evp/bio_ok.c b/crypto/evp/bio_ok.c
index 98bc1ab..e643353 100644
--- a/crypto/evp/bio_ok.c
+++ b/crypto/evp/bio_ok.c
@@ -133,10 +133,10 @@ static int ok_new(BIO *h);
static int ok_free(BIO *data);
static long ok_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp);
-static void sig_out(BIO* b);
-static void sig_in(BIO* b);
-static void block_out(BIO* b);
-static void block_in(BIO* b);
+static int sig_out(BIO* b);
+static int sig_in(BIO* b);
+static int block_out(BIO* b);
+static int block_in(BIO* b);
#define OK_BLOCK_SIZE (1024*4)
#define OK_BLOCK_BLOCK 4
#define IOBS (OK_BLOCK_SIZE+ OK_BLOCK_BLOCK+ 3*EVP_MAX_MD_SIZE)
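/* Stream layout recap, as background for the error-path changes below:
 * the stream begins with a signature block of two digest-size fields
 * and continues with blocks of the form
 *     [4-byte big-endian payload length][payload][digest]
 * sig_out/sig_in and block_out/block_in now report failures from the
 * EVP_Digest* calls instead of returning void, and the BIO read/write
 * paths fail the operation when they do. */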
@@ -266,10 +266,24 @@ static int ok_read(BIO *b, char *out, int outl)
ctx->buf_len+= i;
/* no signature yet -- check if we got one */
- if (ctx->sigio == 1) sig_in(b);
+ if (ctx->sigio == 1)
+ {
+ if (!sig_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* signature ok -- check if we got block */
- if (ctx->sigio == 0) block_in(b);
+ if (ctx->sigio == 0)
+ {
+ if (!block_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* invalid block -- cancel */
if (ctx->cont <= 0) break;
@@ -293,7 +307,8 @@ static int ok_write(BIO *b, const char *in, int inl)
if ((ctx == NULL) || (b->next_bio == NULL) || (b->init == 0)) return(0);
- if(ctx->sigio) sig_out(b);
+ if(ctx->sigio && !sig_out(b))
+ return 0;
do{
BIO_clear_retry_flags(b);
@@ -332,7 +347,11 @@ static int ok_write(BIO *b, const char *in, int inl)
if(ctx->buf_len >= OK_BLOCK_SIZE+ OK_BLOCK_BLOCK)
{
- block_out(b);
+ if (!block_out(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}while(inl > 0);
@@ -379,7 +398,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_FLUSH:
/* do a final write */
if(ctx->blockout == 0)
- block_out(b);
+ if (!block_out(b))
+ return 0;
while (ctx->blockout)
{
@@ -408,7 +428,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
case BIO_C_SET_MD:
md=ptr;
- EVP_DigestInit_ex(&ctx->md, md, NULL);
+ if (!EVP_DigestInit_ex(&ctx->md, md, NULL))
+ return 0;
b->init=1;
break;
case BIO_C_GET_MD:
@@ -455,7 +476,7 @@ static void longswap(void *_ptr, size_t len)
}
}
-static void sig_out(BIO* b)
+static int sig_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -463,9 +484,10 @@ static void sig_out(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return;
+ if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
/* FIXME: there's absolutely no guarantee this makes any sense at all,
* particularly now EVP_MD_CTX has been restructured.
*/
@@ -474,14 +496,20 @@ static void sig_out(BIO* b)
longswap(&(ctx->buf[ctx->buf_len]), md->digest->md_size);
ctx->buf_len+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
ctx->sigio= 0;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void sig_in(BIO* b)
+static int sig_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -491,15 +519,18 @@ static void sig_in(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return;
+ if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
memcpy(md->md_data, &(ctx->buf[ctx->buf_off]), md->digest->md_size);
longswap(md->md_data, md->digest->md_size);
ctx->buf_off+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
ret= memcmp(&(ctx->buf[ctx->buf_off]), tmp, md->digest->md_size) == 0;
ctx->buf_off+= md->digest->md_size;
if(ret == 1)
@@ -516,9 +547,13 @@ static void sig_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_out(BIO* b)
+static int block_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -532,13 +567,20 @@ static void block_out(BIO* b)
ctx->buf[1]=(unsigned char)(tl>>16);
ctx->buf[2]=(unsigned char)(tl>>8);
ctx->buf[3]=(unsigned char)(tl);
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_in(BIO* b)
+static int block_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -554,10 +596,13 @@ static void block_in(BIO* b)
tl|=ctx->buf[2]; tl<<=8;
tl|=ctx->buf[3];
- if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return;
+ if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return 1;
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
if(memcmp(&(ctx->buf[tl+ OK_BLOCK_BLOCK]), tmp, md->digest->md_size) == 0)
{
/* there might be parts from next block lurking around ! */
@@ -571,5 +616,9 @@ static void block_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c
index c5f9268..2a45d43 100644
--- a/crypto/evp/c_allc.c
+++ b/crypto/evp/c_allc.c
@@ -98,6 +98,9 @@ void OpenSSL_add_all_ciphers(void)
#ifndef OPENSSL_NO_RC4
EVP_add_cipher(EVP_rc4());
EVP_add_cipher(EVP_rc4_40());
+#ifndef OPENSSL_NO_MD5
+ EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
#endif
#ifndef OPENSSL_NO_IDEA
@@ -166,9 +169,9 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_128_cfb1());
EVP_add_cipher(EVP_aes_128_cfb8());
EVP_add_cipher(EVP_aes_128_ofb());
-#if 0
EVP_add_cipher(EVP_aes_128_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_128_gcm());
+ EVP_add_cipher(EVP_aes_128_xts());
EVP_add_cipher_alias(SN_aes_128_cbc,"AES128");
EVP_add_cipher_alias(SN_aes_128_cbc,"aes128");
EVP_add_cipher(EVP_aes_192_ecb());
@@ -177,9 +180,8 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_192_cfb1());
EVP_add_cipher(EVP_aes_192_cfb8());
EVP_add_cipher(EVP_aes_192_ofb());
-#if 0
EVP_add_cipher(EVP_aes_192_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_192_gcm());
EVP_add_cipher_alias(SN_aes_192_cbc,"AES192");
EVP_add_cipher_alias(SN_aes_192_cbc,"aes192");
EVP_add_cipher(EVP_aes_256_ecb());
@@ -188,11 +190,15 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_256_cfb1());
EVP_add_cipher(EVP_aes_256_cfb8());
EVP_add_cipher(EVP_aes_256_ofb());
-#if 0
EVP_add_cipher(EVP_aes_256_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_256_gcm());
+ EVP_add_cipher(EVP_aes_256_xts());
EVP_add_cipher_alias(SN_aes_256_cbc,"AES256");
EVP_add_cipher_alias(SN_aes_256_cbc,"aes256");
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+ EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+ EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
#endif
#ifndef OPENSSL_NO_CAMELLIA
diff --git a/crypto/evp/digest.c b/crypto/evp/digest.c
index 982ba2b..6fc469f 100644
--- a/crypto/evp/digest.c
+++ b/crypto/evp/digest.c
@@ -117,6 +117,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
void EVP_MD_CTX_init(EVP_MD_CTX *ctx)
{
memset(ctx,'\0',sizeof *ctx);
@@ -225,12 +229,26 @@ int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl)
}
if (ctx->flags & EVP_MD_CTX_FLAG_NO_INIT)
return 1;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ if (FIPS_digestinit(ctx, type))
+ return 1;
+ OPENSSL_free(ctx->md_data);
+ ctx->md_data = NULL;
+ return 0;
+ }
+#endif
return ctx->digest->init(ctx);
}
int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestupdate(ctx, data, count);
+#else
return ctx->update(ctx,data,count);
+#endif
}
/* The caller can assume that this removes any secret data from the context */
@@ -245,6 +263,9 @@ int EVP_DigestFinal(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
/* The caller can assume that this removes any secret data from the context */
int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestfinal(ctx, md, size);
+#else
int ret;
OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
@@ -258,6 +279,7 @@ int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
}
memset(ctx->md_data,0,ctx->digest->ctx_size);
return ret;
+#endif
}
int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in)
@@ -351,6 +373,7 @@ void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx)
/* This call frees resources associated with the context */
int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
{
+#ifndef OPENSSL_FIPS
/* Don't assume ctx->md_data was cleaned in EVP_Digest_Final,
* because sometimes only copies of the context are ever finalised.
*/
@@ -363,6 +386,7 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
OPENSSL_cleanse(ctx->md_data,ctx->digest->ctx_size);
OPENSSL_free(ctx->md_data);
}
+#endif
if (ctx->pctx)
EVP_PKEY_CTX_free(ctx->pctx);
#ifndef OPENSSL_NO_ENGINE
@@ -370,6 +394,9 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
/* The EVP_MD we used belongs to an ENGINE, release the
* functional reference we held for this reason. */
ENGINE_finish(ctx->engine);
+#endif
+#ifdef OPENSSL_FIPS
+ FIPS_md_ctx_cleanup(ctx);
#endif
memset(ctx,'\0',sizeof *ctx);
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index bd6c0a3..1bfb5d9 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -1,5 +1,5 @@
/* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -56,57 +56,511 @@
#include <assert.h>
#include <openssl/aes.h>
#include "evp_locl.h"
-
-static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
- const unsigned char *iv, int enc);
+#ifndef OPENSSL_FIPS
+#include "modes_lcl.h"
+#include <openssl/rand.h>
typedef struct
{
AES_KEY ks;
+ block128_f block;
+ union {
+ cbc128_f cbc;
+ ctr128_f ctr;
+ } stream;
} EVP_AES_KEY;
-#define data(ctx) EVP_C_DATA(EVP_AES_KEY,ctx)
-
-IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
- NID_aes_128, 16, 16, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_192, ks, AES, EVP_AES_KEY,
- NID_aes_192, 16, 24, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_256, ks, AES, EVP_AES_KEY,
- NID_aes_256, 16, 32, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-
-#define IMPLEMENT_AES_CFBR(ksize,cbits) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
-
-IMPLEMENT_AES_CFBR(128,1)
-IMPLEMENT_AES_CFBR(192,1)
-IMPLEMENT_AES_CFBR(256,1)
-
-IMPLEMENT_AES_CFBR(128,8)
-IMPLEMENT_AES_CFBR(192,8)
-IMPLEMENT_AES_CFBR(256,8)
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ GCM128_CONTEXT gcm;
+ unsigned char *iv; /* Temporary IV store */
+ int ivlen; /* IV length */
+ int taglen;
+ int iv_gen; /* It is OK to generate IVs */
+ int tls_aad_len; /* TLS AAD length */
+ ctr128_f ctr;
+ } EVP_AES_GCM_CTX;
+
+typedef struct
+ {
+ AES_KEY ks1, ks2; /* AES key schedules to use */
+ XTS128_CONTEXT xts;
+ void (*stream)(const unsigned char *in,
+ unsigned char *out, size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+ } EVP_AES_XTS_CTX;
+
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ int tag_set; /* Set if tag is valid */
+ int len_set; /* Set if message length set */
+ int L, M; /* L and M parameters from RFC3610 */
+ CCM128_CONTEXT ccm;
+ ccm128_f str;
+ } EVP_AES_CCM_CTX;
+
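+/* Cap on how many bytes aes_cfb1_cipher hands to
+ * CRYPTO_cfb128_1_encrypt per call: the bit-oriented CFB1 routine
+ * takes its length in bits, so keeping each chunk below
+ * 2^(bits(size_t)-4) bytes guarantees len*8 cannot overflow size_t. */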
+#define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
+
+#ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+#endif
+#ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+#endif
+#ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ const unsigned char ivec[AES_BLOCK_SIZE]);
+#endif
+#ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+#endif
+
+#if defined(AES_ASM) && !defined(I386_ONLY) && ( \
+ ((defined(__i386) || defined(__i386__) || \
+ defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+#ifdef VPAES_ASM
+#define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#endif
+#ifdef BSAES_ASM
+#define BSAES_CAPABLE VPAES_CAPABLE
+#endif
+/*
+ * AES-NI section
+ */
+#define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
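+/* Bit 57 of OPENSSL_ia32cap_P (bit 25 of CPUID.1:ECX) advertises
+ * AES-NI; the EVP_aes_* getters defined below use it to pick the
+ * aesni_* implementations at run time. */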
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+ {
+ ret = aesni_set_decrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)aesni_cbc_encrypt :
+ NULL;
+ }
+ else {
+ ret = aesni_set_encrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_encrypt;
+ if (mode==EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f)aesni_cbc_encrypt;
+ else if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ else
+ dat->stream.cbc = NULL;
+ }
+
+ if(ret < 0)
+ {
+ EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+ }
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ aesni_cbc_encrypt(in,out,len,ctx->cipher_data,ctx->iv,ctx->encrypt);
+
+ return 1;
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+
+	if (len<bl)	return 1;
+
+	aesni_ecb_encrypt(in,out,len,ctx->cipher_data,ctx->encrypt);
+
+ return 1;
+}
+
+#define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f)aesni_encrypt);
+ gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ /* If we have an iv can set it directly, otherwise use
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If key set use IV, otherwise copy */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+#define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key)
+ {
+ /* key_len is two AES keys */
+ if (enc)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_encrypt;
+ xctx->stream = aesni_xts_encrypt;
+ }
+ else
+ {
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_decrypt;
+ xctx->stream = aesni_xts_decrypt;
+ }
+
+ aesni_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)aesni_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ }
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+#define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)aesni_encrypt);
+ cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
+ (ccm128_f)aesni_ccm64_decrypt_blocks;
+ cctx->key_set = 1;
+ }
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+#define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_init_key, \
+ aesni_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_##mode##_init_key, \
+ aesni_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#else
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+#endif
+
+#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
- int ret;
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
- if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
- || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
- || enc)
- ret=AES_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+ }
else
- ret=AES_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+#ifdef AES_CTR_ASM
+ if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)AES_ctr32_encrypt;
+#endif
+ }
if(ret < 0)
{
@@ -117,4 +571,744 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
}
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.cbc)
+ (*dat->stream.cbc)(in,out,len,&dat->ks,ctx->iv,ctx->encrypt);
+ else if (ctx->encrypt)
+ CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+ else
+		CRYPTO_cbc128_decrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+
+ return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+ size_t i;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (len<bl)	return 1;
+
+	for (i=0,len-=bl;i<=len;i+=bl)
+		(*dat->block)(in+i,out+i,&dat->ks);
+
+ return 1;
+}
+
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_ofb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,dat->block);
+ return 1;
+}
+
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_8_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (ctx->flags&EVP_CIPH_FLAG_LENGTH_BITS) {
+ CRYPTO_cfb128_1_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+ }
+
+ while (len>=MAXBITCHUNK) {
+ CRYPTO_cfb128_1_encrypt(in,out,MAXBITCHUNK*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ len-=MAXBITCHUNK;
+ }
+ if (len)
+ CRYPTO_cfb128_1_encrypt(in,out,len*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+
+ return 1;
+}
+
+static int aes_ctr_cipher (EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ unsigned int num = ctx->num;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.ctr)
+ CRYPTO_ctr128_encrypt_ctr32(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->stream.ctr);
+ else
+ CRYPTO_ctr128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->block);
+ ctx->num = (size_t)num;
+ return 1;
+}
+
+BLOCK_CIPHER_generic_pack(NID_aes,128,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,192,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,256,EVP_CIPH_FLAG_FIPS)
+
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ return 1;
+ }
+
+/* increment counter (64-bit int) by 1 */
+static void ctr64_inc(unsigned char *counter) {
+ int n=8;
+ unsigned char c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ gctx->key_set = 0;
+ gctx->iv_set = 0;
+ gctx->ivlen = c->cipher->iv_len;
+ gctx->iv = c->iv;
+ gctx->taglen = -1;
+ gctx->iv_gen = 0;
+ gctx->tls_aad_len = -1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IVLEN:
+ if (arg <= 0)
+ return 0;
+#ifdef OPENSSL_FIPS
+ if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)
+ && arg < 12)
+ return 0;
+#endif
+ /* Allocate memory for IV if needed */
+ if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen))
+ {
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ gctx->iv = OPENSSL_malloc(arg);
+ if (!gctx->iv)
+ return 0;
+ }
+ gctx->ivlen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_TAG:
+ if (arg <= 0 || arg > 16 || c->encrypt)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->taglen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_GET_TAG:
+ if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+ return 0;
+ memcpy(ptr, c->buf, arg);
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_FIXED:
+ /* Special case: -1 length restores whole IV */
+ if (arg == -1)
+ {
+ memcpy(gctx->iv, ptr, gctx->ivlen);
+ gctx->iv_gen = 1;
+ return 1;
+ }
+ /* Fixed field must be at least 4 bytes and invocation field
+ * at least 8.
+ */
+ if ((arg < 4) || (gctx->ivlen - arg) < 8)
+ return 0;
+ if (arg)
+ memcpy(gctx->iv, ptr, arg);
+ if (c->encrypt &&
+ RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+ return 0;
+ gctx->iv_gen = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_IV_GEN:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0)
+ return 0;
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ if (arg <= 0 || arg > gctx->ivlen)
+ arg = gctx->ivlen;
+ memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+ /* Invocation field will be at least 8 bytes in size and
+ * so no need to check wrap around or increment more than
+ * last 8 bytes.
+ */
+ ctr64_inc(gctx->iv + gctx->ivlen - 8);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_INV:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+ return 0;
+ memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ /* Save the AAD for later use */
+ if (arg != 13)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->tls_aad_len = arg;
+ {
+ unsigned int len=c->buf[arg-2]<<8|c->buf[arg-1];
+ /* Correct length for explicit IV */
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ /* If decrypting correct for tag too */
+ if (!c->encrypt)
+ len -= EVP_GCM_TLS_TAG_LEN;
+ c->buf[arg-2] = len>>8;
+ c->buf[arg-1] = len & 0xff;
+ }
+ /* Extra padding: tag appended to record */
+ return EVP_GCM_TLS_TAG_LEN;
+
+ default:
+ return -1;
+
+ }
+ }
+
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ { do {
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ {
+ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)AES_encrypt);
+ gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ break;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)vpaes_encrypt);
+ gctx->ctr = NULL;
+ break;
+ }
+#endif
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+#ifdef AES_CTR_ASM
+ gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
+#else
+ gctx->ctr = NULL;
+#endif
+ } while (0);
+
+ /* If we have an iv can set it directly, otherwise use
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If key set use IV, otherwise copy */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+/* Handle TLS GCM packet format. This consists of the last portion of the IV
+ * followed by the payload and finally the tag. On encrypt generate IV,
+ * encrypt payload and write the tag. On verify retrieve IV, decrypt payload
+ * and verify tag.
+ */
+
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ int rv = -1;
+ /* Encrypt/decrypt must be performed in place */
+ if (out != in || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN+EVP_GCM_TLS_TAG_LEN))
+ return -1;
+ /* Set IV from start of buffer or generate IV and write to start
+ * of buffer.
+ */
+ if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+ EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+ EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+ goto err;
+ /* Use saved AAD */
+ if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+ goto err;
+ /* Fix buffer and length to point to payload */
+ in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ if (ctx->encrypt)
+ {
+ /* Encrypt payload */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ out += len;
+ /* Finally write tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+ rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ }
+ else
+ {
+ /* Decrypt */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ /* Retrieve tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf,
+ EVP_GCM_TLS_TAG_LEN);
+ /* If tag mismatch wipe buffer */
+ if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN))
+ {
+ OPENSSL_cleanse(out, len);
+ goto err;
+ }
+ rv = len;
+ }
+
+ err:
+ gctx->iv_set = 0;
+ gctx->tls_aad_len = -1;
+ return rv;
+ }
+
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ /* If not set up, return error */
+ if (!gctx->key_set)
+ return -1;
+
+ if (gctx->tls_aad_len >= 0)
+ return aes_gcm_tls_cipher(ctx, out, in, len);
+
+ if (!gctx->iv_set)
+ return -1;
+ if (in)
+ {
+ if (out == NULL)
+ {
+ if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+ return -1;
+ }
+ else if (ctx->encrypt)
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ else
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ return len;
+ }
+ else
+ {
+ if (!ctx->encrypt)
+ {
+ if (gctx->taglen < 0)
+ return -1;
+ if (CRYPTO_gcm128_finish(&gctx->gcm,
+ ctx->buf, gctx->taglen) != 0)
+ return -1;
+ gctx->iv_set = 0;
+ return 0;
+ }
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
+ gctx->taglen = 16;
+ /* Don't reuse the IV */
+ gctx->iv_set = 0;
+ return 0;
+ }
+
+ }
+
+#define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \
+ | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
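+/* The GCM and CCM definitions below manage their own buffering:
+ * EVP_CIPH_FLAG_CUSTOM_CIPHER routes all data (including AAD calls
+ * made with a NULL output buffer) straight to the do_cipher hook,
+ * EVP_CIPH_ALWAYS_CALL_INIT runs init_key even when key is NULL, and
+ * EVP_CIPH_CTRL_INIT delivers EVP_CTRL_INIT at context setup. */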
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_XTS_CTX *xctx = c->cipher_data;
+ if (type != EVP_CTRL_INIT)
+ return -1;
+ /* key1 and key2 are used as an indicator both key and IV are set */
+ xctx->xts.key1 = NULL;
+ xctx->xts.key2 = NULL;
+ return 1;
+ }
+
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key) do
+ {
+#ifdef AES_XTS_ASM
+ xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+#else
+ xctx->stream = NULL;
+#endif
+ /* key_len is two AES keys */
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ if (enc)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_encrypt;
+ }
+ else
+ {
+ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_decrypt;
+ }
+
+ vpaes_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)vpaes_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ }
+#endif
+ if (enc)
+ {
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_encrypt;
+ }
+ else
+ {
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_decrypt;
+ }
+
+ AES_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)AES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ } while (0);
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!xctx->xts.key1 || !xctx->xts.key2)
+ return 0;
+	if (!out || !in || len<AES_BLOCK_SIZE)
+		return 0;
+#ifdef OPENSSL_FIPS
+	/* Requirement of SP800-38E */
+	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
+ (len > (1UL<<20)*16))
+ {
+ EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
+ return 0;
+ }
+#endif
+ if (xctx->stream)
+ (*xctx->stream)(in, out, len,
+ xctx->xts.key1, xctx->xts.key2, ctx->iv);
+ else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+ ctx->encrypt))
+ return 0;
+ return 1;
+ }
+
+#define aes_xts_cleanup NULL
+
+#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_CCM_CTX *cctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ cctx->key_set = 0;
+ cctx->iv_set = 0;
+ cctx->L = 8;
+ cctx->M = 12;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_IVLEN:
+ arg = 15 - arg;
+ case EVP_CTRL_CCM_SET_L:
+ if (arg < 2 || arg > 8)
+ return 0;
+ cctx->L = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_TAG:
+ if ((arg & 1) || arg < 4 || arg > 16)
+ return 0;
+ if ((c->encrypt && ptr) || (!c->encrypt && !ptr))
+ return 0;
+ if (ptr)
+ {
+ cctx->tag_set = 1;
+ memcpy(c->buf, ptr, arg);
+ }
+ cctx->M = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_GET_TAG:
+ if (!c->encrypt || !cctx->tag_set)
+ return 0;
+ if(!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+ return 0;
+ cctx->tag_set = 0;
+ cctx->iv_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ default:
+ return -1;
+
+ }
+ }
+
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) do
+ {
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)vpaes_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ break;
+ }
+#endif
+ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)AES_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ } while (0);
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ CCM128_CONTEXT *ccm = &cctx->ccm;
+ /* If not set up, return error */
+ if (!cctx->iv_set && !cctx->key_set)
+ return -1;
+ if (!ctx->encrypt && !cctx->tag_set)
+ return -1;
+ if (!out)
+ {
+ if (!in)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L,len))
+ return -1;
+ cctx->len_set = 1;
+ return len;
+ }
+ /* If have AAD need message length */
+ if (!cctx->len_set && len)
+ return -1;
+ CRYPTO_ccm128_aad(ccm, in, len);
+ return len;
+ }
+ /* EVP_*Final() doesn't return any data */
+ if (!in)
+ return 0;
+ /* If not set length yet do it */
+ if (!cctx->len_set)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+ return -1;
+ cctx->len_set = 1;
+ }
+ if (ctx->encrypt)
+ {
+ if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ CRYPTO_ccm128_encrypt(ccm, in, out, len))
+ return -1;
+ cctx->tag_set = 1;
+ return len;
+ }
+ else
+ {
+ int rv = -1;
+ if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ !CRYPTO_ccm128_decrypt(ccm, in, out, len))
+ {
+ unsigned char tag[16];
+ if (CRYPTO_ccm128_tag(ccm, tag, cctx->M))
+ {
+ if (!memcmp(tag, ctx->buf, cctx->M))
+ rv = len;
+ }
+ }
+ if (rv == -1)
+ OPENSSL_cleanse(out, len);
+ cctx->iv_set = 0;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return rv;
+ }
+
+ }
+
+#define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+
+#endif
#endif
diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644
index 0000000..483e04b
--- /dev/null
+++ b/crypto/evp/e_aes_cbc_hmac_sha1.c
@@ -0,0 +1,580 @@
+/* ====================================================================
+ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+#include "evp_locl.h"
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+typedef struct
+ {
+ AES_KEY ks;
+ SHA_CTX head,tail,md;
+ size_t payload_length; /* AAD length in decrypt case */
+ union {
+ unsigned int tls_ver;
+ unsigned char tls_aad[16]; /* 13 used */
+ } aux;
+ } EVP_AES_HMAC_SHA1;
+
+#define NO_PAYLOAD_LENGTH ((size_t)-1)
+
+#if defined(AES_ASM) && ( \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
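+/* BSWAP gives a single-instruction byte swap via gcc inline asm; the
+ * constant-time MAC code below uses it to produce the big-endian
+ * SHA-1 bit count and digest words without per-byte shifts. */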
+#if defined(__GNUC__) && __GNUC__>=2 && !defined(PEDANTIC)
+# define BSWAP(x) ({ unsigned int r=(x); asm ("bswapl %0":"=r"(r):"0"(r)); r; })
+#endif
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+ const AES_KEY *key, unsigned char iv[16],
+ SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+ const unsigned char *inkey,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ int ret;
+
+ if (enc)
+ ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks);
+ else
+ ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+ SHA1_Init(&key->head); /* handy when benchmarking */
+ key->tail = key->head;
+ key->md = key->head;
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return ret<0?0:1;
+ }
+
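+/* With STITCHED_CALL defined, the encrypt path hands whole
+ * SHA1-block-aligned runs to aesni_cbc_sha1_enc(), which interleaves
+ * ("stitches") the AES-CBC rounds with the SHA-1 compression
+ * function instead of making two separate passes over the data. */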
+#define STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{ const unsigned char *ptr = data;
+ size_t res;
+
+ if ((res = c->num)) {
+ res = SHA_CBLOCK-res;
+		if (len<res) res=len;
+		SHA1_Update(c,ptr,res);
+		ptr += res;
+		len -= res;
+	}
+
+	res = len & (SHA_CBLOCK-1);
+	len -= res;
+
+	if (len) {
+		sha1_block_data_order(c,ptr,len/SHA_CBLOCK);
+
+		ptr += len;
+		c->Nh += len>>29;
+ c->Nl += len<<=3;
+ if (c->Nl<(unsigned int)len) c->Nh++;
+ }
+
+ if (res)
+ SHA1_Update(c,ptr,res);
+}
+
+#ifdef SHA1_Update
+#undef SHA1_Update
+#endif
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ unsigned int l;
+ size_t plen = key->payload_length,
+ iv = 0, /* explicit IV in TLS 1.1 and later */
+ sha_off = 0;
+#if defined(STITCHED_CALL)
+ size_t aes_off = 0,
+ blocks;
+
+ sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ if (len%AES_BLOCK_SIZE) return 0;
+
+ if (ctx->encrypt) {
+ if (plen==NO_PAYLOAD_LENGTH)
+ plen = len;
+ else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+ return 0;
+ else if (key->aux.tls_ver >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+ if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+ SHA1_Update(&key->md,in+iv,sha_off);
+
+ aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+ ctx->iv,&key->md,in+iv+sha_off);
+ blocks *= SHA_CBLOCK;
+ aes_off += blocks;
+ sha_off += blocks;
+ key->md.Nh += blocks>>29;
+ key->md.Nl += blocks<<=3;
+ if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+ } else {
+ sha_off = 0;
+ }
+#endif
+ sha_off += iv;
+ SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+ if (plen!=len) { /* "TLS" mode of operation */
+ if (in!=out)
+ memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+ /* calculate HMAC and append it to payload */
+ SHA1_Final(out+plen,&key->md);
+ key->md = key->tail;
+ SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+ SHA1_Final(out+plen,&key->md);
+
+ /* pad the payload|hmac */
+ plen += SHA_DIGEST_LENGTH;
+			for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+			/* encrypt HMAC|padding at once */
+			aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+ } else {
+ aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+ &key->ks,ctx->iv,1);
+ }
+ } else {
+ union { unsigned int u[SHA_DIGEST_LENGTH/sizeof(unsigned int)];
+ unsigned char c[32+SHA_DIGEST_LENGTH]; } mac, *pmac;
+
+ /* arrange cache line alignment */
+ pmac = (void *)(((size_t)mac.c+31)&((size_t)0-32));
+
+ /* decrypt HMAC|padding at once */
+ aesni_cbc_encrypt(in,out,len,
+ &key->ks,ctx->iv,0);
+
+ if (plen) { /* "TLS" mode of operation */
+ size_t inp_len, mask, j, i;
+ unsigned int res, maxpad, pad, bitlen;
+ int ret = 1;
+ union { unsigned int u[SHA_LBLOCK];
+ unsigned char c[SHA_CBLOCK]; }
+ *data = (void *)key->md.data;
+
+ if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+ >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+ if (len<(iv+SHA_DIGEST_LENGTH+1))
+ return 0;
+
+ /* omit explicit iv */
+ out += iv;
+ len -= iv;
+
+ /* figure out payload length */
+ pad = out[len-1];
+ maxpad = len-(SHA_DIGEST_LENGTH+1);
+ maxpad |= (255-maxpad)>>(sizeof(maxpad)*8-8);
+ maxpad &= 255;
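+			/* the two lines above clamp maxpad to 255 (the
+			 * TLS padding maximum) without a data-dependent
+			 * branch */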
+
+ inp_len = len - (SHA_DIGEST_LENGTH+pad+1);
+ mask = (0-((inp_len-len)>>(sizeof(inp_len)*8-1)));
+ inp_len &= mask;
+ ret &= (int)mask;
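+			/* this mask arithmetic, and the masked per-block
+			 * hashing below, keep the padding and MAC checks
+			 * free of data-dependent branches and memory access
+			 * patterns; this is the Lucky Thirteen
+			 * (CVE-2013-0169) mitigation */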
+
+ key->aux.tls_aad[plen-2] = inp_len>>8;
+ key->aux.tls_aad[plen-1] = inp_len;
+
+ /* calculate HMAC */
+ key->md = key->head;
+ SHA1_Update(&key->md,key->aux.tls_aad,plen);
+
+#if 1
+ len -= SHA_DIGEST_LENGTH; /* amend mac */
+ if (len>=(256+SHA_CBLOCK)) {
+ j = (len-(256+SHA_CBLOCK))&(0-SHA_CBLOCK);
+ j += SHA_CBLOCK-key->md.num;
+ SHA1_Update(&key->md,out,j);
+ out += j;
+ len -= j;
+ inp_len -= j;
+ }
+
+ /* but pretend as if we hashed padded payload */
+ bitlen = key->md.Nl+(inp_len<<3); /* at most 18 bits */
+#ifdef BSWAP
+ bitlen = BSWAP(bitlen);
+#else
+ mac.c[0] = 0;
+ mac.c[1] = (unsigned char)(bitlen>>16);
+ mac.c[2] = (unsigned char)(bitlen>>8);
+ mac.c[3] = (unsigned char)bitlen;
+ bitlen = mac.u[0];
+#endif
+
+ pmac->u[0]=0;
+ pmac->u[1]=0;
+ pmac->u[2]=0;
+ pmac->u[3]=0;
+ pmac->u[4]=0;
+
+			for (res=key->md.num, j=0;j<maxpad+SHA_DIGEST_LENGTH;j++) {
+				size_t c = out[j];
+				mask = (j-inp_len)>>(sizeof(j)*8-8);
+ c &= mask;
+ c |= 0x80&~mask&~((inp_len-j)>>(sizeof(j)*8-8));
+ data->c[res++]=(unsigned char)c;
+
+ if (res!=SHA_CBLOCK) continue;
+
+ mask = 0-((inp_len+8-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+ res=0;
+ }
+
+			for(i=res;i<SHA_CBLOCK;i++,j++) data->c[i]=0;
+
+ if (res>SHA_CBLOCK-8) {
+ mask = 0-((inp_len+8-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+ memset(data,0,SHA_CBLOCK);
+ j+=64;
+ }
+ data->u[SHA_LBLOCK-1] = bitlen;
+ sha1_block_data_order(&key->md,data,1);
+ mask = 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+#ifdef BSWAP
+ pmac->u[0] = BSWAP(pmac->u[0]);
+ pmac->u[1] = BSWAP(pmac->u[1]);
+ pmac->u[2] = BSWAP(pmac->u[2]);
+ pmac->u[3] = BSWAP(pmac->u[3]);
+ pmac->u[4] = BSWAP(pmac->u[4]);
+#else
+ for (i=0;i<5;i++) {
+ res = pmac->u[i];
+ pmac->c[4*i+0]=(unsigned char)(res>>24);
+ pmac->c[4*i+1]=(unsigned char)(res>>16);
+ pmac->c[4*i+2]=(unsigned char)(res>>8);
+ pmac->c[4*i+3]=(unsigned char)res;
+ }
+#endif
+ len += SHA_DIGEST_LENGTH;
+#else
+ SHA1_Update(&key->md,out,inp_len);
+ res = key->md.num;
+ SHA1_Final(pmac->c,&key->md);
+
+ {
+ unsigned int inp_blocks, pad_blocks;
+
+ /* but pretend as if we hashed padded payload */
+ inp_blocks = 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+ res += (unsigned int)(len-inp_len);
+ pad_blocks = res / SHA_CBLOCK;
+ res %= SHA_CBLOCK;
+ pad_blocks += 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+			for (;inp_blocks<pad_blocks;inp_blocks++)