diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0ada434
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+libs
+obj
+local.properties
diff --git a/Android.mk b/Android.mk
index 741123b..9f5834e 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,9 +1,13 @@
LOCAL_PATH := $(call my-dir)
-subdirs := $(addprefix $(LOCAL_PATH)/,$(addsuffix /Android.mk, \
- crypto \
- ssl \
- apps \
- ))
+# Uncomment the following to be able to use ALOG* with #include "cutils/log.h"
+#log_c_includes += system/core/include
+#log_shared_libraries := liblog
-include $(subdirs)
+# These makefiles are here instead of being Android.mk files in the
+# respective crypto, ssl, and apps directories so
+# that import_openssl.sh import won't remove them.
+include $(LOCAL_PATH)/build-config.mk
+include $(LOCAL_PATH)/Crypto.mk
+include $(LOCAL_PATH)/Ssl.mk
+include $(LOCAL_PATH)/Apps.mk
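As context for the commented-out logging hooks above: enabling them is just a matter of uncommenting the two assignments so Crypto.mk and Ssl.mk pick them up through $(log_c_includes) and $(log_shared_libraries). A minimal sketch, assuming a checkout where system/core/include and liblog are available:

```
# Sketch: top-level Android.mk with the optional logging hooks enabled
LOCAL_PATH := $(call my-dir)

# Make ALOG* usable via #include "cutils/log.h" in the sub-makefiles
log_c_includes += system/core/include
log_shared_libraries := liblog

include $(LOCAL_PATH)/build-config.mk
include $(LOCAL_PATH)/Crypto.mk
include $(LOCAL_PATH)/Ssl.mk
include $(LOCAL_PATH)/Apps.mk
```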
diff --git a/AndroidManifest.xml b/AndroidManifest.xml
new file mode 100644
index 0000000..7b13d57
--- /dev/null
+++ b/AndroidManifest.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
diff --git a/Apps-config.mk b/Apps-config.mk
new file mode 100644
index 0000000..2ee8026
--- /dev/null
+++ b/Apps-config.mk
@@ -0,0 +1,133 @@
+# Auto-generated - DO NOT EDIT!
+# To regenerate, edit openssl.config, then run:
+# ./import_openssl.sh import /path/to/openssl-1.0.1e.tar.gz
+#
+# Before including this file, the local Android.mk must define the following
+# variables:
+#
+# local_c_flags
+# local_c_includes
+# local_additional_dependencies
+#
+# This script will define the following variables:
+#
+# target_c_flags
+# target_c_includes
+# target_src_files
+#
+# host_c_flags
+# host_c_includes
+# host_src_files
+#
+
+# Ensure these are empty.
+unknown_arch_c_flags :=
+unknown_arch_src_files :=
+unknown_arch_exclude_files :=
+
+
+common_c_flags := \
+ -DMONOLITH \
+
+common_src_files := \
+ apps/app_rand.c \
+ apps/apps.c \
+ apps/asn1pars.c \
+ apps/ca.c \
+ apps/ciphers.c \
+ apps/crl.c \
+ apps/crl2p7.c \
+ apps/dgst.c \
+ apps/dh.c \
+ apps/dhparam.c \
+ apps/dsa.c \
+ apps/dsaparam.c \
+ apps/ec.c \
+ apps/ecparam.c \
+ apps/enc.c \
+ apps/engine.c \
+ apps/errstr.c \
+ apps/gendh.c \
+ apps/gendsa.c \
+ apps/genpkey.c \
+ apps/genrsa.c \
+ apps/nseq.c \
+ apps/ocsp.c \
+ apps/openssl.c \
+ apps/passwd.c \
+ apps/pkcs12.c \
+ apps/pkcs7.c \
+ apps/pkcs8.c \
+ apps/pkey.c \
+ apps/pkeyparam.c \
+ apps/pkeyutl.c \
+ apps/prime.c \
+ apps/rand.c \
+ apps/req.c \
+ apps/rsa.c \
+ apps/rsautl.c \
+ apps/s_cb.c \
+ apps/s_client.c \
+ apps/s_server.c \
+ apps/s_socket.c \
+ apps/s_time.c \
+ apps/sess_id.c \
+ apps/smime.c \
+ apps/speed.c \
+ apps/spkac.c \
+ apps/srp.c \
+ apps/verify.c \
+ apps/version.c \
+ apps/x509.c \
+
+common_c_includes := \
+ . \
+ include \
+
+arm_c_flags :=
+
+arm_src_files :=
+
+arm_exclude_files :=
+
+x86_c_flags :=
+
+x86_src_files :=
+
+x86_exclude_files :=
+
+x86_64_c_flags :=
+
+x86_64_src_files :=
+
+x86_64_exclude_files :=
+
+mips_c_flags :=
+
+mips_src_files :=
+
+mips_exclude_files :=
+
+target_arch := $(TARGET_ARCH)
+ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
+target_arch := unknown_arch
+endif
+
+target_c_flags := $(common_c_flags) $($(target_arch)_c_flags) $(local_c_flags)
+target_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+target_src_files := $(common_src_files) $($(target_arch)_src_files)
+target_src_files := $(filter-out $($(target_arch)_exclude_files), $(target_src_files))
+
+ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
+host_arch := x86
+else
+host_arch := unknown_arch
+endif
+
+host_c_flags := $(common_c_flags) $($(host_arch)_c_flags) $(local_c_flags)
+host_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+host_src_files := $(common_src_files) $($(host_arch)_src_files)
+host_src_files := $(filter-out $($(host_arch)_exclude_files), $(host_src_files))
+
+local_additional_dependencies += $(LOCAL_PATH)/Apps-config.mk
+
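To make the in/out contract described in the header comment concrete, here is a hedged sketch of a consumer makefile; the module name and extra flags are illustrative, not part of the project:

```
# Sketch: consuming Apps-config.mk (illustrative values)
LOCAL_PATH := $(call my-dir)

# Inputs the config expects before inclusion
local_c_flags := -DEXAMPLE_FLAG              # hypothetical
local_c_includes := $(LOCAL_PATH)/include    # hypothetical
local_additional_dependencies :=

include $(LOCAL_PATH)/Apps-config.mk

# Outputs: target_*/host_* with arch sources merged and excludes filtered out
include $(CLEAR_VARS)
LOCAL_MODULE := openssl-example              # hypothetical
LOCAL_SRC_FILES := $(target_src_files)
LOCAL_CFLAGS := $(target_c_flags)
LOCAL_C_INCLUDES := $(target_c_includes)
include $(BUILD_EXECUTABLE)
```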
diff --git a/Apps.mk b/Apps.mk
new file mode 100644
index 0000000..3ba6fe3
--- /dev/null
+++ b/Apps.mk
@@ -0,0 +1,34 @@
+# Copyright 2006 The Android Open Source Project
+
+LOCAL_PATH:= $(call my-dir)
+
+local_c_includes := $(NDK_PROJECT_PATH) \
+ $(NDK_PROJECT_PATH)/include
+local_c_flags :=
+
+local_additional_dependencies := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Apps.mk
+
+include $(LOCAL_PATH)/Apps-config.mk
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= openssl
+LOCAL_CLANG := true
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := $(target_src_files)
+LOCAL_SHARED_LIBRARIES := libssl libcrypto
+LOCAL_C_INCLUDES := $(target_c_includes)
+LOCAL_CFLAGS := $(target_c_flags)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(LOCAL_PATH)/android-config.mk
+include $(BUILD_EXECUTABLE)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= openssl
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := $(host_src_files)
+LOCAL_SHARED_LIBRARIES := libssl-host libcrypto-host
+LOCAL_C_INCLUDES := $(host_c_includes)
+LOCAL_CFLAGS := $(host_c_flags)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(LOCAL_PATH)/android-config.mk
+#include $(BUILD_HOST_EXECUTABLE)
diff --git a/CleanSpec.mk b/CleanSpec.mk
index 7549ef9..3c1f850 100644
--- a/CleanSpec.mk
+++ b/CleanSpec.mk
@@ -54,6 +54,7 @@ $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libssl_interme
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libcrypto_intermediates)
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libssl_static_intermediates)
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libcrypto_static_intermediates)
+$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/EXECUTABLES/*ssl*_intermediates $(PRODUCT_OUT)/obj/*/libssl_*intermediates $(PRODUCT_OUT)/obj/*/libcrypto_*intermediates)
# ************************************************
# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
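For reference, add-clean-step registers a one-time cleanup command that the platform build executes when it first encounters the new step; a hedged sketch of adding a further step (the path is illustrative):

```
# Sketch: hypothetical additional clean step
$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/EXECUTABLES/ssltest_intermediates)
```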
diff --git a/Crypto-config.mk b/Crypto-config.mk
new file mode 100644
index 0000000..d74a791
--- /dev/null
+++ b/Crypto-config.mk
@@ -0,0 +1,686 @@
+# Auto-generated - DO NOT EDIT!
+# To regenerate, edit openssl.config, then run:
+# ./import_openssl.sh import /path/to/openssl-1.0.1e.tar.gz
+#
+# Before including this file, the local Android.mk must define the following
+# variables:
+#
+# local_c_flags
+# local_c_includes
+# local_additional_dependencies
+#
+# This script will define the following variables:
+#
+# target_c_flags
+# target_c_includes
+# target_src_files
+#
+# host_c_flags
+# host_c_includes
+# host_src_files
+#
+
+# Ensure these are empty.
+unknown_arch_c_flags :=
+unknown_arch_src_files :=
+unknown_arch_exclude_files :=
+
+
+common_c_flags := \
+ -DNO_WINDOWS_BRAINDEATH \
+
+common_src_files := \
+ crypto/aes/aes_cbc.c \
+ crypto/aes/aes_cfb.c \
+ crypto/aes/aes_core.c \
+ crypto/aes/aes_ctr.c \
+ crypto/aes/aes_ecb.c \
+ crypto/aes/aes_misc.c \
+ crypto/aes/aes_ofb.c \
+ crypto/aes/aes_wrap.c \
+ crypto/asn1/a_bitstr.c \
+ crypto/asn1/a_bool.c \
+ crypto/asn1/a_bytes.c \
+ crypto/asn1/a_d2i_fp.c \
+ crypto/asn1/a_digest.c \
+ crypto/asn1/a_dup.c \
+ crypto/asn1/a_enum.c \
+ crypto/asn1/a_gentm.c \
+ crypto/asn1/a_i2d_fp.c \
+ crypto/asn1/a_int.c \
+ crypto/asn1/a_mbstr.c \
+ crypto/asn1/a_object.c \
+ crypto/asn1/a_octet.c \
+ crypto/asn1/a_print.c \
+ crypto/asn1/a_set.c \
+ crypto/asn1/a_sign.c \
+ crypto/asn1/a_strex.c \
+ crypto/asn1/a_strnid.c \
+ crypto/asn1/a_time.c \
+ crypto/asn1/a_type.c \
+ crypto/asn1/a_utctm.c \
+ crypto/asn1/a_utf8.c \
+ crypto/asn1/a_verify.c \
+ crypto/asn1/ameth_lib.c \
+ crypto/asn1/asn1_err.c \
+ crypto/asn1/asn1_gen.c \
+ crypto/asn1/asn1_lib.c \
+ crypto/asn1/asn1_par.c \
+ crypto/asn1/asn_mime.c \
+ crypto/asn1/asn_moid.c \
+ crypto/asn1/asn_pack.c \
+ crypto/asn1/bio_asn1.c \
+ crypto/asn1/bio_ndef.c \
+ crypto/asn1/d2i_pr.c \
+ crypto/asn1/d2i_pu.c \
+ crypto/asn1/evp_asn1.c \
+ crypto/asn1/f_enum.c \
+ crypto/asn1/f_int.c \
+ crypto/asn1/f_string.c \
+ crypto/asn1/i2d_pr.c \
+ crypto/asn1/i2d_pu.c \
+ crypto/asn1/n_pkey.c \
+ crypto/asn1/nsseq.c \
+ crypto/asn1/p5_pbe.c \
+ crypto/asn1/p5_pbev2.c \
+ crypto/asn1/p8_pkey.c \
+ crypto/asn1/t_bitst.c \
+ crypto/asn1/t_crl.c \
+ crypto/asn1/t_pkey.c \
+ crypto/asn1/t_req.c \
+ crypto/asn1/t_spki.c \
+ crypto/asn1/t_x509.c \
+ crypto/asn1/t_x509a.c \
+ crypto/asn1/tasn_dec.c \
+ crypto/asn1/tasn_enc.c \
+ crypto/asn1/tasn_fre.c \
+ crypto/asn1/tasn_new.c \
+ crypto/asn1/tasn_prn.c \
+ crypto/asn1/tasn_typ.c \
+ crypto/asn1/tasn_utl.c \
+ crypto/asn1/x_algor.c \
+ crypto/asn1/x_attrib.c \
+ crypto/asn1/x_bignum.c \
+ crypto/asn1/x_crl.c \
+ crypto/asn1/x_exten.c \
+ crypto/asn1/x_info.c \
+ crypto/asn1/x_long.c \
+ crypto/asn1/x_name.c \
+ crypto/asn1/x_nx509.c \
+ crypto/asn1/x_pkey.c \
+ crypto/asn1/x_pubkey.c \
+ crypto/asn1/x_req.c \
+ crypto/asn1/x_sig.c \
+ crypto/asn1/x_spki.c \
+ crypto/asn1/x_val.c \
+ crypto/asn1/x_x509.c \
+ crypto/asn1/x_x509a.c \
+ crypto/bf/bf_cfb64.c \
+ crypto/bf/bf_ecb.c \
+ crypto/bf/bf_enc.c \
+ crypto/bf/bf_ofb64.c \
+ crypto/bf/bf_skey.c \
+ crypto/bio/b_dump.c \
+ crypto/bio/b_print.c \
+ crypto/bio/b_sock.c \
+ crypto/bio/bf_buff.c \
+ crypto/bio/bf_nbio.c \
+ crypto/bio/bf_null.c \
+ crypto/bio/bio_cb.c \
+ crypto/bio/bio_err.c \
+ crypto/bio/bio_lib.c \
+ crypto/bio/bss_acpt.c \
+ crypto/bio/bss_bio.c \
+ crypto/bio/bss_conn.c \
+ crypto/bio/bss_dgram.c \
+ crypto/bio/bss_fd.c \
+ crypto/bio/bss_file.c \
+ crypto/bio/bss_log.c \
+ crypto/bio/bss_mem.c \
+ crypto/bio/bss_null.c \
+ crypto/bio/bss_sock.c \
+ crypto/bn/bn_add.c \
+ crypto/bn/bn_asm.c \
+ crypto/bn/bn_blind.c \
+ crypto/bn/bn_const.c \
+ crypto/bn/bn_ctx.c \
+ crypto/bn/bn_div.c \
+ crypto/bn/bn_err.c \
+ crypto/bn/bn_exp.c \
+ crypto/bn/bn_exp2.c \
+ crypto/bn/bn_gcd.c \
+ crypto/bn/bn_gf2m.c \
+ crypto/bn/bn_kron.c \
+ crypto/bn/bn_lib.c \
+ crypto/bn/bn_mod.c \
+ crypto/bn/bn_mont.c \
+ crypto/bn/bn_mpi.c \
+ crypto/bn/bn_mul.c \
+ crypto/bn/bn_nist.c \
+ crypto/bn/bn_prime.c \
+ crypto/bn/bn_print.c \
+ crypto/bn/bn_rand.c \
+ crypto/bn/bn_recp.c \
+ crypto/bn/bn_shift.c \
+ crypto/bn/bn_sqr.c \
+ crypto/bn/bn_sqrt.c \
+ crypto/bn/bn_word.c \
+ crypto/buffer/buf_err.c \
+ crypto/buffer/buf_str.c \
+ crypto/buffer/buffer.c \
+ crypto/cmac/cm_ameth.c \
+ crypto/cmac/cm_pmeth.c \
+ crypto/cmac/cmac.c \
+ crypto/comp/c_rle.c \
+ crypto/comp/c_zlib.c \
+ crypto/comp/comp_err.c \
+ crypto/comp/comp_lib.c \
+ crypto/conf/conf_api.c \
+ crypto/conf/conf_def.c \
+ crypto/conf/conf_err.c \
+ crypto/conf/conf_lib.c \
+ crypto/conf/conf_mall.c \
+ crypto/conf/conf_mod.c \
+ crypto/conf/conf_sap.c \
+ crypto/cpt_err.c \
+ crypto/cryptlib.c \
+ crypto/cversion.c \
+ crypto/des/cbc_cksm.c \
+ crypto/des/cbc_enc.c \
+ crypto/des/cfb64ede.c \
+ crypto/des/cfb64enc.c \
+ crypto/des/cfb_enc.c \
+ crypto/des/des_enc.c \
+ crypto/des/des_old.c \
+ crypto/des/des_old2.c \
+ crypto/des/ecb3_enc.c \
+ crypto/des/ecb_enc.c \
+ crypto/des/ede_cbcm_enc.c \
+ crypto/des/enc_read.c \
+ crypto/des/enc_writ.c \
+ crypto/des/fcrypt.c \
+ crypto/des/fcrypt_b.c \
+ crypto/des/ofb64ede.c \
+ crypto/des/ofb64enc.c \
+ crypto/des/ofb_enc.c \
+ crypto/des/pcbc_enc.c \
+ crypto/des/qud_cksm.c \
+ crypto/des/rand_key.c \
+ crypto/des/read2pwd.c \
+ crypto/des/rpc_enc.c \
+ crypto/des/set_key.c \
+ crypto/des/str2key.c \
+ crypto/des/xcbc_enc.c \
+ crypto/dh/dh_ameth.c \
+ crypto/dh/dh_asn1.c \
+ crypto/dh/dh_check.c \
+ crypto/dh/dh_depr.c \
+ crypto/dh/dh_err.c \
+ crypto/dh/dh_gen.c \
+ crypto/dh/dh_key.c \
+ crypto/dh/dh_lib.c \
+ crypto/dh/dh_pmeth.c \
+ crypto/dsa/dsa_ameth.c \
+ crypto/dsa/dsa_asn1.c \
+ crypto/dsa/dsa_depr.c \
+ crypto/dsa/dsa_err.c \
+ crypto/dsa/dsa_gen.c \
+ crypto/dsa/dsa_key.c \
+ crypto/dsa/dsa_lib.c \
+ crypto/dsa/dsa_ossl.c \
+ crypto/dsa/dsa_pmeth.c \
+ crypto/dsa/dsa_prn.c \
+ crypto/dsa/dsa_sign.c \
+ crypto/dsa/dsa_vrf.c \
+ crypto/dso/dso_dl.c \
+ crypto/dso/dso_dlfcn.c \
+ crypto/dso/dso_err.c \
+ crypto/dso/dso_lib.c \
+ crypto/dso/dso_null.c \
+ crypto/dso/dso_openssl.c \
+ crypto/ebcdic.c \
+ crypto/ec/ec2_mult.c \
+ crypto/ec/ec2_oct.c \
+ crypto/ec/ec2_smpl.c \
+ crypto/ec/ec_ameth.c \
+ crypto/ec/ec_asn1.c \
+ crypto/ec/ec_check.c \
+ crypto/ec/ec_curve.c \
+ crypto/ec/ec_cvt.c \
+ crypto/ec/ec_err.c \
+ crypto/ec/ec_key.c \
+ crypto/ec/ec_lib.c \
+ crypto/ec/ec_mult.c \
+ crypto/ec/ec_oct.c \
+ crypto/ec/ec_pmeth.c \
+ crypto/ec/ec_print.c \
+ crypto/ec/eck_prn.c \
+ crypto/ec/ecp_mont.c \
+ crypto/ec/ecp_nist.c \
+ crypto/ec/ecp_oct.c \
+ crypto/ec/ecp_smpl.c \
+ crypto/ecdh/ech_err.c \
+ crypto/ecdh/ech_key.c \
+ crypto/ecdh/ech_lib.c \
+ crypto/ecdh/ech_ossl.c \
+ crypto/ecdsa/ecs_asn1.c \
+ crypto/ecdsa/ecs_err.c \
+ crypto/ecdsa/ecs_lib.c \
+ crypto/ecdsa/ecs_ossl.c \
+ crypto/ecdsa/ecs_sign.c \
+ crypto/ecdsa/ecs_vrf.c \
+ crypto/engine/eng_all.c \
+ crypto/engine/eng_cnf.c \
+ crypto/engine/eng_ctrl.c \
+ crypto/engine/eng_dyn.c \
+ crypto/engine/eng_err.c \
+ crypto/engine/eng_fat.c \
+ crypto/engine/eng_init.c \
+ crypto/engine/eng_lib.c \
+ crypto/engine/eng_list.c \
+ crypto/engine/eng_pkey.c \
+ crypto/engine/eng_table.c \
+ crypto/engine/tb_asnmth.c \
+ crypto/engine/tb_cipher.c \
+ crypto/engine/tb_dh.c \
+ crypto/engine/tb_digest.c \
+ crypto/engine/tb_dsa.c \
+ crypto/engine/tb_ecdh.c \
+ crypto/engine/tb_ecdsa.c \
+ crypto/engine/tb_pkmeth.c \
+ crypto/engine/tb_rand.c \
+ crypto/engine/tb_rsa.c \
+ crypto/engine/tb_store.c \
+ crypto/err/err.c \
+ crypto/err/err_all.c \
+ crypto/err/err_prn.c \
+ crypto/evp/bio_b64.c \
+ crypto/evp/bio_enc.c \
+ crypto/evp/bio_md.c \
+ crypto/evp/bio_ok.c \
+ crypto/evp/c_all.c \
+ crypto/evp/c_allc.c \
+ crypto/evp/c_alld.c \
+ crypto/evp/digest.c \
+ crypto/evp/e_aes.c \
+ crypto/evp/e_aes_cbc_hmac_sha1.c \
+ crypto/evp/e_bf.c \
+ crypto/evp/e_des.c \
+ crypto/evp/e_des3.c \
+ crypto/evp/e_null.c \
+ crypto/evp/e_old.c \
+ crypto/evp/e_rc2.c \
+ crypto/evp/e_rc4.c \
+ crypto/evp/e_rc4_hmac_md5.c \
+ crypto/evp/e_rc5.c \
+ crypto/evp/e_xcbc_d.c \
+ crypto/evp/encode.c \
+ crypto/evp/evp_acnf.c \
+ crypto/evp/evp_cnf.c \
+ crypto/evp/evp_enc.c \
+ crypto/evp/evp_err.c \
+ crypto/evp/evp_key.c \
+ crypto/evp/evp_lib.c \
+ crypto/evp/evp_pbe.c \
+ crypto/evp/evp_pkey.c \
+ crypto/evp/m_dss.c \
+ crypto/evp/m_dss1.c \
+ crypto/evp/m_ecdsa.c \
+ crypto/evp/m_md4.c \
+ crypto/evp/m_md5.c \
+ crypto/evp/m_mdc2.c \
+ crypto/evp/m_null.c \
+ crypto/evp/m_ripemd.c \
+ crypto/evp/m_sha1.c \
+ crypto/evp/m_sigver.c \
+ crypto/evp/m_wp.c \
+ crypto/evp/names.c \
+ crypto/evp/p5_crpt.c \
+ crypto/evp/p5_crpt2.c \
+ crypto/evp/p_dec.c \
+ crypto/evp/p_enc.c \
+ crypto/evp/p_lib.c \
+ crypto/evp/p_open.c \
+ crypto/evp/p_seal.c \
+ crypto/evp/p_sign.c \
+ crypto/evp/p_verify.c \
+ crypto/evp/pmeth_fn.c \
+ crypto/evp/pmeth_gn.c \
+ crypto/evp/pmeth_lib.c \
+ crypto/ex_data.c \
+ crypto/hmac/hm_ameth.c \
+ crypto/hmac/hm_pmeth.c \
+ crypto/hmac/hmac.c \
+ crypto/krb5/krb5_asn.c \
+ crypto/lhash/lh_stats.c \
+ crypto/lhash/lhash.c \
+ crypto/md4/md4_dgst.c \
+ crypto/md4/md4_one.c \
+ crypto/md5/md5_dgst.c \
+ crypto/md5/md5_one.c \
+ crypto/mem.c \
+ crypto/mem_clr.c \
+ crypto/mem_dbg.c \
+ crypto/modes/cbc128.c \
+ crypto/modes/ccm128.c \
+ crypto/modes/cfb128.c \
+ crypto/modes/ctr128.c \
+ crypto/modes/gcm128.c \
+ crypto/modes/ofb128.c \
+ crypto/modes/xts128.c \
+ crypto/o_dir.c \
+ crypto/o_init.c \
+ crypto/o_str.c \
+ crypto/o_time.c \
+ crypto/objects/o_names.c \
+ crypto/objects/obj_dat.c \
+ crypto/objects/obj_err.c \
+ crypto/objects/obj_lib.c \
+ crypto/objects/obj_xref.c \
+ crypto/ocsp/ocsp_asn.c \
+ crypto/ocsp/ocsp_cl.c \
+ crypto/ocsp/ocsp_err.c \
+ crypto/ocsp/ocsp_ext.c \
+ crypto/ocsp/ocsp_ht.c \
+ crypto/ocsp/ocsp_lib.c \
+ crypto/ocsp/ocsp_prn.c \
+ crypto/ocsp/ocsp_srv.c \
+ crypto/ocsp/ocsp_vfy.c \
+ crypto/pem/pem_all.c \
+ crypto/pem/pem_err.c \
+ crypto/pem/pem_info.c \
+ crypto/pem/pem_lib.c \
+ crypto/pem/pem_oth.c \
+ crypto/pem/pem_pk8.c \
+ crypto/pem/pem_pkey.c \
+ crypto/pem/pem_seal.c \
+ crypto/pem/pem_sign.c \
+ crypto/pem/pem_x509.c \
+ crypto/pem/pem_xaux.c \
+ crypto/pem/pvkfmt.c \
+ crypto/pkcs12/p12_add.c \
+ crypto/pkcs12/p12_asn.c \
+ crypto/pkcs12/p12_attr.c \
+ crypto/pkcs12/p12_crpt.c \
+ crypto/pkcs12/p12_crt.c \
+ crypto/pkcs12/p12_decr.c \
+ crypto/pkcs12/p12_init.c \
+ crypto/pkcs12/p12_key.c \
+ crypto/pkcs12/p12_kiss.c \
+ crypto/pkcs12/p12_mutl.c \
+ crypto/pkcs12/p12_npas.c \
+ crypto/pkcs12/p12_p8d.c \
+ crypto/pkcs12/p12_p8e.c \
+ crypto/pkcs12/p12_utl.c \
+ crypto/pkcs12/pk12err.c \
+ crypto/pkcs7/pk7_asn1.c \
+ crypto/pkcs7/pk7_attr.c \
+ crypto/pkcs7/pk7_doit.c \
+ crypto/pkcs7/pk7_lib.c \
+ crypto/pkcs7/pk7_mime.c \
+ crypto/pkcs7/pk7_smime.c \
+ crypto/pkcs7/pkcs7err.c \
+ crypto/pqueue/pqueue.c \
+ crypto/rand/md_rand.c \
+ crypto/rand/rand_egd.c \
+ crypto/rand/rand_err.c \
+ crypto/rand/rand_lib.c \
+ crypto/rand/rand_unix.c \
+ crypto/rand/rand_win.c \
+ crypto/rand/randfile.c \
+ crypto/rc2/rc2_cbc.c \
+ crypto/rc2/rc2_ecb.c \
+ crypto/rc2/rc2_skey.c \
+ crypto/rc2/rc2cfb64.c \
+ crypto/rc2/rc2ofb64.c \
+ crypto/rc4/rc4_enc.c \
+ crypto/rc4/rc4_skey.c \
+ crypto/rc4/rc4_utl.c \
+ crypto/ripemd/rmd_dgst.c \
+ crypto/ripemd/rmd_one.c \
+ crypto/rsa/rsa_ameth.c \
+ crypto/rsa/rsa_asn1.c \
+ crypto/rsa/rsa_chk.c \
+ crypto/rsa/rsa_crpt.c \
+ crypto/rsa/rsa_eay.c \
+ crypto/rsa/rsa_err.c \
+ crypto/rsa/rsa_gen.c \
+ crypto/rsa/rsa_lib.c \
+ crypto/rsa/rsa_none.c \
+ crypto/rsa/rsa_null.c \
+ crypto/rsa/rsa_oaep.c \
+ crypto/rsa/rsa_pk1.c \
+ crypto/rsa/rsa_pmeth.c \
+ crypto/rsa/rsa_prn.c \
+ crypto/rsa/rsa_pss.c \
+ crypto/rsa/rsa_saos.c \
+ crypto/rsa/rsa_sign.c \
+ crypto/rsa/rsa_ssl.c \
+ crypto/rsa/rsa_x931.c \
+ crypto/sha/sha1_one.c \
+ crypto/sha/sha1dgst.c \
+ crypto/sha/sha256.c \
+ crypto/sha/sha512.c \
+ crypto/sha/sha_dgst.c \
+ crypto/srp/srp_lib.c \
+ crypto/srp/srp_vfy.c \
+ crypto/stack/stack.c \
+ crypto/ts/ts_err.c \
+ crypto/txt_db/txt_db.c \
+ crypto/ui/ui_compat.c \
+ crypto/ui/ui_err.c \
+ crypto/ui/ui_lib.c \
+ crypto/ui/ui_openssl.c \
+ crypto/ui/ui_util.c \
+ crypto/uid.c \
+ crypto/x509/by_dir.c \
+ crypto/x509/by_file.c \
+ crypto/x509/x509_att.c \
+ crypto/x509/x509_cmp.c \
+ crypto/x509/x509_d2.c \
+ crypto/x509/x509_def.c \
+ crypto/x509/x509_err.c \
+ crypto/x509/x509_ext.c \
+ crypto/x509/x509_lu.c \
+ crypto/x509/x509_obj.c \
+ crypto/x509/x509_r2x.c \
+ crypto/x509/x509_req.c \
+ crypto/x509/x509_set.c \
+ crypto/x509/x509_trs.c \
+ crypto/x509/x509_txt.c \
+ crypto/x509/x509_v3.c \
+ crypto/x509/x509_vfy.c \
+ crypto/x509/x509_vpm.c \
+ crypto/x509/x509cset.c \
+ crypto/x509/x509name.c \
+ crypto/x509/x509rset.c \
+ crypto/x509/x509spki.c \
+ crypto/x509/x509type.c \
+ crypto/x509/x_all.c \
+ crypto/x509v3/pcy_cache.c \
+ crypto/x509v3/pcy_data.c \
+ crypto/x509v3/pcy_lib.c \
+ crypto/x509v3/pcy_map.c \
+ crypto/x509v3/pcy_node.c \
+ crypto/x509v3/pcy_tree.c \
+ crypto/x509v3/v3_akey.c \
+ crypto/x509v3/v3_akeya.c \
+ crypto/x509v3/v3_alt.c \
+ crypto/x509v3/v3_bcons.c \
+ crypto/x509v3/v3_bitst.c \
+ crypto/x509v3/v3_conf.c \
+ crypto/x509v3/v3_cpols.c \
+ crypto/x509v3/v3_crld.c \
+ crypto/x509v3/v3_enum.c \
+ crypto/x509v3/v3_extku.c \
+ crypto/x509v3/v3_genn.c \
+ crypto/x509v3/v3_ia5.c \
+ crypto/x509v3/v3_info.c \
+ crypto/x509v3/v3_int.c \
+ crypto/x509v3/v3_lib.c \
+ crypto/x509v3/v3_ncons.c \
+ crypto/x509v3/v3_ocsp.c \
+ crypto/x509v3/v3_pci.c \
+ crypto/x509v3/v3_pcia.c \
+ crypto/x509v3/v3_pcons.c \
+ crypto/x509v3/v3_pku.c \
+ crypto/x509v3/v3_pmaps.c \
+ crypto/x509v3/v3_prn.c \
+ crypto/x509v3/v3_purp.c \
+ crypto/x509v3/v3_skey.c \
+ crypto/x509v3/v3_sxnet.c \
+ crypto/x509v3/v3_utl.c \
+ crypto/x509v3/v3err.c \
+
+common_c_includes := \
+ . \
+ crypto \
+ crypto/asn1 \
+ crypto/evp \
+ crypto/modes \
+ include \
+ include/openssl \
+
+arm_c_flags := \
+ -DAES_ASM \
+ -DGHASH_ASM \
+ -DOPENSSL_BN_ASM_GF2m \
+ -DOPENSSL_BN_ASM_MONT \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+ -DSHA512_ASM \
+
+arm_src_files := \
+ crypto/aes/asm/aes-armv4.S \
+ crypto/bn/asm/armv4-gf2m.S \
+ crypto/bn/asm/armv4-mont.S \
+ crypto/modes/asm/ghash-armv4.S \
+ crypto/sha/asm/sha1-armv4-large.S \
+ crypto/sha/asm/sha256-armv4.S \
+ crypto/sha/asm/sha512-armv4.S \
+
+arm_exclude_files := \
+ crypto/aes/aes_core.c \
+
+x86_c_flags := \
+ -DAES_ASM \
+ -DDES_PTR \
+ -DDES_RISC1 \
+ -DDES_UNROLL \
+ -DGHASH_ASM \
+ -DMD5_ASM \
+ -DOPENSSL_BN_ASM_GF2m \
+ -DOPENSSL_BN_ASM_MONT \
+ -DOPENSSL_BN_ASM_PART_WORDS \
+ -DOPENSSL_CPUID_OBJ \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+ -DSHA512_ASM \
+
+x86_src_files := \
+ crypto/aes/asm/aes-586.S \
+ crypto/aes/asm/aesni-x86.S \
+ crypto/aes/asm/vpaes-x86.S \
+ crypto/bf/asm/bf-586.S \
+ crypto/bn/asm/bn-586.S \
+ crypto/bn/asm/co-586.S \
+ crypto/bn/asm/x86-gf2m.S \
+ crypto/bn/asm/x86-mont.S \
+ crypto/des/asm/crypt586.S \
+ crypto/des/asm/des-586.S \
+ crypto/md5/asm/md5-586.S \
+ crypto/modes/asm/ghash-x86.S \
+ crypto/sha/asm/sha1-586.S \
+ crypto/sha/asm/sha256-586.S \
+ crypto/sha/asm/sha512-586.S \
+ crypto/x86cpuid.S \
+
+x86_exclude_files := \
+ crypto/aes/aes_cbc.c \
+ crypto/aes/aes_core.c \
+ crypto/bf/bf_enc.c \
+ crypto/bn/bn_asm.c \
+ crypto/des/des_enc.c \
+ crypto/des/fcrypt_b.c \
+ crypto/mem_clr.c \
+
+x86_64_c_flags := \
+ -DAES_ASM \
+ -DDES_PTR \
+ -DDES_RISC1 \
+ -DDES_UNROLL \
+ -DGHASH_ASM \
+ -DMD5_ASM \
+ -DOPENSSL_BN_ASM_GF2m \
+ -DOPENSSL_BN_ASM_MONT \
+ -DOPENSSL_CPUID_OBJ \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+ -DSHA512_ASM \
+
+x86_64_src_files := \
+ crypto/aes/asm/aes-x86_64.S \
+ crypto/aes/asm/aesni-sha1-x86_64.S \
+ crypto/aes/asm/aesni-x86_64.S \
+ crypto/aes/asm/bsaes-x86_64.S \
+ crypto/aes/asm/vpaes-x86_64.S \
+ crypto/bn/asm/modexp512-x86_64.S \
+ crypto/bn/asm/x86_64-gcc.c \
+ crypto/bn/asm/x86_64-gf2m.S \
+ crypto/bn/asm/x86_64-mont.S \
+ crypto/bn/asm/x86_64-mont5.S \
+ crypto/md5/asm/md5-x86_64.S \
+ crypto/modes/asm/ghash-x86_64.S \
+ crypto/rc4/asm/rc4-md5-x86_64.S \
+ crypto/rc4/asm/rc4-x86_64.S \
+ crypto/sha/asm/sha1-x86_64.S \
+ crypto/sha/asm/sha256-x86_64.S \
+ crypto/sha/asm/sha512-x86_64.S \
+ crypto/x86_64cpuid.S \
+
+x86_64_exclude_files := \
+ crypto/aes/aes_cbc.c \
+ crypto/aes/aes_core.c \
+ crypto/mem_clr.c \
+ crypto/rc4/rc4_enc.c \
+
+mips_c_flags := \
+ -DAES_ASM \
+ -DOPENSSL_BN_ASM_MONT \
+ -DSHA1_ASM \
+ -DSHA256_ASM \
+
+mips_src_files := \
+ crypto/aes/asm/aes-mips.S \
+ crypto/bn/asm/bn-mips.S \
+ crypto/bn/asm/mips-mont.S \
+ crypto/sha/asm/sha1-mips.S \
+ crypto/sha/asm/sha256-mips.S \
+
+mips_exclude_files := \
+ crypto/aes/aes_core.c \
+ crypto/bn/bn_asm.c \
+
+target_arch := $(TARGET_ARCH)
+ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
+target_arch := unknown_arch
+endif
+
+target_c_flags := $(common_c_flags) $($(target_arch)_c_flags) $(local_c_flags)
+target_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+target_src_files := $(common_src_files) $($(target_arch)_src_files)
+target_src_files := $(filter-out $($(target_arch)_exclude_files), $(target_src_files))
+
+ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
+host_arch := x86
+else
+host_arch := unknown_arch
+endif
+
+host_c_flags := $(common_c_flags) $($(host_arch)_c_flags) $(local_c_flags)
+host_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+host_src_files := $(common_src_files) $($(host_arch)_src_files)
+host_src_files := $(filter-out $($(host_arch)_exclude_files), $(host_src_files))
+
+local_additional_dependencies += $(LOCAL_PATH)/Crypto-config.mk
+
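The arch-specific blocks above implement a substitution scheme: each architecture contributes optimized assembly via *_src_files and lists the generic C files it replaces in *_exclude_files, which filter-out then strips from the source list. A minimal sketch of the same pattern for a hypothetical architecture:

```
# Sketch: swapping a generic C implementation for an arch-specific .S file
newarch_src_files := crypto/aes/asm/aes-newarch.S  # hypothetical assembly source
newarch_exclude_files := crypto/aes/aes_core.c     # generic file it replaces

target_src_files := $(common_src_files) $(newarch_src_files)
target_src_files := $(filter-out $(newarch_exclude_files), $(target_src_files))
```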
diff --git a/Crypto.mk b/Crypto.mk
new file mode 100644
index 0000000..d2be80e
--- /dev/null
+++ b/Crypto.mk
@@ -0,0 +1,92 @@
+local_c_flags :=
+
+local_c_includes := $(log_c_includes) \
+ $(APP_PROJECT_PATH) \
+ $(NDK_PROJECT_PATH)/crypto \
+ $(NDK_PROJECT_PATH)/crypto/asn1 \
+ $(NDK_PROJECT_PATH)/crypto/evp \
+ $(NDK_PROJECT_PATH)/crypto/modes \
+ $(NDK_PROJECT_PATH)/include \
+ $(NDK_PROJECT_PATH)/include/openssl
+
+local_additional_dependencies := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Crypto.mk
+
+include $(LOCAL_PATH)/Crypto-config.mk
+
+#######################################
+# target static library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto_static
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_STATIC_LIBRARY)
+
+#######################################
+# target shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+LOCAL_LDFLAGS += -ldl
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_SHARED_LIBRARY)
+
+#######################################
+# host shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+LOCAL_SRC_FILES += $(host_src_files)
+LOCAL_CFLAGS += $(host_c_flags) -DPURIFY
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_LDLIBS += -ldl
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto-host
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+#include $(BUILD_HOST_SHARED_LIBRARY)
+
+########################################
+# host static library, which is used by some SDK tools.
+
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SHARED_LIBRARIES := $(log_shared_libraries)
+LOCAL_SRC_FILES += $(host_src_files)
+LOCAL_CFLAGS += $(host_c_flags) -DPURIFY
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_LDLIBS += -ldl
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libcrypto_static
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+#include $(BUILD_HOST_STATIC_LIBRARY)
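A module elsewhere in the tree would consume the libraries built here in the usual way; a hedged sketch of such a consumer (module and source names are illustrative):

```
# Sketch: hypothetical native module linking against libcrypto built above
include $(CLEAR_VARS)
LOCAL_MODULE := myapp-native                 # hypothetical
LOCAL_SRC_FILES := myapp.c                   # hypothetical
LOCAL_C_INCLUDES := external/openssl/include
LOCAL_SHARED_LIBRARIES := libcrypto
include $(BUILD_SHARED_LIBRARY)
```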
diff --git a/README.android b/README.android
index e93fb00..a7c3cc1 100644
--- a/README.android
+++ b/README.android
@@ -23,7 +23,6 @@ The following steps are recommended for porting new OpenSSL versions.
2) Update the variables in openssl.config and openssl.version as appropriate.
At the very least you will need to update the openssl.version.
- Similarly update ThirdPartyProject.prop.
3) Run:
@@ -54,17 +53,23 @@ The following steps are recommended for porting new OpenSSL versions.
# Build and sync libcore tests
(croot && cd libcore && mm -j16 snod && adb remount && adb sync)
# Run tests from libcore
- (croot && vogar --classpath out/target/common/obj/JAVA_LIBRARIES/core-tests-support_intermediates/classes.jar --classpath out/target/common/obj/JAVA_LIBRARIES/core-tests_intermediates/classes.jar javax.net.ssl tests.api.javax.net)
+ (croot && vogar --classpath out/target/common/obj/JAVA_LIBRARIES/core-tests_intermediates/classes.jar javax.net.ssl tests.api.javax.net)
# Run tests from Harmony
- (croot && vogar --classpath harmony_tests.jar tests.api.java.math.BigIntegerTest org.apache.harmony.tests.java.math)
+ (croot && vogar --classpath out/target/common/obj/JAVA_LIBRARIES/apache-harmony-tests_intermediates/classes.jar tests.api.java.math.BigIntegerTest org.apache.harmony.tests.java.math)
# try an https website
adb shell am start https://online.citibank.com # confirm result in browser
The vogar tool can be found externally at http://code.google.com/p/vogar/
- Within Google it can be run with ~dalvik-prebuild/vogar/bin/vogar
- harmony_tests.jar is built from Subversion http://harmony.apache.org/
- Within Google it can be found at ~dalvik-prebuild/bin/harmony_tests.jar
+ Quick installation instructions (without rebuilding from source):
+ VOGAR=$HOME/vogar
+ svn co http://vogar.googlecode.com/svn/trunk/ $VOGAR
+ mkdir -p $VOGAR/build/
+ curl -o $VOGAR/build/vogar.jar https://vogar.googlecode.com/files/vogar.jar
+ PATH=$PATH:$VOGAR/bin
+
+ Within Google, you can find it under:
+ /home/dalvik-prebuild/vogar/bin/vogar
# You can also run openssl s_server as a test server on the device:
adb push ./android.testssl/CAss.cnf /sdcard/CAss.cnf
@@ -76,17 +81,7 @@ The following steps are recommended for porting new OpenSSL versions.
m -j16
-Optionally, check whether build flags (located in android-config.mk
-need to be updated. Doing this step will help ensure that the
-compiled library is appropriately optimized for speed and size. To
-update build flags:
-
-a) source openssl.config
-b) tar -zxf openssl-*.tar.gz
-c) cd openssl-*/
-d) ./Configure $CONFIGURE_ARGS
-e) examine Makefile and compare with ../android-config.mk
-f) modify ../openssl.config as appropriate and go to step 3) above.
-
-Alternatively, ."/import_openssl.sh import" now prints the
-post-Configure Makefile for review before deleting in on import.
+Optionally, check whether build flags (located in CONFIGURE_ARGS in
+openssl.config, plus some extras in android-config.mk) need to be updated.
+Doing this step will help ensure that the compiled library is appropriately
+optimized for speed and size.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4a22d32
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+These are the official Android OpenSSL sources, with the makefiles lightly patched to build as an external project with the Android NDK. Similar to https://github.com/eighthave/openssl-android but updated for Android (OpenSSL 1.0.1e) and NDK r8d.
+
+To build, run:
+```
+ndk-build -j $(getconf _NPROCESSORS_ONLN)
+```
+
+* Upstream: https://android.googlesource.com/platform/external/openssl
+  commit ID af3454c5e67108665d5f26a90aae1831e26d38b7 on the master branch
+
+* Makefile patches inspired by: https://github.com/eighthave/openssl-android
+
+
+To see the patches, run:
+
+```
+git remote add upstream https://android.googlesource.com/platform/external/openssl
+git fetch upstream
+git diff upstream/master
+```
+
+
+# Background
+Although Android ships with OpenSSL, it understandably doesn't provide libcrypto and libssl as part of its public API. Developers porting apps that use OpenSSL to Android therefore need a project like this.
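If you need to control which ABIs ndk-build produces, the standard NDK mechanism is an Application.mk alongside the project makefiles; a minimal sketch (the ABI list is illustrative; android-9 matches the LOCAL_SDK_VERSION := 9 used by the project makefiles):

```
# jni/Application.mk (sketch)
APP_ABI := armeabi-v7a x86
APP_PLATFORM := android-9
```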
diff --git a/Ssl-config.mk b/Ssl-config.mk
new file mode 100644
index 0000000..e14e4bb
--- /dev/null
+++ b/Ssl-config.mk
@@ -0,0 +1,128 @@
+# Auto-generated - DO NOT EDIT!
+# To regenerate, edit openssl.config, then run:
+# ./import_openssl.sh import /path/to/openssl-1.0.1e.tar.gz
+#
+# Before including this file, the local Android.mk must define the following
+# variables:
+#
+# local_c_flags
+# local_c_includes
+# local_additional_dependencies
+#
+# This script will define the following variables:
+#
+# target_c_flags
+# target_c_includes
+# target_src_files
+#
+# host_c_flags
+# host_c_includes
+# host_src_files
+#
+
+# Ensure these are empty.
+unknown_arch_c_flags :=
+unknown_arch_src_files :=
+unknown_arch_exclude_files :=
+
+
+common_c_flags :=
+
+common_src_files := \
+ ssl/bio_ssl.c \
+ ssl/d1_both.c \
+ ssl/d1_enc.c \
+ ssl/d1_lib.c \
+ ssl/d1_pkt.c \
+ ssl/d1_srtp.c \
+ ssl/kssl.c \
+ ssl/s23_clnt.c \
+ ssl/s23_lib.c \
+ ssl/s23_meth.c \
+ ssl/s23_pkt.c \
+ ssl/s23_srvr.c \
+ ssl/s2_clnt.c \
+ ssl/s2_enc.c \
+ ssl/s2_lib.c \
+ ssl/s2_meth.c \
+ ssl/s2_pkt.c \
+ ssl/s2_srvr.c \
+ ssl/s3_both.c \
+ ssl/s3_cbc.c \
+ ssl/s3_clnt.c \
+ ssl/s3_enc.c \
+ ssl/s3_lib.c \
+ ssl/s3_meth.c \
+ ssl/s3_pkt.c \
+ ssl/s3_srvr.c \
+ ssl/ssl_algs.c \
+ ssl/ssl_asn1.c \
+ ssl/ssl_cert.c \
+ ssl/ssl_ciph.c \
+ ssl/ssl_err.c \
+ ssl/ssl_err2.c \
+ ssl/ssl_lib.c \
+ ssl/ssl_rsa.c \
+ ssl/ssl_sess.c \
+ ssl/ssl_stat.c \
+ ssl/ssl_txt.c \
+ ssl/t1_clnt.c \
+ ssl/t1_enc.c \
+ ssl/t1_lib.c \
+ ssl/t1_meth.c \
+ ssl/t1_reneg.c \
+ ssl/t1_srvr.c \
+ ssl/tls_srp.c \
+
+common_c_includes := \
+ . \
+ crypto \
+ include \
+
+arm_c_flags :=
+
+arm_src_files :=
+
+arm_exclude_files :=
+
+x86_c_flags :=
+
+x86_src_files :=
+
+x86_exclude_files :=
+
+x86_64_c_flags :=
+
+x86_64_src_files :=
+
+x86_64_exclude_files :=
+
+mips_c_flags :=
+
+mips_src_files :=
+
+mips_exclude_files :=
+
+target_arch := $(TARGET_ARCH)
+ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
+target_arch := unknown_arch
+endif
+
+target_c_flags := $(common_c_flags) $($(target_arch)_c_flags) $(local_c_flags)
+target_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+target_src_files := $(common_src_files) $($(target_arch)_src_files)
+target_src_files := $(filter-out $($(target_arch)_exclude_files), $(target_src_files))
+
+ifeq ($(HOST_OS)-$(HOST_ARCH),linux-x86)
+host_arch := x86
+else
+host_arch := unknown_arch
+endif
+
+host_c_flags := $(common_c_flags) $($(host_arch)_c_flags) $(local_c_flags)
+host_c_includes := $(addprefix external/openssl/,$(common_c_includes)) $(local_c_includes)
+host_src_files := $(common_src_files) $($(host_arch)_src_files)
+host_src_files := $(filter-out $($(host_arch)_exclude_files), $(host_src_files))
+
+local_additional_dependencies += $(LOCAL_PATH)/Ssl-config.mk
+
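Worth noting about the guard above: because the unknown_arch_* variables are defined empty at the top of the file, demoting target_arch to unknown_arch quietly yields a portable pure-C build with no arch-specific sources or excludes. The pattern in isolation, as a sketch:

```
# Sketch: unsupported combinations fall back to empty arch variables
target_arch := $(TARGET_ARCH)
ifeq ($(target_arch)-$(TARGET_HAS_BIGENDIAN),mips-true)
target_arch := unknown_arch  # unknown_arch_* are all empty => pure-C build
endif
```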
diff --git a/Ssl.mk b/Ssl.mk
new file mode 100644
index 0000000..546fc08
--- /dev/null
+++ b/Ssl.mk
@@ -0,0 +1,84 @@
+local_c_flags :=
+
+local_c_includes := $(log_c_includes) \
+ $(NDK_PROJECT_PATH) \
+ $(NDK_PROJECT_PATH)/crypto \
+ $(NDK_PROJECT_PATH)/crypto/asn1 \
+ $(NDK_PROJECT_PATH)/crypto/evp \
+ $(NDK_PROJECT_PATH)/include \
+ $(NDK_PROJECT_PATH)/include/openssl
+
+local_additional_dependencies := $(LOCAL_PATH)/android-config.mk $(LOCAL_PATH)/Ssl.mk
+
+include $(LOCAL_PATH)/Ssl-config.mk
+
+#######################################
+# target static library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_SHARED_LIBRARIES = $(log_shared_libraries)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libssl_static
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_STATIC_LIBRARY)
+
+#######################################
+# target shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+
+# If we're building an unbundled build, don't try to use clang since it's not
+# in the NDK yet. This can be removed once a clang version that is fast enough
+# is available in the NDK.
+ifeq (,$(TARGET_BUILD_APPS))
+LOCAL_CLANG := true
+else
+LOCAL_SDK_VERSION := 9
+endif
+
+LOCAL_SRC_FILES += $(target_src_files)
+LOCAL_CFLAGS += $(target_c_flags)
+LOCAL_C_INCLUDES += $(target_c_includes)
+LOCAL_SHARED_LIBRARIES += libcrypto $(log_shared_libraries)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libssl
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_SHARED_LIBRARY)
+
+#######################################
+# host shared library
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SRC_FILES += $(host_src_files)
+LOCAL_CFLAGS += $(host_c_flags)
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_SHARED_LIBRARIES += libcrypto-host $(log_shared_libraries)
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE:= libssl-host
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+#include $(BUILD_HOST_SHARED_LIBRARY)
+
+#######################################
+# ssltest
+include $(CLEAR_VARS)
+include $(LOCAL_PATH)/android-config.mk
+LOCAL_SRC_FILES:= ssl/ssltest.c
+LOCAL_C_INCLUDES += $(host_c_includes)
+LOCAL_SHARED_LIBRARIES := libssl libcrypto $(log_shared_libraries)
+LOCAL_MODULE:= ssltest
+LOCAL_MODULE_TAGS := optional
+LOCAL_ADDITIONAL_DEPENDENCIES := $(local_additional_dependencies)
+include $(BUILD_EXECUTABLE)
diff --git a/ThirdPartyProject.prop b/ThirdPartyProject.prop
deleted file mode 100644
index 34ad609..0000000
--- a/ThirdPartyProject.prop
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright 2010 Google Inc. All Rights Reserved.
-#Fri Jul 16 10:03:09 PDT 2010
-currentVersion=1.0.0e
-version=1.0.0e
-isNative=true
-feedurl=http\://www.openssl.org/
-name=openssl
-keywords=openssl
-onDevice=true
-homepage=http\://www.openssl.org/
diff --git a/android-config.mk b/android-config.mk
index d76d6e3..d1bda99 100644
--- a/android-config.mk
+++ b/android-config.mk
@@ -1,17 +1,30 @@
#
-# These flags represent the build-time configuration of openssl for android
+# These flags represent the build-time configuration of OpenSSL for Android
#
-# They were pruned from the "Makefile" generated by running ./Configure from import_openssl.sh
+# The value of $(openssl_cflags) was pruned from the Makefile generated
+# by running ./Configure from import_openssl.sh.
#
+# This makefile performs minor but required patching for the Android build.
+#
+
+LOCAL_CFLAGS += $(openssl_cflags)
-# From CLFAG=
-LOCAL_CFLAGS += -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -DL_ENDIAN #-DTERMIO
+LOCAL_CFLAGS := $(filter-out -DTERMIO, $(LOCAL_CFLAGS))
-# From DEPFLAG=
-LOCAL_CFLAGS += -DOPENSSL_NO_CAMELLIA -DOPENSSL_NO_CAPIENG -DOPENSSL_NO_CAST -DOPENSSL_NO_CMS -DOPENSSL_NO_GMP -DOPENSSL_NO_IDEA -DOPENSSL_NO_JPAKE -DOPENSSL_NO_MD2 -DOPENSSL_NO_MDC2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_SHA0 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SEED -DOPENSSL_NO_STORE -DOPENSSL_NO_WHIRLPOOL
+ifeq ($(HOST_OS),windows)
+LOCAL_CFLAGS := $(filter-out -DDSO_DLFCN -DHAVE_DLFCN_H,$(LOCAL_CFLAGS))
+endif
-# Extra
-LOCAL_CFLAGS += -DOPENSSL_NO_HW -DOPENSSL_NO_ENGINE -DZLIB
+# Directories
+LOCAL_CFLAGS += \
+ -DOPENSSLDIR="\"/system/lib/ssl\"" \
+ -DENGINESDIR="\"/system/lib/ssl/engines\""
+
+# Intentionally excluded http://b/7079965
+LOCAL_CFLAGS := $(filter-out -DZLIB, $(LOCAL_CFLAGS))
# Debug
# LOCAL_CFLAGS += -DCIPHER_DEBUG
+
+# Add clang here when it works on host
+# LOCAL_CLANG := true
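The new $(openssl_cflags) consumed above is not defined in this file; per the top-level Android.mk it presumably comes from the generated build-config.mk, derived from CONFIGURE_ARGS in openssl.config. A hedged sketch of what such a definition might look like, using a subset of the flags formerly hard-coded here (the real generated file may differ):

```
# Sketch: build-config.mk presumably defines something along these lines
openssl_cflags := \
    -DOPENSSL_THREADS -D_REENTRANT -DDSO_DLFCN -DHAVE_DLFCN_H -DL_ENDIAN \
    -DOPENSSL_NO_CAMELLIA -DOPENSSL_NO_IDEA -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5
```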
diff --git a/android.testssl/CAss.cnf b/android.testssl/CAss.cnf
index 1173c08..77c01c3 100644
--- a/android.testssl/CAss.cnf
+++ b/android.testssl/CAss.cnf
@@ -7,7 +7,7 @@ RANDFILE = /sdcard/android.testssl/.rnd
####################################################################
[ req ]
-default_bits = 512
+default_bits = 2048
default_keyfile = keySS.pem
distinguished_name = req_distinguished_name
encrypt_rsa_key = no
diff --git a/android.testssl/Uss.cnf b/android.testssl/Uss.cnf
index 56dcdd5..317ab6d 100644
--- a/android.testssl/Uss.cnf
+++ b/android.testssl/Uss.cnf
@@ -7,11 +7,11 @@ RANDFILE = /sdcard/android.testssl/.rnd
####################################################################
[ req ]
-default_bits = 512
+default_bits = 2048
default_keyfile = keySS.pem
distinguished_name = req_distinguished_name
encrypt_rsa_key = no
-default_md = md2
+default_md = sha256
[ req_distinguished_name ]
countryName = Country Name (2 letter code)
diff --git a/android.testssl/server2.pem b/android.testssl/server2.pem
index 8bb6641..a3927cf 100644
--- a/android.testssl/server2.pem
+++ b/android.testssl/server2.pem
@@ -1,376 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (1024 bit)
------BEGIN CERTIFICATE-----
-MIICLjCCAZcCAQEwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU0WhcNOTgwNjA5
-MTM1NzU0WjBkMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxJDAiBgNVBAMTG1NlcnZlciB0ZXN0IGNl
-cnQgKDEwMjQgYml0KTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAsxH1PBPm
-RkxrR11eV4bzNi4N9n11CI8nV29+ARlT1+qDe/mjVUvXlmsr1v/vf71G9GgqopSa
-6RXrICLVdk/FYYYzhPvl1M+OrjaXDFO8BzBAF1Lnz6c7aRZvGRJNrRSr2nZEkqDf
-JW9dY7r2VZEpD5QeuaRYUnuECkqeieB65GMCAwEAATANBgkqhkiG9w0BAQQFAAOB
-gQCWsOta6C0wiVzXz8wPmJKyTrurMlgUss2iSuW9366iwofZddsNg7FXniMzkIf6
-dp7jnmWZwKZ9cXsNUS2o4OL07qOk2HOywC0YsNZQsOBu1CBTYYkIefDiKFL1zQHh
-8lwwNd4NP+OE3NzUNkCfh4DnFfg9WHkXUlD5UpxNRJ4gJA==
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQCzEfU8E+ZGTGtHXV5XhvM2Lg32fXUIjydXb34BGVPX6oN7+aNV
-S9eWayvW/+9/vUb0aCqilJrpFesgItV2T8VhhjOE++XUz46uNpcMU7wHMEAXUufP
-pztpFm8ZEk2tFKvadkSSoN8lb11juvZVkSkPlB65pFhSe4QKSp6J4HrkYwIDAQAB
-AoGBAKy8jvb0Lzby8q11yNLf7+78wCVdYi7ugMHcYA1JVFK8+zb1WfSm44FLQo/0
-dSChAjgz36TTexeLODPYxleJndjVcOMVzsLJjSM8dLpXsTS4FCeMbhw2s2u+xqKY
-bbPWfk+HOTyJjfnkcC5Nbg44eOmruq0gSmBeUXVM5UntlTnxAkEA7TGCA3h7kx5E
-Bl4zl2pc3gPAGt+dyfk5Po9mGJUUXhF5p2zueGmYWW74TmOWB1kzt4QRdYMzFePq
-zfDNXEa1CwJBAMFErdY0xp0UJ13WwBbUTk8rujqQdHtjw0klhpbuKkjxu2hN0wwM
-6p0D9qxF7JHaghqVRI0fAW/EE0OzdHMR9QkCQQDNR26dMFXKsoPu+vItljj/UEGf
-QG7gERiQ4yxaFBPHgdpGo0kT31eh9x9hQGDkxTe0GNG/YSgCRvm8+C3TMcKXAkBD
-dhGn36wkUFCddMSAM4NSJ1VN8/Z0y5HzCmI8dM3VwGtGMUQlxKxwOl30LEQzdS5M
-0SWojNYXiT2gOBfBwtbhAkEAhafl5QEOIgUz+XazS/IlZ8goNKdDVfYgK3mHHjvv
-nY5G+AuGebdNkXJr4KSWxDcN+C2i47zuj4QXA16MAOandA==
------END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert #2
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
+-----BEGIN CERTIFICATE-----
+MIID6jCCAtKgAwIBAgIJALnu1NlVpZ60MA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZzELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxHDAaBgNVBAMME1Rlc3QgU2VydmVyIENlcnQg
+IzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDrdi7j9yctG+L4EjBy
+gjPmEqZzOJEQba26MoQGzglU7e5Xf59Rb/hgVQuKAoiZe7/R8rK4zJ4W7iXdXw0L
+qBpyG8B5aGKeI32w+A9TcBApoXXL2CrYQEQjZwUIpLlYBIi2NkJj3nVkq5dgl1gO
+ALiQ+W8jg3kzg5Ec9rimp9r93N8wsSL3awsafurmYCvOf7leHaMP1WJ/zDRGUNHG
+/WtDjXc8ZUG1+6EXU9Jc2Fs+2Omf7fcN0l00AK/wPg8OaNS0rKyGq9JdIT9FRGV1
+bXe/rx58FaE5CItdwCSYhJvF/O95LWQoxJXye5bCFLmvDTEyVq9FMSCptfsmbXjE
+ZGsXAgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJ
+YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud
+DgQWBBR52UaWWTKzZGDH/X4mWNcuqeQVazAfBgNVHSMEGDAWgBQ2w2yI55X+sL3s
+zj49hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEANBW+XYLlHBqVY/31ie+3gRlS
+LPfy4SIqn0t3RJjagT29MXprblBO2cbMO8VGjkQdKGpmMXjxbht2arOOUXRHX4n/
+XTyn/QHEf0bcwIITMReO3DZUPAEw8hSjn9xEOM0IRVOCP+mH5fi74QzzQaZVCyYg
+5VtLKdww/+sc0nCbKl2KWgDluriH0nfVx95qgW3mg9dhXRr0zmf1w2zkBHYpARYL
+Dew6Z8EE4tS3HJu8/qM6meWzNtrfonQ3eiiMxjZBxzV46jchBwa2z9XYhP6AmpPb
+oeTSzcQNbWsxaGYzWo46oLDUZmJOwSBawbS31bZNMCoPIY6ukoesCzFSsUKZww==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
+MIIEowIBAAKCAQEA63Yu4/cnLRvi+BIwcoIz5hKmcziREG2tujKEBs4JVO3uV3+f
+UW/4YFULigKImXu/0fKyuMyeFu4l3V8NC6gachvAeWhiniN9sPgPU3AQKaF1y9gq
+2EBEI2cFCKS5WASItjZCY951ZKuXYJdYDgC4kPlvI4N5M4ORHPa4pqfa/dzfMLEi
+92sLGn7q5mArzn+5Xh2jD9Vif8w0RlDRxv1rQ413PGVBtfuhF1PSXNhbPtjpn+33
+DdJdNACv8D4PDmjUtKyshqvSXSE/RURldW13v68efBWhOQiLXcAkmISbxfzveS1k
+KMSV8nuWwhS5rw0xMlavRTEgqbX7Jm14xGRrFwIDAQABAoIBAHLsTPihIfLnYIE5
+x4GsQQ5zXeBw5ITDM37ktwHnQDC+rIzyUl1aLD1AZRBoKinXd4lOTqLZ4/NHKx4A
+DYr58mZtWyUmqLOMmQVuHXTZBlp7XtYuXMMNovQwjQlp9LicBeoBU6gQ5PVMtubD
+F4xGF89Sn0cTHW3iMkqTtQ5KcR1j57OcJO0FEb1vPvk2MXI5ZyAatUYE7YacbEzd
+rg02uIwx3FqNSkuSI79uz4hMdV5TPtuhxx9nTwj9aLUhXFeZ0mn2PVgVzEnnMoJb
++znlsZDgzDlJqdaD744YGWh8Z3OEssB35KfzFcdOeO6yH8lmv2Zfznk7pNPT7LTb
+Lae9VgkCgYEA92p1qnAB3NtJtNcaW53i0S5WJgS1hxWKvUDx3lTB9s8X9fHpqL1a
+E94fDfWzp/hax6FefUKIvBOukPLQ6bYjTMiFoOHzVirghAIuIUoMI5VtLhwD1hKs
+Lr7l/dptMgKb1nZHyXoKHRBthsy3K4+udsPi8TzMvYElgEqyQIe/Rk0CgYEA86GL
+8HC6zLszzKERDPBxrboRmoFvVUCTQDhsfj1M8aR3nQ8V5LkdIJc7Wqm/Ggfk9QRf
+rJ8M2WUMlU5CNnCn/KCrKzCNZIReze3fV+HnKdbcXGLvgbHPrhnz8yYehUFG+RGq
+bVyDWRU94T38izy2s5qMYrMJWZEYyXncSPbfcPMCgYAtaXfxcZ+V5xYPQFARMtiX
+5nZfggvDoJuXgx0h3tK/N2HBfcaSdzbaYLG4gTmZggc/jwnl2dl5E++9oSPhUdIG
+3ONSFUbxsOsGr9PBvnKd8WZZyUCXAVRjPBzAzF+whzQNWCZy/5htnz9LN7YDI9s0
+5113Q96cheDZPFydZY0hHQKBgQDVbEhNukM5xCiNcu+f2SaMnLp9EjQ4h5g3IvaP
+5B16daw/Dw8LzcohWboqIxeAsze0GD/D1ZUJAEd0qBjC3g+a9BjefervCjKOzXng
+38mEUm+6EwVjJSQcjSmycEs+Sr/kwr/8i5WYvU32+jk4tFgMoC+o6tQe/Uesf68k
+z/dPVwKBgGbF7Vv1/3SmhlOy+zYyvJ0CrWtKxH9QP6tLIEgEpd8x7YTSuCH94yok
+kToMXYA3sWNPt22GbRDZ+rcp4c7HkDx6I6vpdP9aQEwJTp0EPy0sgWr2XwYmreIQ
+NFmkk8Itn9EY2R9VBaP7GLv5kvwxDdLAnmwGmzVtbmaVdxCaBwUk
-----END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8=
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----
diff --git a/android.testssl/testssl b/android.testssl/testssl
index 3f24b1d..5ff4860 100755
--- a/android.testssl/testssl
+++ b/android.testssl/testssl
@@ -70,15 +70,6 @@ $ssltest -client_auth $CA $extra || exit 1
echo test sslv2/sslv3 with both client and server authentication
$ssltest -server_auth -client_auth $CA $extra || exit 1
-echo test sslv2/sslv3 with both client and server authentication and small client buffers
-$ssltest -server_auth -client_auth -c_small_records $CA $extra || exit 1
-
-echo test sslv2/sslv3 with both client and server authentication and small server buffers
-$ssltest -server_auth -client_auth -s_small_records $CA $extra || exit 1
-
-echo test sslv2/sslv3 with both client and server authentication and small client and server buffers
-$ssltest -server_auth -client_auth -c_small_records -s_small_records $CA $extra || exit 1
-
echo test sslv2/sslv3 with both client and server authentication and handshake cutthrough
$ssltest -server_auth -client_auth -cutthrough $CA $extra || exit 1
@@ -112,8 +103,8 @@ echo test sslv2/sslv3 via BIO pair
$ssltest $extra || exit 1
if [ $dsa_cert = NO ]; then
- echo test sslv2/sslv3 w/o DHE via BIO pair
- $ssltest -bio_pair -no_dhe $extra || exit 1
+ echo 'test sslv2/sslv3 w/o (EC)DHE via BIO pair'
+ $ssltest -bio_pair -no_dhe -no_ecdhe $extra || exit 1
fi
echo test sslv2/sslv3 with 1024bit DHE via BIO pair
@@ -131,6 +122,23 @@ $ssltest -bio_pair -server_auth -client_auth $CA $extra || exit 1
echo test sslv2/sslv3 with both client and server authentication via BIO pair and app verify
$ssltest -bio_pair -server_auth -client_auth -app_verify $CA $extra || exit 1
+echo "Testing ciphersuites"
+for protocol in TLSv1.2 SSLv3; do
+ echo "Testing ciphersuites for $protocol"
+ for cipher in `adb shell /system/bin/openssl ciphers "RSA+$protocol" | tr ':' ' '`; do
+ echo "Testing $cipher"
+ prot=""
+ if [ $protocol = "SSLv3" ] ; then
+ prot="-ssl3"
+ fi
+ $ssltest -cipher $cipher $prot
+ if [ $? -ne 0 ] ; then
+ echo "Failed $cipher"
+ exit 1
+ fi
+ done
+done
+
#############################################################################
if [ `adb shell /system/bin/openssl no-dh` = no-dh ]; then
@@ -143,8 +151,8 @@ fi
if [ `adb shell /system/bin/openssl no-rsa` = no-rsa ]; then
echo skipping RSA tests
else
- echo test tls1 with 1024bit RSA, no DHE, multiple handshakes
- adb shell /system/bin/ssltest -v -bio_pair -tls1 -cert /sdcard/android.testssl/server2.pem -no_dhe -num 10 -f -time $extra || exit 1
+ echo 'test tls1 with 1024bit RSA, no (EC)DHE, multiple handshakes'
+ adb shell /system/bin/ssltest -v -bio_pair -tls1 -cert /sdcard/android.testssl/server2.pem -no_dhe -no_ecdhe -num 10 -f -time $extra || exit 1
if [ `adb shell /system/bin/openssl no-dh` = no-dh ]; then
echo skipping RSA+DHE tests
@@ -160,4 +168,14 @@ $ssltest -tls1 -cipher PSK -psk abc123 $extra || exit 1
echo test tls1 with PSK via BIO pair
$ssltest -bio_pair -tls1 -cipher PSK -psk abc123 $extra || exit 1
+if [ `adb shell /system/bin/openssl no-srp` = no-srp ]; then
+ echo skipping SRP tests
+else
+ echo test tls1 with SRP
+ $ssltest -tls1 -cipher SRP -srpuser test -srppass abc123 || exit 1
+
+ echo test tls1 with SRP via BIO pair
+ $ssltest -bio_pair -tls1 -cipher SRP -srpuser test -srppass abc123 || exit 1
+fi
+
exit 0
diff --git a/ant.properties b/ant.properties
new file mode 100644
index 0000000..b0971e8
--- /dev/null
+++ b/ant.properties
@@ -0,0 +1,17 @@
+# This file is used to override default values used by the Ant build system.
+#
+# This file must be checked into Version Control Systems, as it is
+# integral to the build system of your project.
+
+# This file is only used by the Ant script.
+
+# You can use this to override default values such as
+# 'source.dir' for the location of your java source folder and
+# 'out.dir' for the location of your output folder.
+
+# You can also use it to define how the release builds are signed by declaring
+# the following properties:
+# 'key.store' for the location of your keystore and
+# 'key.alias' for the name of the key to use.
+# The password will be asked during the build when you use the 'release' target.
+
diff --git a/apps/Android.mk b/apps/Android.mk
deleted file mode 100644
index 20cc5a9..0000000
--- a/apps/Android.mk
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2006 The Android Open Source Project
-
-LOCAL_PATH:= $(call my-dir)
-
-local_src_files:= \
- app_rand.c \
- apps.c \
- asn1pars.c \
- ca.c \
- ciphers.c \
- crl.c \
- crl2p7.c \
- dgst.c \
- dh.c \
- dhparam.c \
- dsa.c \
- dsaparam.c \
- ecparam.c \
- ec.c \
- enc.c \
- engine.c \
- errstr.c \
- gendh.c \
- gendsa.c \
- genpkey.c \
- genrsa.c \
- nseq.c \
- ocsp.c \
- openssl.c \
- passwd.c \
- pkcs12.c \
- pkcs7.c \
- pkcs8.c \
- pkey.c \
- pkeyparam.c \
- pkeyutl.c \
- prime.c \
- rand.c \
- req.c \
- rsa.c \
- rsautl.c \
- s_cb.c \
- s_client.c \
- s_server.c \
- s_socket.c \
- s_time.c \
- sess_id.c \
- smime.c \
- speed.c \
- spkac.c \
- verify.c \
- version.c \
- x509.c
-
-local_shared_libraries := \
- libssl \
- libcrypto
-
-local_c_includes := \
- external/openssl \
- external/openssl/include
-
-local_cflags := -DMONOLITH
-
-# These flags omit whole features from the commandline "openssl".
-# However, portions of these features are actually turned on.
-local_cflags += -DOPENSSL_NO_DTLS1
-
-include $(CLEAR_VARS)
-LOCAL_MODULE:= openssl
-LOCAL_MODULE_TAGS := optional
-LOCAL_SRC_FILES := $(local_src_files)
-LOCAL_SHARED_LIBRARIES := $(local_shared_libraries)
-LOCAL_C_INCLUDES := $(local_c_includes)
-LOCAL_CFLAGS := $(local_cflags)
-include $(LOCAL_PATH)/../android-config.mk
-include $(BUILD_EXECUTABLE)
-
-include $(CLEAR_VARS)
-LOCAL_MODULE:= openssl
-LOCAL_MODULE_TAGS := optional
-LOCAL_SRC_FILES := $(local_src_files)
-LOCAL_SHARED_LIBRARIES := $(local_shared_libraries)
-LOCAL_C_INCLUDES := $(local_c_includes)
-LOCAL_CFLAGS := $(local_cflags)
-include $(LOCAL_PATH)/../android-config.mk
-include $(BUILD_HOST_EXECUTABLE)
diff --git a/apps/apps.c b/apps/apps.c
index 38e6197..1096eee 100644
--- a/apps/apps.c
+++ b/apps/apps.c
@@ -109,7 +109,7 @@
*
*/
-#ifndef _POSIX_C_SOURCE
+#if !defined(_POSIX_C_SOURCE) && defined(OPENSSL_SYS_VMS)
#define _POSIX_C_SOURCE 2 /* On VMS, you need to define this to get
the declaration of fileno(). The value
2 is to make sure no function defined
@@ -1215,7 +1215,8 @@ STACK_OF(X509) *load_certs(BIO *err, const char *file, int format,
const char *pass, ENGINE *e, const char *desc)
{
STACK_OF(X509) *certs;
- load_certs_crls(err, file, format, pass, e, desc, &certs, NULL);
+ if (!load_certs_crls(err, file, format, pass, e, desc, &certs, NULL))
+ return NULL;
return certs;
}
@@ -1223,7 +1224,8 @@ STACK_OF(X509_CRL) *load_crls(BIO *err, const char *file, int format,
const char *pass, ENGINE *e, const char *desc)
{
STACK_OF(X509_CRL) *crls;
- load_certs_crls(err, file, format, pass, e, desc, NULL, &crls);
+ if (!load_certs_crls(err, file, format, pass, e, desc, NULL, &crls))
+ return NULL;
return crls;
}
@@ -2130,7 +2132,7 @@ X509_NAME *parse_name(char *subject, long chtype, int multirdn)
X509_NAME *n = NULL;
int nid;
- if (!buf || !ne_types || !ne_values)
+ if (!buf || !ne_types || !ne_values || !mval)
{
BIO_printf(bio_err, "malloc error\n");
goto error;
@@ -2234,6 +2236,7 @@ X509_NAME *parse_name(char *subject, long chtype, int multirdn)
OPENSSL_free(ne_values);
OPENSSL_free(ne_types);
OPENSSL_free(buf);
+ OPENSSL_free(mval);
return n;
error:
@@ -2242,6 +2245,8 @@ X509_NAME *parse_name(char *subject, long chtype, int multirdn)
OPENSSL_free(ne_values);
if (ne_types)
OPENSSL_free(ne_types);
+ if (mval)
+ OPENSSL_free(mval);
if (buf)
OPENSSL_free(buf);
return NULL;
@@ -2256,6 +2261,7 @@ int args_verify(char ***pargs, int *pargc,
int purpose = 0, depth = -1;
char **oldargs = *pargs;
char *arg = **pargs, *argn = (*pargs)[1];
+ time_t at_time = 0;
if (!strcmp(arg, "-policy"))
{
if (!argn)
@@ -2308,6 +2314,27 @@ int args_verify(char ***pargs, int *pargc,
}
(*pargs)++;
}
+ else if (strcmp(arg,"-attime") == 0)
+ {
+ if (!argn)
+ *badarg = 1;
+ else
+ {
+ long timestamp;
+ /* interpret the -attime argument as seconds since
+ * Epoch */
+ if (sscanf(argn, "%li", &timestamp) != 1)
+ {
+ BIO_printf(bio_err,
+ "Error parsing timestamp %s\n",
+ argn);
+ *badarg = 1;
+ }
+ /* on some platforms time_t may be a float */
+ at_time = (time_t) timestamp;
+ }
+ (*pargs)++;
+ }
else if (!strcmp(arg, "-ignore_critical"))
flags |= X509_V_FLAG_IGNORE_CRITICAL;
else if (!strcmp(arg, "-issuer_checks"))
@@ -2362,6 +2389,9 @@ int args_verify(char ***pargs, int *pargc,
if (depth >= 0)
X509_VERIFY_PARAM_set_depth(*pm, depth);
+ if (at_time)
+ X509_VERIFY_PARAM_set_time(*pm, at_time);
+
end:
(*pargs)++;
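
For context (not part of the patch): -attime feeds X509_VERIFY_PARAM_set_time(), which pins certificate-validity checks to a fixed instant instead of the current clock. A minimal sketch of the same API, where the helper name verify_at is ours:

#include <time.h>
#include <openssl/x509.h>
#include <openssl/x509_vfy.h>

/* Verify cert against store as if the clock read 'when'. */
static int verify_at(X509_STORE *store, X509 *cert, time_t when)
	{
	int ok = 0;
	X509_STORE_CTX *ctx = X509_STORE_CTX_new();
	if (ctx == NULL)
		return 0;
	if (X509_STORE_CTX_init(ctx, store, cert, NULL))
		{
		/* the library-side equivalent of passing "-attime <when>" */
		X509_VERIFY_PARAM_set_time(X509_STORE_CTX_get0_param(ctx), when);
		ok = (X509_verify_cert(ctx) == 1);
		}
	X509_STORE_CTX_free(ctx);
	return ok;
	}
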
@@ -2693,6 +2723,50 @@ void jpake_server_auth(BIO *out, BIO *conn, const char *secret)
#endif
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+/* next_protos_parse parses a comma separated list of strings into a string
+ * in a format suitable for passing to SSL_CTX_set_next_protos_advertised.
+ * outlen: (output) set to the length of the resulting buffer on success.
+ * in: a NUL terminated string like "abc,def,ghi"
+ *
+ * returns: a malloced buffer or NULL on failure.
+ */
+unsigned char *next_protos_parse(unsigned short *outlen, const char *in)
+ {
+ size_t len;
+ unsigned char *out;
+ size_t i, start = 0;
+
+ len = strlen(in);
+ if (len >= 65535)
+ return NULL;
+
+ out = OPENSSL_malloc(strlen(in) + 1);
+ if (!out)
+ return NULL;
+
+ for (i = 0; i <= len; ++i)
+ {
+ if (i == len || in[i] == ',')
+ {
+ if (i - start > 255)
+ {
+ OPENSSL_free(out);
+ return NULL;
+ }
+ out[start] = i - start;
+ start = i + 1;
+ }
+ else
+ out[i+1] = in[i];
+ }
+
+ *outlen = len + 1;
+ return out;
+ }
+#endif /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
+
/*
* Platform-specific sections
*/
@@ -3018,46 +3092,3 @@ int raw_write_stdout(const void *buf,int siz)
int raw_write_stdout(const void *buf,int siz)
{ return write(fileno(stdout),buf,siz); }
#endif
-
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
-/* next_protos_parse parses a comma separated list of strings into a string
- * in a format suitable for passing to SSL_CTX_set_next_protos_advertised.
- * outlen: (output) set to the length of the resulting buffer on success.
- * in: a NUL terminated string like "abc,def,ghi"
- *
- * returns: a malloced buffer or NULL on failure.
- */
-unsigned char *next_protos_parse(unsigned short *outlen, const char *in)
- {
- size_t len;
- unsigned char *out;
- size_t i, start = 0;
-
- len = strlen(in);
- if (len >= 65535)
- return NULL;
-
- out = OPENSSL_malloc(strlen(in) + 1);
- if (!out)
- return NULL;
-
- for (i = 0; i <= len; ++i)
- {
- if (i == len || in[i] == ',')
- {
- if (i - start > 255)
- {
- OPENSSL_free(out);
- return NULL;
- }
- out[start] = i - start;
- start = i + 1;
- }
- else
- out[i+1] = in[i];
- }
-
- *outlen = len + 1;
- return out;
- }
-#endif /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
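
For reference (not part of the patch): the buffer next_protos_parse() builds is simply each protocol name prefixed by its one-byte length, concatenated. A short usage sketch with made-up protocol names:

unsigned short wirelen;
unsigned char *wire = next_protos_parse(&wirelen, "http/1.1,spdy/3");
/* wire now holds 0x08 "http/1.1" 0x06 "spdy/3" and wirelen == 16;
 * no single name may exceed 255 bytes, the whole list must stay under 64kB */
if (wire != NULL)
	OPENSSL_free(wire);
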
diff --git a/apps/apps.h b/apps/apps.h
index 42072ec..c1ca99d 100644
--- a/apps/apps.h
+++ b/apps/apps.h
@@ -317,6 +317,12 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in);
int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value);
int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx,
const char *algname, ENGINE *e, int do_param);
+int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts);
+int do_X509_REQ_sign(BIO *err, X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts);
+int do_X509_CRL_sign(BIO *err, X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts);
#ifndef OPENSSL_NO_PSK
extern char *psk_key;
#endif
@@ -325,6 +331,10 @@ void jpake_client_auth(BIO *out, BIO *conn, const char *secret);
void jpake_server_auth(BIO *out, BIO *conn, const char *secret);
#endif
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+unsigned char *next_protos_parse(unsigned short *outlen, const char *in);
+#endif /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
+
#define FORMAT_UNDEF 0
#define FORMAT_ASN1 1
#define FORMAT_TEXT 2
@@ -357,8 +367,7 @@ int raw_write_stdout(const void *,int);
#define TM_START 0
#define TM_STOP 1
double app_tminterval (int stop,int usertime);
-#endif
-#ifndef OPENSSL_NO_NEXTPROTONEG
-unsigned char *next_protos_parse(unsigned short *outlen, const char *in);
+#define OPENSSL_NO_SSL_INTERN
+
#endif
diff --git a/apps/ca.c b/apps/ca.c
index 6b8b0ef..1cf50e0 100644
--- a/apps/ca.c
+++ b/apps/ca.c
@@ -197,26 +197,30 @@ extern int EF_ALIGNMENT;
static void lookup_fail(const char *name, const char *tag);
static int certify(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
- const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,CA_DB *db,
+ const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+ STACK_OF(CONF_VALUE) *policy,CA_DB *db,
BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn, char *startdate,
char *enddate, long days, int batch, char *ext_sect, CONF *conf,
int verbose, unsigned long certopt, unsigned long nameopt,
int default_op, int ext_copy, int selfsign);
static int certify_cert(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
- const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,
+ const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+ STACK_OF(CONF_VALUE) *policy,
CA_DB *db, BIGNUM *serial, char *subj,unsigned long chtype, int multirdn, int email_dn,
char *startdate, char *enddate, long days, int batch,
char *ext_sect, CONF *conf,int verbose, unsigned long certopt,
unsigned long nameopt, int default_op, int ext_copy,
ENGINE *e);
static int certify_spkac(X509 **xret, char *infile,EVP_PKEY *pkey,X509 *x509,
- const EVP_MD *dgst,STACK_OF(CONF_VALUE) *policy,
+ const EVP_MD *dgst,STACK_OF(OPENSSL_STRING) *sigopts,
+ STACK_OF(CONF_VALUE) *policy,
CA_DB *db, BIGNUM *serial,char *subj,unsigned long chtype, int multirdn, int email_dn,
char *startdate, char *enddate, long days, char *ext_sect,
CONF *conf, int verbose, unsigned long certopt,
unsigned long nameopt, int default_op, int ext_copy);
static void write_new_certificate(BIO *bp, X509 *x, int output_der, int notext);
static int do_body(X509 **xret, EVP_PKEY *pkey, X509 *x509, const EVP_MD *dgst,
+ STACK_OF(OPENSSL_STRING) *sigopts,
STACK_OF(CONF_VALUE) *policy, CA_DB *db, BIGNUM *serial,char *subj,unsigned long chtype, int multirdn,
int email_dn, char *startdate, char *enddate, long days, int batch,
int verbose, X509_REQ *req, char *ext_sect, CONF *conf,
@@ -311,6 +315,7 @@ int MAIN(int argc, char **argv)
const EVP_MD *dgst=NULL;
STACK_OF(CONF_VALUE) *attribs=NULL;
STACK_OF(X509) *cert_sk=NULL;
+ STACK_OF(OPENSSL_STRING) *sigopts = NULL;
#undef BSIZE
#define BSIZE 256
MS_STATIC char buf[3][BSIZE];
@@ -435,6 +440,15 @@ EF_ALIGNMENT=0;
if (--argc < 1) goto bad;
outdir= *(++argv);
}
+ else if (strcmp(*argv,"-sigopt") == 0)
+ {
+ if (--argc < 1)
+ goto bad;
+ if (!sigopts)
+ sigopts = sk_OPENSSL_STRING_new_null();
+ if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv)))
+ goto bad;
+ }
else if (strcmp(*argv,"-notext") == 0)
notext=1;
else if (strcmp(*argv,"-batch") == 0)
@@ -1170,8 +1184,9 @@ EF_ALIGNMENT=0;
if (spkac_file != NULL)
{
total++;
- j=certify_spkac(&x,spkac_file,pkey,x509,dgst,attribs,db,
- serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,extensions,
+ j=certify_spkac(&x,spkac_file,pkey,x509,dgst,sigopts,
+ attribs,db, serial,subj,chtype,multirdn,
+ email_dn,startdate,enddate,days,extensions,
conf,verbose,certopt,nameopt,default_op,ext_copy);
if (j < 0) goto err;
if (j > 0)
@@ -1194,7 +1209,8 @@ EF_ALIGNMENT=0;
if (ss_cert_file != NULL)
{
total++;
- j=certify_cert(&x,ss_cert_file,pkey,x509,dgst,attribs,
+ j=certify_cert(&x,ss_cert_file,pkey,x509,dgst,sigopts,
+ attribs,
db,serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,batch,
extensions,conf,verbose, certopt, nameopt,
default_op, ext_copy, e);
@@ -1214,7 +1230,7 @@ EF_ALIGNMENT=0;
if (infile != NULL)
{
total++;
- j=certify(&x,infile,pkey,x509p,dgst,attribs,db,
+ j=certify(&x,infile,pkey,x509p,dgst,sigopts, attribs,db,
serial,subj,chtype,multirdn,email_dn,startdate,enddate,days,batch,
extensions,conf,verbose, certopt, nameopt,
default_op, ext_copy, selfsign);
@@ -1234,7 +1250,7 @@ EF_ALIGNMENT=0;
 for (i=0; i<sk_X509_num(cert_sk); i++)
diff --git a/apps/ciphers.c b/apps/ciphers.c
--- a/apps/ciphers.c
+++ b/apps/ciphers.c
- unsigned long id=c->id;
+ unsigned long id = SSL_CIPHER_get_id(c);
int id0 = (int)(id >> 24);
int id1 = (int)((id >> 16) & 0xffL);
int id2 = (int)((id >> 8) & 0xffL);
diff --git a/apps/client.pem b/apps/client.pem
index 307910e..e7a47a7 100644
--- a/apps/client.pem
+++ b/apps/client.pem
@@ -1,24 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Client test cert (512 bit)
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Client Cert
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
-----BEGIN CERTIFICATE-----
-MIIB6TCCAVICAQIwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU2WhcNOTgwNjA5
-MTM1NzU2WjBjMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxIzAhBgNVBAMTGkNsaWVudCB0ZXN0IGNl
-cnQgKDUxMiBiaXQpMFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBALtv55QyzG6i2Plw
-Z1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexmq/R4KedLjFEIYjocDui+IXs62NNt
-XrT8odkCAwEAATANBgkqhkiG9w0BAQQFAAOBgQBwtMmI7oGUG8nKmftQssATViH5
-NRRtoEw07DxJp/LfatHdrhqQB73eGdL5WILZJXk46Xz2e9WMSUjVCSYhdKxtflU3
-UR2Ajv1Oo0sTNdfz0wDqJNirLNtzyhhsaq8qMTrLwXrCP31VxBiigFSQSUFnZyTE
-9TKwhS4GlwbtCfxSKQ==
+MIID5zCCAs+gAwIBAgIJALnu1NlVpZ6yMA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZDELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxGTAXBgNVBAMMEFRlc3QgQ2xpZW50IENlcnQw
+ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC0ranbHRLcLVqN+0BzcZpY
++yOLqxzDWT1LD9eW1stC4NzXX9/DCtSIVyN7YIHdGLrIPr64IDdXXaMRzgZ2rOKs
+lmHCAiFpO/ja99gGCJRxH0xwQatqAULfJVHeUhs7OEGOZc2nWifjqKvGfNTilP7D
+nwi69ipQFq9oS19FmhwVHk2wg7KZGHI1qDyG04UrfCZMRitvS9+UVhPpIPjuiBi2
+x3/FZIpL5gXJvvFK6xHY63oq2asyzBATntBgnP4qJFWWcvRx24wF1PnZabxuVoL2
+bPnQ/KvONDrw3IdqkKhYNTul7jEcu3OlcZIMw+7DiaKJLAzKb/bBF5gm/pwW6As9
+AgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJYIZI
+AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW
+BBSZHKyLoTh7Mb409Zn/mK1ceSDAjDAfBgNVHSMEGDAWgBQ2w2yI55X+sL3szj49
+hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEAD0mL7PtPYgCEuDyOQSbLpeND5hVS
+curxQdGnrJ6Acrhodb7E9ccATokeb0PLx6HBLQUicxhTZIQ9FbO43YkQcOU6C3BB
+IlwskqmtN6+VmrQzNolHCDzvxNZs9lYL2VbGPGqVRyjZeHpoAlf9cQr8PgDb4d4b
+vUx2KAhHQvV2nkmYvKyXcgnRuHggumF87mkxidriGAEFwH4qfOqetUg64WyxP7P2
+QLipm04SyQa7ONtIApfVXgHcE42Py4/f4arzCzMjKe3VyhGkS7nsT55X/fWgTaRm
+CQPkO+H94P958WTvQDt77bQ+D3IvYaVvfil8n6HJMOJfFT0LJuSUbpSXJg==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIIBOwIBAAJBALtv55QyzG6i2PlwZ1pah7++Gv8L5j6Hnyr/uTZE1NLG0ABDDexm
-q/R4KedLjFEIYjocDui+IXs62NNtXrT8odkCAwEAAQJAbwXq0vJ/+uyEvsNgxLko
-/V86mGXQ/KrSkeKlL0r4ENxjcyeMAGoKu6J9yMY7+X9+Zm4nxShNfTsf/+Freoe1
-HQIhAPOSm5Q1YI+KIsII2GeVJx1U69+wnd71OasIPakS1L1XAiEAxQAW+J3/JWE0
-ftEYakbhUOKL8tD1OaFZS71/5GdG7E8CIQCefUMmySSvwd6kC0VlATSWbW+d+jp/
-nWmM1KvqnAo5uQIhALqEADu5U1Wvt8UN8UDGBRPQulHWNycuNV45d3nnskWPAiAw
-ueTyr6WsZ5+SD8g/Hy3xuvF3nPmJRH+rwvVihlcFOg==
+MIIEpQIBAAKCAQEAtK2p2x0S3C1ajftAc3GaWPsji6scw1k9Sw/XltbLQuDc11/f
+wwrUiFcje2CB3Ri6yD6+uCA3V12jEc4GdqzirJZhwgIhaTv42vfYBgiUcR9McEGr
+agFC3yVR3lIbOzhBjmXNp1on46irxnzU4pT+w58IuvYqUBavaEtfRZocFR5NsIOy
+mRhyNag8htOFK3wmTEYrb0vflFYT6SD47ogYtsd/xWSKS+YFyb7xSusR2Ot6Ktmr
+MswQE57QYJz+KiRVlnL0cduMBdT52Wm8blaC9mz50PyrzjQ68NyHapCoWDU7pe4x
+HLtzpXGSDMPuw4miiSwMym/2wReYJv6cFugLPQIDAQABAoIBAAZOyc9MhIwLSU4L
+p4RgQvM4UVVe8/Id+3XTZ8NsXExJbWxXfIhiqGjaIfL8u4vsgRjcl+v1s/jo2/iT
+KMab4o4D8gXD7UavQVDjtjb/ta79WL3SjRl2Uc9YjjMkyq6WmDNQeo2NKDdafCTB
+1uzSJtLNipB8Z53ELPuHJhxX9QMHrMnuha49riQgXZ7buP9iQrHJFhImBjSzbxJx
+L+TI6rkyLSf9Wi0Pd3L27Ob3QWNfNRYNSeTE+08eSRChkur5W0RuXAcuAICdQlCl
+LBvWO/LmmvbzCqiDcgy/TliSb6CGGwgiNG7LJZmlkYNj8laGwalNlYZs3UrVv6NO
+Br2loAECgYEA2kvCvPGj0Dg/6g7WhXDvAkEbcaL1tSeCxBbNH+6HS2UWMWvyTtCn
+/bbD519QIdkvayy1QjEf32GV/UjUVmlULMLBcDy0DGjtL3+XpIhLKWDNxN1v1/ai
+1oz23ZJCOgnk6K4qtFtlRS1XtynjA+rBetvYvLP9SKeFrnpzCgaA2r0CgYEA0+KX
+1ACXDTNH5ySX3kMjSS9xdINf+OOw4CvPHFwbtc9aqk2HePlEsBTz5I/W3rKwXva3
+NqZ/bRqVVeZB/hHKFywgdUQk2Uc5z/S7Lw70/w1HubNTXGU06Ngb6zOFAo/o/TwZ
+zTP1BMIKSOB6PAZPS3l+aLO4FRIRotfFhgRHOoECgYEAmiZbqt8cJaJDB/5YYDzC
+mp3tSk6gIb936Q6M5VqkMYp9pIKsxhk0N8aDCnTU+kIK6SzWBpr3/d9Ecmqmfyq7
+5SvWO3KyVf0WWK9KH0abhOm2BKm2HBQvI0DB5u8sUx2/hsvOnjPYDISbZ11t0MtK
+u35Zy89yMYcSsIYJjG/ROCUCgYEAgI2P9G5PNxEP5OtMwOsW84Y3Xat/hPAQFlI+
+HES+AzbFGWJkeT8zL2nm95tVkFP1sggZ7Kxjz3w7cpx7GX0NkbWSE9O+T51pNASV
+tN1sQ3p5M+/a+cnlqgfEGJVvc7iAcXQPa3LEi5h2yPR49QYXAgG6cifn3dDSpmwn
+SUI7PQECgYEApGCIIpSRPLAEHTGmP87RBL1smurhwmy2s/pghkvUkWehtxg0sGHh
+kuaqDWcskogv+QC0sVdytiLSz8G0DwcEcsHK1Fkyb8A+ayiw6jWJDo2m9+IF4Fww
+1Te6jFPYDESnbhq7+TLGgHGhtwcu5cnb4vSuYXGXKupZGzoLOBbv1Zw=
-----END RSA PRIVATE KEY-----
diff --git a/apps/cms.c b/apps/cms.c
index d29a884..5f77f8f 100644
--- a/apps/cms.c
+++ b/apps/cms.c
@@ -136,6 +136,7 @@ int MAIN(int argc, char **argv)
char *engine=NULL;
#endif
unsigned char *secret_key = NULL, *secret_keyid = NULL;
+ unsigned char *pwri_pass = NULL, *pwri_tmp = NULL;
size_t secret_keylen = 0, secret_keyidlen = 0;
ASN1_OBJECT *econtent_type = NULL;
@@ -232,6 +233,8 @@ int MAIN(int argc, char **argv)
else if (!strcmp(*args,"-camellia256"))
cipher = EVP_camellia_256_cbc();
#endif
+ else if (!strcmp (*args, "-debug_decrypt"))
+ flags |= CMS_DEBUG_DECRYPT;
else if (!strcmp (*args, "-text"))
flags |= CMS_TEXT;
else if (!strcmp (*args, "-nointern"))
@@ -326,6 +329,13 @@ int MAIN(int argc, char **argv)
}
secret_keyidlen = (size_t)ltmp;
}
+ else if (!strcmp(*args,"-pwri_password"))
+ {
+ if (!args[1])
+ goto argerr;
+ args++;
+ pwri_pass = (unsigned char *)*args;
+ }
else if (!strcmp(*args,"-econtent_type"))
{
if (!args[1])
@@ -559,7 +569,7 @@ int MAIN(int argc, char **argv)
else if (operation == SMIME_DECRYPT)
{
- if (!recipfile && !keyfile && !secret_key)
+ if (!recipfile && !keyfile && !secret_key && !pwri_pass)
{
BIO_printf(bio_err, "No recipient certificate or key specified\n");
badarg = 1;
@@ -567,7 +577,7 @@ int MAIN(int argc, char **argv)
}
else if (operation == SMIME_ENCRYPT)
{
- if (!*args && !secret_key)
+ if (!*args && !secret_key && !pwri_pass)
{
BIO_printf(bio_err, "No recipient(s) certificate(s) specified\n");
badarg = 1;
@@ -618,7 +628,7 @@ int MAIN(int argc, char **argv)
BIO_printf (bio_err, "-certsout file certificate output file\n");
BIO_printf (bio_err, "-signer file signer certificate file\n");
BIO_printf (bio_err, "-recip file recipient certificate file for decryption\n");
- BIO_printf (bio_err, "-skeyid use subject key identifier\n");
+ BIO_printf (bio_err, "-keyid use subject key identifier\n");
BIO_printf (bio_err, "-in file input file\n");
BIO_printf (bio_err, "-inform arg input format SMIME (default), PEM or DER\n");
BIO_printf (bio_err, "-inkey file input private key (if not signer or recipient)\n");
@@ -917,6 +927,17 @@ int MAIN(int argc, char **argv)
secret_key = NULL;
secret_keyid = NULL;
}
+ if (pwri_pass)
+ {
+ pwri_tmp = (unsigned char *)BUF_strdup((char *)pwri_pass);
+ if (!pwri_tmp)
+ goto end;
+ if (!CMS_add0_recipient_password(cms,
+ -1, NID_undef, NID_undef,
+ pwri_tmp, -1, NULL))
+ goto end;
+ pwri_tmp = NULL;
+ }
if (!(flags & CMS_STREAM))
{
if (!CMS_final(cms, in, NULL, flags))
@@ -1020,6 +1041,8 @@ int MAIN(int argc, char **argv)
ret = 4;
if (operation == SMIME_DECRYPT)
{
+ if (flags & CMS_DEBUG_DECRYPT)
+ CMS_decrypt(cms, NULL, NULL, NULL, NULL, flags);
if (secret_key)
{
@@ -1043,6 +1066,16 @@ int MAIN(int argc, char **argv)
}
}
+ if (pwri_pass)
+ {
+ if (!CMS_decrypt_set1_password(cms, pwri_pass, -1))
+ {
+ BIO_puts(bio_err,
+ "Error decrypting CMS using password\n");
+ goto end;
+ }
+ }
+
if (!CMS_decrypt(cms, NULL, NULL, indata, out, flags))
{
BIO_printf(bio_err, "Error decrypting CMS structure\n");
@@ -1167,6 +1200,8 @@ int MAIN(int argc, char **argv)
OPENSSL_free(secret_key);
if (secret_keyid)
OPENSSL_free(secret_keyid);
+ if (pwri_tmp)
+ OPENSSL_free(pwri_tmp);
if (econtent_type)
ASN1_OBJECT_free(econtent_type);
if (rr)
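
A rough sketch of the password-recipient flow that -pwri_password enables; the cipher choice and helper name here are ours, the rest mirrors the patch. Note the add0 convention: on success the CMS structure owns pass, which is why the app BUF_strdup()s the password first.

#include <openssl/cms.h>
#include <openssl/evp.h>
#include <openssl/objects.h>

static CMS_ContentInfo *encrypt_to_password(BIO *in, unsigned char *pass)
	{
	CMS_ContentInfo *cms = CMS_encrypt(NULL, in, EVP_aes_128_cbc(),
					CMS_BINARY | CMS_PARTIAL);
	if (cms == NULL)
		return NULL;
	/* -1/NID_undef select the default iteration count, key-wrap
	 * and PBE algorithms, exactly as the patch does */
	if (!CMS_add0_recipient_password(cms, -1, NID_undef, NID_undef,
					pass, -1, NULL)
	    || !CMS_final(cms, in, NULL, CMS_BINARY))
		{
		CMS_ContentInfo_free(cms);
		return NULL;
		}
	return cms;
	}

Decryption mirrors it: call CMS_decrypt_set1_password(cms, pass, -1) first, then CMS_decrypt(cms, NULL, NULL, NULL, out, 0) as usual.
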
diff --git a/apps/dgst.c b/apps/dgst.c
index 9bf38ce..81bd870 100644
--- a/apps/dgst.c
+++ b/apps/dgst.c
@@ -127,6 +127,7 @@ int MAIN(int argc, char **argv)
#endif
char *hmac_key=NULL;
char *mac_name=NULL;
+ int non_fips_allow = 0;
STACK_OF(OPENSSL_STRING) *sigopts = NULL, *macopts = NULL;
apps_startup();
@@ -215,6 +216,10 @@ int MAIN(int argc, char **argv)
out_bin = 1;
else if (strcmp(*argv,"-d") == 0)
debug=1;
+ else if (!strcmp(*argv,"-fips-fingerprint"))
+ hmac_key = "etaonrishdlcupfm";
+ else if (strcmp(*argv,"-non-fips-allow") == 0)
+ non_fips_allow=1;
else if (!strcmp(*argv,"-hmac"))
{
if (--argc < 1)
@@ -395,6 +400,13 @@ int MAIN(int argc, char **argv)
goto end;
}
+ if (non_fips_allow)
+ {
+ EVP_MD_CTX *md_ctx;
+ BIO_get_md_ctx(bmd,&md_ctx);
+ EVP_MD_CTX_set_flags(md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ }
+
if (hmac_key)
{
sigkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, e,
diff --git a/apps/dhparam.c b/apps/dhparam.c
index b47097c..1297d6f 100644
--- a/apps/dhparam.c
+++ b/apps/dhparam.c
@@ -332,7 +332,6 @@ int MAIN(int argc, char **argv)
BIO_printf(bio_err,"This is going to take a long time\n");
if(!dh || !DH_generate_parameters_ex(dh, num, g, &cb))
{
- if(dh) DH_free(dh);
ERR_print_errors(bio_err);
goto end;
}
diff --git a/apps/dsaparam.c b/apps/dsaparam.c
index fe72c1d..683d513 100644
--- a/apps/dsaparam.c
+++ b/apps/dsaparam.c
@@ -326,6 +326,7 @@ int MAIN(int argc, char **argv)
goto end;
}
#endif
+ ERR_print_errors(bio_err);
BIO_printf(bio_err,"Error, DSA key generation failed\n");
goto end;
}
@@ -429,13 +430,19 @@ int MAIN(int argc, char **argv)
assert(need_rand);
if ((dsakey=DSAparams_dup(dsa)) == NULL) goto end;
- if (!DSA_generate_key(dsakey)) goto end;
+ if (!DSA_generate_key(dsakey))
+ {
+ ERR_print_errors(bio_err);
+ DSA_free(dsakey);
+ goto end;
+ }
if (outformat == FORMAT_ASN1)
i=i2d_DSAPrivateKey_bio(out,dsakey);
else if (outformat == FORMAT_PEM)
i=PEM_write_bio_DSAPrivateKey(out,dsakey,NULL,NULL,0,NULL,NULL);
else {
BIO_printf(bio_err,"bad output format specified for outfile\n");
+ DSA_free(dsakey);
goto end;
}
DSA_free(dsakey);
diff --git a/apps/enc.c b/apps/enc.c
index 076225c..719acc3 100644
--- a/apps/enc.c
+++ b/apps/enc.c
@@ -129,6 +129,7 @@ int MAIN(int argc, char **argv)
char *engine = NULL;
#endif
const EVP_MD *dgst=NULL;
+ int non_fips_allow = 0;
apps_startup();
@@ -281,6 +282,8 @@ int MAIN(int argc, char **argv)
if (--argc < 1) goto bad;
md= *(++argv);
}
+ else if (strcmp(*argv,"-non-fips-allow") == 0)
+ non_fips_allow = 1;
else if ((argv[0][0] == '-') &&
((c=EVP_get_cipherbyname(&(argv[0][1]))) != NULL))
{
@@ -589,6 +592,11 @@ int MAIN(int argc, char **argv)
*/
BIO_get_cipher_ctx(benc, &ctx);
+
+ if (non_fips_allow)
+ EVP_CIPHER_CTX_set_flags(ctx,
+ EVP_CIPH_FLAG_NON_FIPS_ALLOW);
+
if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, enc))
{
BIO_printf(bio_err, "Error setting cipher %s\n",
diff --git a/apps/genrsa.c b/apps/genrsa.c
index 37e9310..ece114c 100644
--- a/apps/genrsa.c
+++ b/apps/genrsa.c
@@ -78,7 +78,7 @@
#include <openssl/x509.h>
#include <openssl/pem.h>
-#define DEFBITS 512
+#define DEFBITS 1024
#undef PROG
#define PROG genrsa_main
diff --git a/apps/ocsp.c b/apps/ocsp.c
index 01847df..83c5a76 100644
--- a/apps/ocsp.c
+++ b/apps/ocsp.c
@@ -617,7 +617,7 @@ int MAIN(int argc, char **argv)
BIO_printf (bio_err, "-ndays n number of days before next update\n");
BIO_printf (bio_err, "-resp_key_id identify reponse by signing certificate key ID\n");
BIO_printf (bio_err, "-nrequest n number of requests to accept (default unlimited)\n");
- BIO_printf (bio_err, "- use specified digest in the request");
+ BIO_printf (bio_err, "- use specified digest in the request\n");
goto end;
}
diff --git a/apps/openssl.c b/apps/openssl.c
index 1068957..33b1655 100644
--- a/apps/openssl.c
+++ b/apps/openssl.c
@@ -129,6 +129,9 @@
#include "progs.h"
#include "s_apps.h"
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
/* The LHASH callbacks ("hash" & "cmp") have been replaced by functions with the
* base prototypes (we cast each variable inside the function to the required
@@ -310,6 +313,19 @@ int main(int Argc, char *ARGV[])
CRYPTO_set_locking_callback(lock_dbg_cb);
}
+ if(getenv("OPENSSL_FIPS")) {
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode_set(1)) {
+ ERR_load_crypto_strings();
+ ERR_print_errors(BIO_new_fp(stderr,BIO_NOCLOSE));
+ EXIT(1);
+ }
+#else
+ fprintf(stderr, "FIPS mode not supported.\n");
+ EXIT(1);
+#endif
+ }
+
apps_startup();
/* Lets load up our environment a little */
diff --git a/apps/openssl.cnf b/apps/openssl.cnf
index 9d2cd5b..18760c6 100644
--- a/apps/openssl.cnf
+++ b/apps/openssl.cnf
@@ -145,7 +145,7 @@ localityName = Locality Name (eg, city)
organizationalUnitName = Organizational Unit Name (eg, section)
#organizationalUnitName_default =
-commonName = Common Name (eg, YOUR name)
+commonName = Common Name (e.g. server FQDN or YOUR name)
commonName_max = 64
emailAddress = Email Address
diff --git a/apps/progs.h b/apps/progs.h
index 728bb6d..dd2298b 100644
--- a/apps/progs.h
+++ b/apps/progs.h
@@ -46,6 +46,7 @@ extern int engine_main(int argc,char *argv[]);
extern int ocsp_main(int argc,char *argv[]);
extern int prime_main(int argc,char *argv[]);
extern int ts_main(int argc,char *argv[]);
+extern int srp_main(int argc,char *argv[]);
#define FUNC_TYPE_GENERAL 1
#define FUNC_TYPE_MD 2
@@ -149,6 +150,9 @@ FUNCTION functions[] = {
#if 0 /* ANDROID */
{FUNC_TYPE_GENERAL,"ts",ts_main},
#endif
+#ifndef OPENSSL_NO_SRP
+ {FUNC_TYPE_GENERAL,"srp",srp_main},
+#endif
#ifndef OPENSSL_NO_MD2
{FUNC_TYPE_MD,"md2",dgst_main},
#endif
diff --git a/apps/progs.pl b/apps/progs.pl
index de6fdea..39ca8f7 100644
--- a/apps/progs.pl
+++ b/apps/progs.pl
@@ -51,6 +51,8 @@
{ print "#ifndef OPENSSL_NO_CMS\n${str}#endif\n"; }
elsif ( ($_ =~ /^ocsp$/))
{ print "#ifndef OPENSSL_NO_OCSP\n${str}#endif\n"; }
+ elsif ( ($_ =~ /^srp$/))
+ { print "#ifndef OPENSSL_NO_SRP\n${str}#endif\n"; }
else
{ print $str; }
}
diff --git a/apps/req.c b/apps/req.c
index 820cd18..8552658 100644
--- a/apps/req.c
+++ b/apps/req.c
@@ -165,7 +165,7 @@ int MAIN(int argc, char **argv)
EVP_PKEY_CTX *genctx = NULL;
const char *keyalg = NULL;
char *keyalgstr = NULL;
- STACK_OF(OPENSSL_STRING) *pkeyopts = NULL;
+ STACK_OF(OPENSSL_STRING) *pkeyopts = NULL, *sigopts = NULL;
EVP_PKEY *pkey=NULL;
int i=0,badops=0,newreq=0,verbose=0,pkey_type=-1;
long newkey = -1;
@@ -310,6 +310,15 @@ int MAIN(int argc, char **argv)
if (!pkeyopts || !sk_OPENSSL_STRING_push(pkeyopts, *(++argv)))
goto bad;
}
+ else if (strcmp(*argv,"-sigopt") == 0)
+ {
+ if (--argc < 1)
+ goto bad;
+ if (!sigopts)
+ sigopts = sk_OPENSSL_STRING_new_null();
+ if (!sigopts || !sk_OPENSSL_STRING_push(sigopts, *(++argv)))
+ goto bad;
+ }
else if (strcmp(*argv,"-batch") == 0)
batch=1;
else if (strcmp(*argv,"-newhdr") == 0)
@@ -858,8 +867,9 @@ int MAIN(int argc, char **argv)
extensions);
goto end;
}
-
- if (!(i=X509_sign(x509ss,pkey,digest)))
+
+ i=do_X509_sign(bio_err, x509ss, pkey, digest, sigopts);
+ if (!i)
{
ERR_print_errors(bio_err);
goto end;
@@ -883,7 +893,8 @@ int MAIN(int argc, char **argv)
req_exts);
goto end;
}
- if (!(i=X509_REQ_sign(req,pkey,digest)))
+ i=do_X509_REQ_sign(bio_err, req, pkey, digest, sigopts);
+ if (!i)
{
ERR_print_errors(bio_err);
goto end;
@@ -1084,6 +1095,8 @@ int MAIN(int argc, char **argv)
EVP_PKEY_CTX_free(genctx);
if (pkeyopts)
sk_OPENSSL_STRING_free(pkeyopts);
+ if (sigopts)
+ sk_OPENSSL_STRING_free(sigopts);
#ifndef OPENSSL_NO_ENGINE
if (gen_eng)
ENGINE_free(gen_eng);
@@ -1756,3 +1769,68 @@ static int genpkey_cb(EVP_PKEY_CTX *ctx)
#endif
return 1;
}
+
+static int do_sign_init(BIO *err, EVP_MD_CTX *ctx, EVP_PKEY *pkey,
+ const EVP_MD *md, STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ EVP_PKEY_CTX *pkctx = NULL;
+ int i;
+ EVP_MD_CTX_init(ctx);
+ if (!EVP_DigestSignInit(ctx, &pkctx, md, NULL, pkey))
+ return 0;
+ for (i = 0; i < sk_OPENSSL_STRING_num(sigopts); i++)
+ {
+ char *sigopt = sk_OPENSSL_STRING_value(sigopts, i);
+ if (pkey_ctrl_string(pkctx, sigopt) <= 0)
+ {
+ BIO_printf(err, "parameter error \"%s\"\n", sigopt);
+ ERR_print_errors(bio_err);
+ return 0;
+ }
+ }
+ return 1;
+ }
+
+int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ int rv;
+ EVP_MD_CTX mctx;
+ EVP_MD_CTX_init(&mctx);
+ rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+ if (rv > 0)
+ rv = X509_sign_ctx(x, &mctx);
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv > 0 ? 1 : 0;
+ }
+
+
+int do_X509_REQ_sign(BIO *err, X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ int rv;
+ EVP_MD_CTX mctx;
+ EVP_MD_CTX_init(&mctx);
+ rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+ if (rv > 0)
+ rv = X509_REQ_sign_ctx(x, &mctx);
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv > 0 ? 1 : 0;
+ }
+
+
+
+int do_X509_CRL_sign(BIO *err, X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md,
+ STACK_OF(OPENSSL_STRING) *sigopts)
+ {
+ int rv;
+ EVP_MD_CTX mctx;
+ EVP_MD_CTX_init(&mctx);
+ rv = do_sign_init(err, &mctx, pkey, md, sigopts);
+ if (rv > 0)
+ rv = X509_CRL_sign_ctx(x, &mctx);
+ EVP_MD_CTX_cleanup(&mctx);
+ return rv > 0 ? 1 : 0;
+ }
+
+
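
Usage sketch (option values illustrative): with do_X509_sign() the -sigopt strings reach EVP_PKEY_CTX_ctrl_str() via pkey_ctrl_string(), so an RSA-PSS self-signature can be requested like this:

STACK_OF(OPENSSL_STRING) *sigopts = sk_OPENSSL_STRING_new_null();
sk_OPENSSL_STRING_push(sigopts, "rsa_padding_mode:pss");
sk_OPENSSL_STRING_push(sigopts, "rsa_pss_saltlen:-1"); /* -1: salt = digest length */
if (!do_X509_sign(bio_err, x509ss, pkey, EVP_sha256(), sigopts))
	ERR_print_errors(bio_err);
sk_OPENSSL_STRING_free(sigopts); /* frees the stack only, not the literals */

On the command line the same request reads: openssl req -x509 -sigopt rsa_padding_mode:pss -sigopt rsa_pss_saltlen:-1 ...
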
diff --git a/apps/s_cb.c b/apps/s_cb.c
index c4f5512..84c3b44 100644
--- a/apps/s_cb.c
+++ b/apps/s_cb.c
@@ -237,8 +237,8 @@ int set_cert_stuff(SSL_CTX *ctx, char *cert_file, char *key_file)
/* If we are using DSA, we can copy the parameters from
* the private key */
-
-
+
+
/* Now we know that a key and cert have been set against
* the SSL context */
if (!SSL_CTX_check_private_key(ctx))
@@ -357,6 +357,12 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
case TLS1_VERSION:
str_version = "TLS 1.0 ";
break;
+ case TLS1_1_VERSION:
+ str_version = "TLS 1.1 ";
+ break;
+ case TLS1_2_VERSION:
+ str_version = "TLS 1.2 ";
+ break;
case DTLS1_VERSION:
str_version = "DTLS 1.0 ";
break;
@@ -430,6 +436,8 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
if (version == SSL3_VERSION ||
version == TLS1_VERSION ||
+ version == TLS1_1_VERSION ||
+ version == TLS1_2_VERSION ||
version == DTLS1_VERSION ||
version == DTLS1_BAD_VER)
{
@@ -549,6 +557,9 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
case 114:
str_details2 = " bad_certificate_hash_value";
break;
+ case 115:
+ str_details2 = " unknown_psk_identity";
+ break;
}
}
}
@@ -597,6 +608,26 @@ void MS_CALLBACK msg_cb(int write_p, int version, int content_type, const void *
}
}
}
+
+#ifndef OPENSSL_NO_HEARTBEATS
+ if (content_type == 24) /* Heartbeat */
+ {
+ str_details1 = ", Heartbeat";
+
+ if (len > 0)
+ {
+ switch (((const unsigned char*)buf)[0])
+ {
+ case 1:
+ str_details1 = ", HeartbeatRequest";
+ break;
+ case 2:
+ str_details1 = ", HeartbeatResponse";
+ break;
+ }
+ }
+ }
+#endif
}
BIO_printf(bio, "%s %s%s [length %04lx]%s%s\n", str_write_p, str_version, str_content_type, (unsigned long)len, str_details1, str_details2);
@@ -657,6 +688,22 @@ void MS_CALLBACK tlsext_cb(SSL *s, int client_server, int type,
extname = "status request";
break;
+ case TLSEXT_TYPE_user_mapping:
+ extname = "user mapping";
+ break;
+
+ case TLSEXT_TYPE_client_authz:
+ extname = "client authz";
+ break;
+
+ case TLSEXT_TYPE_server_authz:
+ extname = "server authz";
+ break;
+
+ case TLSEXT_TYPE_cert_type:
+ extname = "cert type";
+ break;
+
case TLSEXT_TYPE_elliptic_curves:
extname = "elliptic curves";
break;
@@ -665,12 +712,28 @@ void MS_CALLBACK tlsext_cb(SSL *s, int client_server, int type,
extname = "EC point formats";
break;
+ case TLSEXT_TYPE_srp:
+ extname = "SRP";
+ break;
+
+ case TLSEXT_TYPE_signature_algorithms:
+ extname = "signature algorithms";
+ break;
+
+ case TLSEXT_TYPE_use_srtp:
+ extname = "use SRTP";
+ break;
+
+ case TLSEXT_TYPE_heartbeat:
+ extname = "heartbeat";
+ break;
+
case TLSEXT_TYPE_session_ticket:
- extname = "server ticket";
+ extname = "session ticket";
break;
- case TLSEXT_TYPE_renegotiate:
- extname = "renegotiate";
+ case TLSEXT_TYPE_renegotiate:
+ extname = "renegotiation info";
break;
#ifdef TLSEXT_TYPE_opaque_prf_input
@@ -678,6 +741,11 @@ void MS_CALLBACK tlsext_cb(SSL *s, int client_server, int type,
extname = "opaque PRF input";
break;
#endif
+#ifdef TLSEXT_TYPE_next_proto_neg
+ case TLSEXT_TYPE_next_proto_neg:
+ extname = "next protocol";
+ break;
+#endif
default:
extname = "unknown";
diff --git a/apps/s_client.c b/apps/s_client.c
index b951513..cb1efcd 100644
--- a/apps/s_client.c
+++ b/apps/s_client.c
@@ -163,6 +163,9 @@ typedef unsigned int u_int;
#include <openssl/rand.h>
#include <openssl/ocsp.h>
#include <openssl/bn.h>
+#ifndef OPENSSL_NO_SRP
+#include <openssl/srp.h>
+#endif
#include "s_apps.h"
#include "timeouts.h"
@@ -203,6 +206,9 @@ static int c_status_req=0;
static int c_msg=0;
static int c_showcerts=0;
+static char *keymatexportlabel=NULL;
+static int keymatexportlen=20;
+
static void sc_usage(void);
static void print_stuff(BIO *berr,SSL *con,int full);
#ifndef OPENSSL_NO_TLSEXT
@@ -315,13 +321,22 @@ static void sc_usage(void)
# ifndef OPENSSL_NO_JPAKE
BIO_printf(bio_err," -jpake arg - JPAKE secret to use\n");
# endif
+#endif
+#ifndef OPENSSL_NO_SRP
+ BIO_printf(bio_err," -srpuser user - SRP authentification for 'user'\n");
+ BIO_printf(bio_err," -srppass arg - password for 'user'\n");
+ BIO_printf(bio_err," -srp_lateuser - SRP username into second ClientHello message\n");
+ BIO_printf(bio_err," -srp_moregroups - Tolerate other than the known g N values.\n");
+ BIO_printf(bio_err," -srp_strength int - minimal mength in bits for N (default %d).\n",SRP_MINIMAL_N);
#endif
BIO_printf(bio_err," -ssl2 - just use SSLv2\n");
BIO_printf(bio_err," -ssl3 - just use SSLv3\n");
+ BIO_printf(bio_err," -tls1_2 - just use TLSv1.2\n");
+ BIO_printf(bio_err," -tls1_1 - just use TLSv1.1\n");
BIO_printf(bio_err," -tls1 - just use TLSv1\n");
BIO_printf(bio_err," -dtls1 - just use DTLSv1\n");
BIO_printf(bio_err," -mtu - set the link layer MTU\n");
- BIO_printf(bio_err," -no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n");
+ BIO_printf(bio_err," -no_tls1_2/-no_tls1_1/-no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n");
BIO_printf(bio_err," -bugs - Switch on all SSL implementation bug workarounds\n");
BIO_printf(bio_err," -serverpref - Use server's cipher preferences (only SSLv2)\n");
BIO_printf(bio_err," -cipher - preferred cipher to use, use the 'openssl ciphers'\n");
@@ -344,10 +359,16 @@ static void sc_usage(void)
BIO_printf(bio_err," -no_ticket - disable use of RFC4507bis session tickets\n");
# ifndef OPENSSL_NO_NEXTPROTONEG
BIO_printf(bio_err," -nextprotoneg arg - enable NPN extension, considering named protocols supported (comma-separated list)\n");
+ BIO_printf(bio_err," -alpn arg - enable ALPN extension, considering named protocols supported (comma-separated list)\n");
# endif
- BIO_printf(bio_err," -cutthrough - enable 1-RTT full-handshake for strong ciphers\n");
#endif
+ BIO_printf(bio_err," -cutthrough - enable 1-RTT full-handshake for strong ciphers\n");
BIO_printf(bio_err," -legacy_renegotiation - enable use of legacy renegotiation (dangerous)\n");
+#ifndef OPENSSL_NO_SRTP
+ BIO_printf(bio_err," -use_srtp profiles - Offer SRTP key management with a colon-separated profile list\n");
+#endif
+ BIO_printf(bio_err," -keymatexport label - Export keying material using label\n");
+ BIO_printf(bio_err," -keymatexportlen len - Export len bytes of keying material (default 20)\n");
}
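
Sketch of what the -keymatexport plumbing ultimately calls: SSL_export_keying_material(), the RFC 5705 exporter over the session secrets. The label value below is our example; RFC 5705 reserves the "EXPERIMENTAL" prefix for private use.

unsigned char keymat[20];
const char *label = "EXPERIMENTAL-my-label"; /* example label */
if (SSL_export_keying_material(con, keymat, sizeof(keymat),
			label, strlen(label), NULL, 0, 0) == 1)
	{
	/* keymat holds sizeof(keymat) bytes bound to this TLS session */
	}
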
#ifndef OPENSSL_NO_TLSEXT
@@ -371,6 +392,124 @@ static int MS_CALLBACK ssl_servername_cb(SSL *s, int *ad, void *arg)
return SSL_TLSEXT_ERR_OK;
}
+#ifndef OPENSSL_NO_SRP
+
+/* This is a context that we pass to all callbacks */
+typedef struct srp_arg_st
+ {
+ char *srppassin;
+ char *srplogin;
+ int msg; /* copy from c_msg */
+ int debug; /* copy from c_debug */
+ int amp; /* allow more groups */
+ int strength /* minimal size for N */ ;
+ } SRP_ARG;
+
+#define SRP_NUMBER_ITERATIONS_FOR_PRIME 64
+
+static int srp_Verify_N_and_g(BIGNUM *N, BIGNUM *g)
+ {
+ BN_CTX *bn_ctx = BN_CTX_new();
+ BIGNUM *p = BN_new();
+ BIGNUM *r = BN_new();
+ int ret =
+ g != NULL && N != NULL && bn_ctx != NULL && BN_is_odd(N) &&
+ BN_is_prime_ex(N, SRP_NUMBER_ITERATIONS_FOR_PRIME, bn_ctx, NULL) &&
+ p != NULL && BN_rshift1(p, N) &&
+
+ /* p = (N-1)/2 */
+ BN_is_prime_ex(p, SRP_NUMBER_ITERATIONS_FOR_PRIME, bn_ctx, NULL) &&
+ r != NULL &&
+
+ /* verify g^((N-1)/2) == -1 (mod N) */
+ BN_mod_exp(r, g, p, N, bn_ctx) &&
+ BN_add_word(r, 1) &&
+ BN_cmp(r, N) == 0;
+
+ if(r)
+ BN_free(r);
+ if(p)
+ BN_free(p);
+ if(bn_ctx)
+ BN_CTX_free(bn_ctx);
+ return ret;
+ }
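+
+/* In math terms, the three conditions ANDed above are:
+ *     N prime,   p = (N-1)/2 prime,   g^p == -1 (mod N).
+ * N is therefore a safe prime, and because g^p is -1 rather than +1,
+ * the order of g divides 2p but not p: g generates the whole
+ * multiplicative group mod N (barring the degenerate g = N-1).
+ */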
+
+/* This callback is used here for two purposes:
+ - extended debugging
+ - running some primality tests on unknown groups
+ The callback is only called for a non-default group.
+
+ An application does not need the callback at all if
+ only the standard groups are used. In real-life situations,
+ client and server already share well-known groups,
+ so there is no need to verify them.
+ Furthermore, should a server actually propose a group that
+ is not one of those defined in RFC 5054, it is more appropriate
+ to add the group to a static list and then compare, since
+ primality tests are rather CPU-consuming.
+*/
+
+static int MS_CALLBACK ssl_srp_verify_param_cb(SSL *s, void *arg)
+ {
+ SRP_ARG *srp_arg = (SRP_ARG *)arg;
+ BIGNUM *N = NULL, *g = NULL;
+ if (!(N = SSL_get_srp_N(s)) || !(g = SSL_get_srp_g(s)))
+ return 0;
+ if (srp_arg->debug || srp_arg->msg || srp_arg->amp == 1)
+ {
+ BIO_printf(bio_err, "SRP parameters:\n");
+ BIO_printf(bio_err,"\tN="); BN_print(bio_err,N);
+ BIO_printf(bio_err,"\n\tg="); BN_print(bio_err,g);
+ BIO_printf(bio_err,"\n");
+ }
+
+ if (SRP_check_known_gN_param(g,N))
+ return 1;
+
+ if (srp_arg->amp == 1)
+ {
+ if (srp_arg->debug)
+ BIO_printf(bio_err, "SRP param N and g are not known params, going to check deeper.\n");
+
+/* The srp_moregroups option is a real debugging feature.
+ Implementors should rather add the value to the known ones.
+ The minimal size has already been tested.
+*/
+ if (BN_num_bits(g) <= BN_BITS && srp_Verify_N_and_g(N,g))
+ return 1;
+ }
+ BIO_printf(bio_err, "SRP param N and g rejected.\n");
+ return 0;
+ }
+
+#define PWD_STRLEN 1024
+
+static char * MS_CALLBACK ssl_give_srp_client_pwd_cb(SSL *s, void *arg)
+ {
+ SRP_ARG *srp_arg = (SRP_ARG *)arg;
+ char *pass = (char *)OPENSSL_malloc(PWD_STRLEN+1);
+ PW_CB_DATA cb_tmp;
+ int l;
+
+ cb_tmp.password = (char *)srp_arg->srppassin;
+ cb_tmp.prompt_info = "SRP user";
+ if ((l = password_callback(pass, PWD_STRLEN, 0, &cb_tmp))<0)
+ {
+ BIO_printf (bio_err, "Can't read Password\n");
+ OPENSSL_free(pass);
+ return NULL;
+ }
+ *(pass+l)= '\0';
+
+ return pass;
+ }
+
+#endif
+#ifndef OPENSSL_NO_SRTP
+ char *srtp_profiles = NULL;
+#endif
+
# ifndef OPENSSL_NO_NEXTPROTONEG
/* This the context that we pass to next_proto_cb */
typedef struct tlsextnextprotoctx_st {
@@ -422,6 +561,9 @@ int MAIN(int argc, char **argv)
{
unsigned int off=0, clr=0;
SSL *con=NULL;
+#ifndef OPENSSL_NO_KRB5
+ KSSL_CTX *kctx;
+#endif
int s,k,width,state=0;
char *cbuf=NULL,*sbuf=NULL,*mbuf=NULL;
int cbuf_len,cbuf_off;
@@ -470,6 +612,7 @@ int MAIN(int argc, char **argv)
{NULL,0};
# ifndef OPENSSL_NO_NEXTPROTONEG
const char *next_proto_neg_in = NULL;
+ const char *alpn_in = NULL;
# endif
#endif
char *sess_in = NULL;
@@ -481,14 +624,13 @@ int MAIN(int argc, char **argv)
#ifndef OPENSSL_NO_JPAKE
char *jpake_secret = NULL;
#endif
+#ifndef OPENSSL_NO_SRP
+ char * srppass = NULL;
+ int srp_lateuser = 0;
+ SRP_ARG srp_arg = {NULL,NULL,0,0,0,1024};
+#endif
-#if !defined(OPENSSL_NO_SSL2) && !defined(OPENSSL_NO_SSL3)
meth=SSLv23_client_method();
-#elif !defined(OPENSSL_NO_SSL3)
- meth=SSLv3_client_method();
-#elif !defined(OPENSSL_NO_SSL2)
- meth=SSLv2_client_method();
-#endif
apps_startup();
c_Pause=0;
@@ -623,13 +765,44 @@ int MAIN(int argc, char **argv)
psk_key=*(++argv);
for (j = 0; j < strlen(psk_key); j++)
{
- if (isxdigit((int)psk_key[j]))
+ if (isxdigit((unsigned char)psk_key[j]))
continue;
BIO_printf(bio_err,"Not a hex number '%s'\n",*argv);
goto bad;
}
}
#endif
+#ifndef OPENSSL_NO_SRP
+ else if (strcmp(*argv,"-srpuser") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srp_arg.srplogin= *(++argv);
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srppass") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srppass= *(++argv);
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srp_strength") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srp_arg.strength=atoi(*(++argv));
+ BIO_printf(bio_err,"SRP minimal length for N is %d\n",srp_arg.strength);
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srp_lateuser") == 0)
+ {
+ srp_lateuser= 1;
+ meth=TLSv1_client_method();
+ }
+ else if (strcmp(*argv,"-srp_moregroups") == 0)
+ {
+ srp_arg.amp=1;
+ meth=TLSv1_client_method();
+ }
+#endif
#ifndef OPENSSL_NO_SSL2
else if (strcmp(*argv,"-ssl2") == 0)
meth=SSLv2_client_method();
@@ -639,6 +812,10 @@ int MAIN(int argc, char **argv)
meth=SSLv3_client_method();
#endif
#ifndef OPENSSL_NO_TLS1
+ else if (strcmp(*argv,"-tls1_2") == 0)
+ meth=TLSv1_2_client_method();
+ else if (strcmp(*argv,"-tls1_1") == 0)
+ meth=TLSv1_1_client_method();
else if (strcmp(*argv,"-tls1") == 0)
meth=TLSv1_client_method();
#endif
@@ -687,6 +864,10 @@ int MAIN(int argc, char **argv)
if (--argc < 1) goto bad;
CAfile= *(++argv);
}
+ else if (strcmp(*argv,"-no_tls1_2") == 0)
+ off|=SSL_OP_NO_TLSv1_2;
+ else if (strcmp(*argv,"-no_tls1_1") == 0)
+ off|=SSL_OP_NO_TLSv1_1;
else if (strcmp(*argv,"-no_tls1") == 0)
off|=SSL_OP_NO_TLSv1;
else if (strcmp(*argv,"-no_ssl3") == 0)
@@ -704,6 +885,11 @@ int MAIN(int argc, char **argv)
if (--argc < 1) goto bad;
next_proto_neg_in = *(++argv);
}
+ else if (strcmp(*argv,"-alpn") == 0)
+ {
+ if (--argc < 1) goto bad;
+ alpn_in = *(++argv);
+ }
# endif
#endif
else if (strcmp(*argv,"-cutthrough") == 0)
@@ -774,7 +960,25 @@ int MAIN(int argc, char **argv)
jpake_secret = *++argv;
}
#endif
- else
+#ifndef OPENSSL_NO_SRTP
+ else if (strcmp(*argv,"-use_srtp") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srtp_profiles = *(++argv);
+ }
+#endif
+ else if (strcmp(*argv,"-keymatexport") == 0)
+ {
+ if (--argc < 1) goto bad;
+ keymatexportlabel= *(++argv);
+ }
+ else if (strcmp(*argv,"-keymatexportlen") == 0)
+ {
+ if (--argc < 1) goto bad;
+ keymatexportlen=atoi(*(++argv));
+ if (keymatexportlen == 0) goto bad;
+ }
+ else
{
BIO_printf(bio_err,"unknown option %s\n",*argv);
badop=1;
@@ -800,14 +1004,13 @@ int MAIN(int argc, char **argv)
goto end;
}
psk_identity = "JPAKE";
+ if (cipher)
+ {
+ BIO_printf(bio_err, "JPAKE sets cipher to PSK\n");
+ goto end;
+ }
+ cipher = "PSK";
}
-
- if (cipher)
- {
- BIO_printf(bio_err, "JPAKE sets cipher to PSK\n");
- goto end;
- }
- cipher = "PSK";
#endif
OpenSSL_add_ssl_algorithms();
@@ -901,6 +1104,14 @@ int MAIN(int argc, char **argv)
}
}
+#ifndef OPENSSL_NO_SRP
+ if(!app_passwd(bio_err, srppass, NULL, &srp_arg.srppassin, NULL))
+ {
+ BIO_printf(bio_err, "Error getting password\n");
+ goto end;
+ }
+#endif
+
ctx=SSL_CTX_new(meth);
if (ctx == NULL)
{
@@ -936,6 +1147,10 @@ int MAIN(int argc, char **argv)
BIO_printf(bio_c_out, "PSK key given or JPAKE in use, setting client callback\n");
SSL_CTX_set_psk_client_callback(ctx, psk_client_cb);
}
+#endif
+#ifndef OPENSSL_NO_SRTP
+ if (srtp_profiles != NULL)
+ SSL_CTX_set_tlsext_use_srtp(ctx, srtp_profiles);
#endif
if (bugs)
SSL_CTX_set_options(ctx,SSL_OP_ALL|off);
@@ -949,6 +1164,25 @@ int MAIN(int argc, char **argv)
*/
if (socket_type == SOCK_DGRAM) SSL_CTX_set_read_ahead(ctx, 1);
+#if !defined(OPENSSL_NO_TLSEXT)
+# if !defined(OPENSSL_NO_NEXTPROTONEG)
+ if (next_proto.data)
+ SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto);
+# endif
+ if (alpn_in)
+ {
+ unsigned short alpn_len;
+ unsigned char *alpn = next_protos_parse(&alpn_len, alpn_in);
+
+ if (alpn == NULL)
+ {
+ BIO_printf(bio_err, "Error parsing -alpn argument\n");
+ goto end;
+ }
+ SSL_CTX_set_alpn_protos(ctx, alpn, alpn_len);
+ }
+#endif
+
/* Enable handshake cutthrough for client connections using
* strong ciphers. */
if (cutthrough)
@@ -958,11 +1192,6 @@ int MAIN(int argc, char **argv)
SSL_CTX_set_mode(ctx, ssl_mode);
}
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
- if (next_proto.data)
- SSL_CTX_set_next_proto_select_cb(ctx, next_proto_cb, &next_proto);
-#endif
-
if (state) SSL_CTX_set_info_callback(ctx,apps_ssl_info_callback);
if (cipher != NULL)
if(!SSL_CTX_set_cipher_list(ctx,cipher)) {
@@ -994,6 +1223,24 @@ int MAIN(int argc, char **argv)
SSL_CTX_set_tlsext_servername_callback(ctx, ssl_servername_cb);
SSL_CTX_set_tlsext_servername_arg(ctx, &tlsextcbp);
}
+#ifndef OPENSSL_NO_SRP
+ if (srp_arg.srplogin)
+ {
+ if (!srp_lateuser && !SSL_CTX_set_srp_username(ctx, srp_arg.srplogin))
+ {
+ BIO_printf(bio_err,"Unable to set SRP username\n");
+ goto end;
+ }
+ srp_arg.msg = c_msg;
+ srp_arg.debug = c_debug ;
+ SSL_CTX_set_srp_cb_arg(ctx,&srp_arg);
+ SSL_CTX_set_srp_client_pwd_callback(ctx, ssl_give_srp_client_pwd_cb);
+ SSL_CTX_set_srp_strength(ctx, srp_arg.strength);
+ if (c_msg || c_debug || srp_arg.amp == 0)
+ SSL_CTX_set_srp_verify_param_callback(ctx, ssl_srp_verify_param_cb);
+ }
+
+#endif
#endif
con=SSL_new(ctx);
@@ -1032,9 +1279,10 @@ int MAIN(int argc, char **argv)
}
#endif
#ifndef OPENSSL_NO_KRB5
- if (con && (con->kssl_ctx = kssl_ctx_new()) != NULL)
+ if (con && (kctx = kssl_ctx_new()) != NULL)
{
- kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVER, host);
+ SSL_set0_kssl_ctx(con, kctx);
+ kssl_ctx_setstring(kctx, KSSL_SERVER, host);
}
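+ /* SSL_set0_kssl_ctx() follows OpenSSL's set0 naming convention,
+ * transferring ownership of kctx to the SSL object; the accessor
+ * replaces direct pokes at the con->kssl_ctx field. */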
#endif /* OPENSSL_NO_KRB5 */
/* SSL_set_cipher_list(con,"RC4-MD5"); */
@@ -1066,7 +1314,7 @@ int MAIN(int argc, char **argv)
}
}
#endif
- if (c_Pause & 0x01) con->debug=1;
+ if (c_Pause & 0x01) SSL_set_debug(con, 1);
if ( SSL_version(con) == DTLS1_VERSION)
{
@@ -1115,7 +1363,7 @@ int MAIN(int argc, char **argv)
if (c_debug)
{
- con->debug=1;
+ SSL_set_debug(con, 1);
BIO_set_callback(sbio,bio_dump_callback);
BIO_set_callback_arg(sbio,(char *)bio_c_out);
}
@@ -1649,6 +1897,14 @@ printf("read=%d pending=%d peek=%d\n",k,SSL_pending(con),SSL_peek(con,zbuf,10240
SSL_renegotiate(con);
cbuf_len=0;
}
+#ifndef OPENSSL_NO_HEARTBEATS
+ else if ((!c_ign_eof) && (cbuf[0] == 'B'))
+ {
+ BIO_printf(bio_err,"HEARTBEATING\n");
+ SSL_heartbeat(con);
+ cbuf_len=0;
+ }
+#endif
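+ /* Typing 'B' on stdin sends one RFC 6520 heartbeat request via
+ * SSL_heartbeat(); the peer should echo the payload back, making
+ * this a simple interactive test of the extension. */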
else
{
cbuf_len=i;
@@ -1676,6 +1932,10 @@ printf("read=%d pending=%d peek=%d\n",k,SSL_pending(con),SSL_peek(con,zbuf,10240
print_stuff(bio_c_out,con,1);
SSL_free(con);
}
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+ if (next_proto.data)
+ OPENSSL_free(next_proto.data);
+#endif
if (ctx != NULL) SSL_CTX_free(ctx);
if (cert)
X509_free(cert);
@@ -1683,6 +1943,8 @@ printf("read=%d pending=%d peek=%d\n",k,SSL_pending(con),SSL_peek(con,zbuf,10240
EVP_PKEY_free(key);
if (pass)
OPENSSL_free(pass);
+ if (vpm)
+ X509_VERIFY_PARAM_free(vpm);
if (cbuf != NULL) { OPENSSL_cleanse(cbuf,BUFSIZZ); OPENSSL_free(cbuf); }
if (sbuf != NULL) { OPENSSL_cleanse(sbuf,BUFSIZZ); OPENSSL_free(sbuf); }
if (mbuf != NULL) { OPENSSL_cleanse(mbuf,BUFSIZZ); OPENSSL_free(mbuf); }
@@ -1710,6 +1972,7 @@ static void print_stuff(BIO *bio, SSL *s, int full)
#ifndef OPENSSL_NO_COMP
const COMP_METHOD *comp, *expansion;
#endif
+ unsigned char *exportedkeymat;
if (full)
{
@@ -1800,7 +2063,7 @@ static void print_stuff(BIO *bio, SSL *s, int full)
BIO_number_read(SSL_get_rbio(s)),
BIO_number_written(SSL_get_wbio(s)));
}
- BIO_printf(bio,((s->hit)?"---\nReused, ":"---\nNew, "));
+ BIO_printf(bio,(SSL_cache_hit(s)?"---\nReused, ":"---\nNew, "));
c=SSL_get_current_cipher(s);
BIO_printf(bio,"%s, Cipher is %s\n",
SSL_CIPHER_get_version(c),
@@ -1822,8 +2085,21 @@ static void print_stuff(BIO *bio, SSL *s, int full)
BIO_printf(bio,"Expansion: %s\n",
expansion ? SSL_COMP_get_name(expansion) : "NONE");
#endif
+
+#ifdef SSL_DEBUG
+ {
+ /* Print out local port of connection: useful for debugging */
+ int sock;
+ struct sockaddr_in ladd;
+ socklen_t ladd_size = sizeof(ladd);
+ sock = SSL_get_fd(s);
+ getsockname(sock, (struct sockaddr *)&ladd, &ladd_size);
+ BIO_printf(bio_c_out, "LOCAL PORT is %u\n", ntohs(ladd.sin_port));
+ }
+#endif
-#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+#if !defined(OPENSSL_NO_TLSEXT)
+# if !defined(OPENSSL_NO_NEXTPROTONEG)
if (next_proto.status != -1) {
const unsigned char *proto;
unsigned int proto_len;
@@ -1832,9 +2108,60 @@ static void print_stuff(BIO *bio, SSL *s, int full)
BIO_write(bio, proto, proto_len);
BIO_write(bio, "\n", 1);
}
+ {
+ const unsigned char *proto;
+ unsigned int proto_len;
+ SSL_get0_alpn_selected(s, &proto, &proto_len);
+ if (proto_len > 0)
+ {
+ BIO_printf(bio, "ALPN protocol: ");
+ BIO_write(bio, proto, proto_len);
+ BIO_write(bio, "\n", 1);
+ }
+ else
+ BIO_printf(bio, "No ALPN negotiated\n");
+ }
+# endif
#endif
+#ifndef OPENSSL_NO_SRTP
+ {
+ SRTP_PROTECTION_PROFILE *srtp_profile=SSL_get_selected_srtp_profile(s);
+
+ if(srtp_profile)
+ BIO_printf(bio,"SRTP Extension negotiated, profile=%s\n",
+ srtp_profile->name);
+ }
+#endif
+
SSL_SESSION_print(bio,SSL_get_session(s));
+ if (keymatexportlabel != NULL)
+ {
+ BIO_printf(bio, "Keying material exporter:\n");
+ BIO_printf(bio, " Label: '%s'\n", keymatexportlabel);
+ BIO_printf(bio, " Length: %i bytes\n", keymatexportlen);
+ exportedkeymat = OPENSSL_malloc(keymatexportlen);
+ if (exportedkeymat != NULL)
+ {
+ if (!SSL_export_keying_material(s, exportedkeymat,
+ keymatexportlen,
+ keymatexportlabel,
+ strlen(keymatexportlabel),
+ NULL, 0, 0))
+ {
+ BIO_printf(bio, " Error\n");
+ }
+ else
+ {
+ BIO_printf(bio, " Keying material: ");
+ for (i=0; i<keymatexportlen; i++)
+ BIO_printf(bio, "%02X", exportedkeymat[i]);
+ BIO_printf(bio, "\n");
+ }
+ OPENSSL_free(exportedkeymat);
+ }
+ }
diff --git a/apps/s_server.c b/apps/s_server.c
--- a/apps/s_server.c
+++ b/apps/s_server.c
#endif
+#ifndef OPENSSL_NO_SRP
+#include <openssl/srp.h>
+#endif
#include "s_apps.h"
#include "timeouts.h"
@@ -290,6 +293,9 @@ static int cert_status_cb(SSL *s, void *arg);
static int s_msg=0;
static int s_quiet=0;
+static char *keymatexportlabel=NULL;
+static int keymatexportlen=20;
+
static int hack=0;
#ifndef OPENSSL_NO_ENGINE
static char *engine_id=NULL;
@@ -302,6 +308,7 @@ static long socket_mtu;
static int cert_chain = 0;
#endif
+
#ifndef OPENSSL_NO_PSK
static char *psk_identity="Client_identity";
char *psk_key=NULL; /* by default PSK is not used */
@@ -369,6 +376,52 @@ static unsigned int psk_server_cb(SSL *ssl, const char *identity,
}
#endif
+#ifndef OPENSSL_NO_SRP
+/* This is a context that we pass to callbacks */
+typedef struct srpsrvparm_st
+ {
+ char *login;
+ SRP_VBASE *vb;
+ SRP_user_pwd *user;
+ } srpsrvparm;
+
+/* This callback pretends to require some asynchronous logic in order to obtain
+ a verifier. When the callback is called for a new connection we return
+ with a negative value. This will provoke the accept etc. to return with
+ SSL_ERROR_WANT_X509_LOOKUP. The main logic then reinvokes the suspended
+ call (which would normally occur after a worker has finished) and we
+ set the user parameters.
+*/
+static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
+ {
+ srpsrvparm *p = (srpsrvparm *)arg;
+ if (p->login == NULL && p->user == NULL )
+ {
+ p->login = SSL_get_srp_username(s);
+ BIO_printf(bio_err, "SRP username = \"%s\"\n", p->login);
+ return (-1) ;
+ }
+
+ if (p->user == NULL)
+ {
+ BIO_printf(bio_err, "User %s doesn't exist\n", p->login);
+ return SSL3_AL_FATAL;
+ }
+ if (SSL_set_srp_server_param(s, p->user->N, p->user->g, p->user->s, p->user->v,
+ p->user->info) < 0)
+ {
+ *ad = SSL_AD_INTERNAL_ERROR;
+ return SSL3_AL_FATAL;
+ }
+ BIO_printf(bio_err, "SRP parameters set: username = \"%s\" info=\"%s\" \n", p->login,p->user->info);
+ /* need to check whether there are memory leaks */
+ p->user = NULL;
+ p->login = NULL;
+ return SSL_ERROR_NONE;
+ }
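+/* A negative return above suspends the handshake: SSL_accept() fails
+ * and SSL_get_error() reports SSL_ERROR_WANT_X509_LOOKUP. The main
+ * loop then loads the verifier with SRP_VBASE_get_by_user() and calls
+ * SSL_accept() again, re-entering this callback with p->user set. */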
+
+#endif
+
#ifdef MONOLITH
static void s_server_init(void)
{
@@ -455,9 +508,15 @@ static void sv_usage(void)
# ifndef OPENSSL_NO_JPAKE
BIO_printf(bio_err," -jpake arg - JPAKE secret to use\n");
# endif
+#endif
+#ifndef OPENSSL_NO_SRP
+ BIO_printf(bio_err," -srpvfile file - The verifier file for SRP\n");
+ BIO_printf(bio_err," -srpuserseed string - A seed string for a default user salt.\n");
#endif
BIO_printf(bio_err," -ssl2 - Just talk SSLv2\n");
BIO_printf(bio_err," -ssl3 - Just talk SSLv3\n");
+ BIO_printf(bio_err," -tls1_2 - Just talk TLSv1.2\n");
+ BIO_printf(bio_err," -tls1_1 - Just talk TLSv1.1\n");
BIO_printf(bio_err," -tls1 - Just talk TLSv1\n");
BIO_printf(bio_err," -dtls1 - Just talk DTLSv1\n");
BIO_printf(bio_err," -timeout - Enable timeouts\n");
@@ -466,6 +525,8 @@ static void sv_usage(void)
BIO_printf(bio_err," -no_ssl2 - Just disable SSLv2\n");
BIO_printf(bio_err," -no_ssl3 - Just disable SSLv3\n");
BIO_printf(bio_err," -no_tls1 - Just disable TLSv1\n");
+ BIO_printf(bio_err," -no_tls1_1 - Just disable TLSv1.1\n");
+ BIO_printf(bio_err," -no_tls1_2 - Just disable TLSv1.2\n");
#ifndef OPENSSL_NO_DH
BIO_printf(bio_err," -no_dhe - Disable ephemeral DH\n");
#endif
@@ -495,7 +556,12 @@ static void sv_usage(void)
# ifndef OPENSSL_NO_NEXTPROTONEG
BIO_printf(bio_err," -nextprotoneg arg - set the advertised protocols for the NPN extension (comma-separated list)\n");
# endif
+# ifndef OPENSSL_NO_SRTP
+ BIO_printf(bio_err," -use_srtp profiles - Offer SRTP key management with a colon-separated profile list\n");
+# endif
#endif
+ BIO_printf(bio_err," -keymatexport label - Export keying material using label\n");
+ BIO_printf(bio_err," -keymatexportlen len - Export len bytes of keying material (default 20)\n");
}
static int local_argc=0;
@@ -846,7 +912,9 @@ static int next_proto_cb(SSL *s, const unsigned char **data, unsigned int *len,
return SSL_TLSEXT_ERR_OK;
}
-# endif /* ndef OPENSSL_NO_NPN */
+# endif /* ndef OPENSSL_NO_NEXTPROTONEG */
+
+
#endif
int MAIN(int, char **);
@@ -854,6 +922,12 @@ int MAIN(int, char **);
#ifndef OPENSSL_NO_JPAKE
static char *jpake_secret = NULL;
#endif
+#ifndef OPENSSL_NO_SRP
+ static srpsrvparm srp_callback_parm;
+#endif
+#ifndef OPENSSL_NO_SRTP
+static char *srtp_profiles = NULL;
+#endif
int MAIN(int argc, char *argv[])
{
@@ -885,8 +959,6 @@ int MAIN(int argc, char *argv[])
#ifndef OPENSSL_NO_TLSEXT
EVP_PKEY *s_key2 = NULL;
X509 *s_cert2 = NULL;
-#endif
-#ifndef OPENSSL_NO_TLSEXT
tlsextctx tlsextcbp = {NULL, NULL, SSL_TLSEXT_ERR_ALERT_WARNING};
# ifndef OPENSSL_NO_NEXTPROTONEG
const char *next_proto_neg_in = NULL;
@@ -897,13 +969,11 @@ int MAIN(int argc, char *argv[])
/* by default do not send a PSK identity hint */
static char *psk_identity_hint=NULL;
#endif
-#if !defined(OPENSSL_NO_SSL2) && !defined(OPENSSL_NO_SSL3)
- meth=SSLv23_server_method();
-#elif !defined(OPENSSL_NO_SSL3)
- meth=SSLv3_server_method();
-#elif !defined(OPENSSL_NO_SSL2)
- meth=SSLv2_server_method();
+#ifndef OPENSSL_NO_SRP
+ char *srpuserseed = NULL;
+ char *srp_verifier_file = NULL;
#endif
+ meth=SSLv23_server_method();
local_argc=argc;
local_argv=argv;
@@ -1128,12 +1198,26 @@ int MAIN(int argc, char *argv[])
psk_key=*(++argv);
for (i=0; i<strlen(psk_key); i++)
{
if (isxdigit((int)psk_key[i]))
continue;
BIO_printf(bio_err,"Not a hex number '%s'\n",*argv);
goto bad;
}
}
#endif
+#ifndef OPENSSL_NO_SRP
+ else if (strcmp(*argv, "-srpvfile") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srp_verifier_file = *(++argv);
+ meth = TLSv1_server_method();
+ }
+ else if (strcmp(*argv, "-srpuserseed") == 0)
+ {
+ if (--argc < 1) goto bad;
+ srpuserseed = *(++argv);
+ meth = TLSv1_server_method();
+ }
+#endif
#ifndef OPENSSL_NO_KRB5
- if ((con->kssl_ctx = kssl_ctx_new()) != NULL)
+ if ((kctx = kssl_ctx_new()) != NULL)
{
- kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVICE,
- KRB5SVC);
- kssl_ctx_setstring(con->kssl_ctx, KSSL_KEYTAB,
- KRB5KEYTAB);
+ SSL_set0_kssl_ctx(con, kctx);
+ kssl_ctx_setstring(kctx, KSSL_SERVICE, KRB5SVC);
+ kssl_ctx_setstring(kctx, KSSL_KEYTAB, KRB5KEYTAB);
}
#endif /* OPENSSL_NO_KRB5 */
if(context)
@@ -1924,7 +2068,7 @@ static int sv_body(char *hostname, int s, unsigned char *context)
if (s_debug)
{
- con->debug=1;
+ SSL_set_debug(con, 1);
BIO_set_callback(SSL_get_rbio(con),bio_dump_callback);
BIO_set_callback_arg(SSL_get_rbio(con),(char *)bio_s_out);
}
@@ -2053,6 +2197,16 @@ static int sv_body(char *hostname, int s, unsigned char *context)
goto err;
}
+#ifndef OPENSSL_NO_HEARTBEATS
+ if ((buf[0] == 'B') &&
+ ((buf[1] == '\n') || (buf[1] == '\r')))
+ {
+ BIO_printf(bio_err,"HEARTBEATING\n");
+ SSL_heartbeat(con);
+ i=0;
+ continue;
+ }
+#endif
if ((buf[0] == 'r') &&
((buf[1] == '\n') || (buf[1] == '\r')))
{
@@ -2096,6 +2250,18 @@ static int sv_body(char *hostname, int s, unsigned char *context)
{ static count=0; if (++count == 100) { count=0; SSL_renegotiate(con); } }
#endif
k=SSL_write(con,&(buf[l]),(unsigned int)i);
+#ifndef OPENSSL_NO_SRP
+ while (SSL_get_error(con,k) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP renego during write\n");
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ k=SSL_write(con,&(buf[l]),(unsigned int)i);
+ }
+#endif
switch (SSL_get_error(con,k))
{
case SSL_ERROR_NONE:
@@ -2143,6 +2309,18 @@ static int sv_body(char *hostname, int s, unsigned char *context)
{
again:
i=SSL_read(con,(char *)buf,bufsize);
+#ifndef OPENSSL_NO_SRP
+ while (SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP renego during read\n");
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ i=SSL_read(con,(char *)buf,bufsize);
+ }
+#endif
switch (SSL_get_error(con,i))
{
case SSL_ERROR_NONE:
@@ -2155,7 +2333,6 @@ static int sv_body(char *hostname, int s, unsigned char *context)
break;
case SSL_ERROR_WANT_WRITE:
case SSL_ERROR_WANT_READ:
- case SSL_ERROR_WANT_X509_LOOKUP:
BIO_printf(bio_s_out,"Read BLOCK\n");
break;
case SSL_ERROR_SYSCALL:
@@ -2210,12 +2387,30 @@ static int init_ssl_connection(SSL *con)
X509 *peer;
long verify_error;
MS_STATIC char buf[BUFSIZ];
+#ifndef OPENSSL_NO_KRB5
+ char *client_princ;
+#endif
#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
const unsigned char *next_proto_neg;
unsigned next_proto_neg_len;
#endif
+ unsigned char *exportedkeymat;
+
- if ((i=SSL_accept(con)) <= 0)
+ i=SSL_accept(con);
+#ifndef OPENSSL_NO_SRP
+ while (i <= 0 && SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP during accept %s\n",srp_callback_parm.login);
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ i=SSL_accept(con);
+ }
+#endif
+ if (i <= 0)
{
if (BIO_sock_should_retry(i))
{
@@ -2253,6 +2448,7 @@ static int init_ssl_connection(SSL *con)
BIO_printf(bio_s_out,"Shared ciphers:%s\n",buf);
str=SSL_CIPHER_get_name(SSL_get_current_cipher(con));
BIO_printf(bio_s_out,"CIPHER is %s\n",(str != NULL)?str:"(NONE)");
+
#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
SSL_get0_next_proto_negotiated(con, &next_proto_neg, &next_proto_neg_len);
if (next_proto_neg)
@@ -2262,19 +2458,60 @@ static int init_ssl_connection(SSL *con)
BIO_printf(bio_s_out, "\n");
}
#endif
- if (con->hit) BIO_printf(bio_s_out,"Reused session-id\n");
+#ifndef OPENSSL_NO_SRTP
+ {
+ SRTP_PROTECTION_PROFILE *srtp_profile
+ = SSL_get_selected_srtp_profile(con);
+
+ if(srtp_profile)
+ BIO_printf(bio_s_out,"SRTP Extension negotiated, profile=%s\n",
+ srtp_profile->name);
+ }
+#endif
+ if (SSL_cache_hit(con)) BIO_printf(bio_s_out,"Reused session-id\n");
if (SSL_ctrl(con,SSL_CTRL_GET_FLAGS,0,NULL) &
TLS1_FLAGS_TLS_PADDING_BUG)
- BIO_printf(bio_s_out,"Peer has incorrect TLSv1 block padding\n");
+ BIO_printf(bio_s_out,
+ "Peer has incorrect TLSv1 block padding\n");
#ifndef OPENSSL_NO_KRB5
- if (con->kssl_ctx->client_princ != NULL)
+ client_princ = kssl_ctx_get0_client_princ(SSL_get0_kssl_ctx(con));
+ if (client_princ != NULL)
{
BIO_printf(bio_s_out,"Kerberos peer principal is %s\n",
- con->kssl_ctx->client_princ);
+ client_princ);
}
#endif /* OPENSSL_NO_KRB5 */
BIO_printf(bio_s_out, "Secure Renegotiation IS%s supported\n",
SSL_get_secure_renegotiation_support(con) ? "" : " NOT");
+ if (keymatexportlabel != NULL)
+ {
+ BIO_printf(bio_s_out, "Keying material exporter:\n");
+ BIO_printf(bio_s_out, " Label: '%s'\n", keymatexportlabel);
+ BIO_printf(bio_s_out, " Length: %i bytes\n",
+ keymatexportlen);
+ exportedkeymat = OPENSSL_malloc(keymatexportlen);
+ if (exportedkeymat != NULL)
+ {
+ if (!SSL_export_keying_material(con, exportedkeymat,
+ keymatexportlen,
+ keymatexportlabel,
+ strlen(keymatexportlabel),
+ NULL, 0, 0))
+ {
+ BIO_printf(bio_s_out, " Error\n");
+ }
+ else
+ {
+ BIO_printf(bio_s_out, " Keying material: ");
+ for (i=0; i<keymatexportlen; i++)
+ BIO_printf(bio_s_out, "%02X", exportedkeymat[i]);
+ BIO_printf(bio_s_out, "\n");
+ }
+ OPENSSL_free(exportedkeymat);
+ }
+ }
#ifndef OPENSSL_NO_KRB5
- if ((con->kssl_ctx = kssl_ctx_new()) != NULL)
+ if ((kctx = kssl_ctx_new()) != NULL)
{
- kssl_ctx_setstring(con->kssl_ctx, KSSL_SERVICE, KRB5SVC);
- kssl_ctx_setstring(con->kssl_ctx, KSSL_KEYTAB, KRB5KEYTAB);
+ kssl_ctx_setstring(kctx, KSSL_SERVICE, KRB5SVC);
+ kssl_ctx_setstring(kctx, KSSL_KEYTAB, KRB5KEYTAB);
}
#endif /* OPENSSL_NO_KRB5 */
if(context) SSL_set_session_id_context(con, context,
@@ -2382,7 +2625,7 @@ static int www_body(char *hostname, int s, unsigned char *context)
if (s_debug)
{
- con->debug=1;
+ SSL_set_debug(con, 1);
BIO_set_callback(SSL_get_rbio(con),bio_dump_callback);
BIO_set_callback_arg(SSL_get_rbio(con),(char *)bio_s_out);
}
@@ -2397,7 +2640,18 @@ static int www_body(char *hostname, int s, unsigned char *context)
if (hack)
{
i=SSL_accept(con);
-
+#ifndef OPENSSL_NO_SRP
+ while (i <= 0 && SSL_get_error(con,i) == SSL_ERROR_WANT_X509_LOOKUP)
+ {
+ BIO_printf(bio_s_out,"LOOKUP during accept %s\n",srp_callback_parm.login);
+ srp_callback_parm.user = SRP_VBASE_get_by_user(srp_callback_parm.vb, srp_callback_parm.login);
+ if (srp_callback_parm.user)
+ BIO_printf(bio_s_out,"LOOKUP done %s\n",srp_callback_parm.user->info);
+ else
+ BIO_printf(bio_s_out,"LOOKUP not successful\n");
+ i=SSL_accept(con);
+ }
+#endif
switch (SSL_get_error(con,i))
{
case SSL_ERROR_NONE:
@@ -2465,6 +2719,11 @@ static int www_body(char *hostname, int s, unsigned char *context)
}
BIO_puts(io,"\n");
+ BIO_printf(io,
+ "Secure Renegotiation IS%s supported\n",
+ SSL_get_secure_renegotiation_support(con) ?
+ "" : " NOT");
+
/* The following is evil and should not really
* be done */
BIO_printf(io,"Ciphers supported in s_server binary\n");
@@ -2503,7 +2762,7 @@ static int www_body(char *hostname, int s, unsigned char *context)
}
BIO_puts(io,"\n");
}
- BIO_printf(io,((con->hit)
+ BIO_printf(io,(SSL_cache_hit(con)
?"---\nReused, "
:"---\nNew, "));
c=SSL_get_current_cipher(con);
diff --git a/apps/s_socket.c b/apps/s_socket.c
index c08544a..380efdb 100644
--- a/apps/s_socket.c
+++ b/apps/s_socket.c
@@ -238,11 +238,10 @@ int init_client(int *sock, char *host, int port, int type)
{
unsigned char ip[4];
+ memset(ip, '\0', sizeof ip);
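+ /* Defensive zero-fill: ip[] never carries stack garbage forward,
+ * and memory checkers stay quiet about it. */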
if (!host_ip(host,&(ip[0])))
- {
- return(0);
- }
- return(init_client_ip(sock,ip,port,type));
+ return 0;
+ return init_client_ip(sock,ip,port,type);
}
static int init_client_ip(int *sock, unsigned char ip[4], int port, int type)
diff --git a/apps/server.pem b/apps/server.pem
index 56248e5..d0fc265 100644
--- a/apps/server.pem
+++ b/apps/server.pem
@@ -1,369 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (512 bit)
------BEGIN CERTIFICATE-----
-MIIB6TCCAVICAQYwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNMDAxMDE2MjIzMTAzWhcNMDMwMTE0
-MjIzMTAzWjBjMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxIzAhBgNVBAMTGlNlcnZlciB0ZXN0IGNl
-cnQgKDUxMiBiaXQpMFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBAJ+zw4Qnlf8SMVIP
-Fe9GEcStgOY2Ww/dgNdhjeD8ckUJNP5VZkVDTGiXav6ooKXfX3j/7tdkuD8Ey2//
-Kv7+ue0CAwEAATANBgkqhkiG9w0BAQQFAAOBgQCT0grFQeZaqYb5EYfk20XixZV4
-GmyAbXMftG1Eo7qGiMhYzRwGNWxEYojf5PZkYZXvSqZ/ZXHXa4g59jK/rJNnaVGM
-k+xIX8mxQvlV0n5O9PIha5BX5teZnkHKgL8aKKLKW1BK7YTngsfSzzaeame5iKfz
-itAE+OjGF+PFKbwX8Q==
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIIBPAIBAAJBAJ+zw4Qnlf8SMVIPFe9GEcStgOY2Ww/dgNdhjeD8ckUJNP5VZkVD
-TGiXav6ooKXfX3j/7tdkuD8Ey2//Kv7+ue0CAwEAAQJAN6W31vDEP2DjdqhzCDDu
-OA4NACqoiFqyblo7yc2tM4h4xMbC3Yx5UKMN9ZkCtX0gzrz6DyF47bdKcWBzNWCj
-gQIhANEoojVt7hq+SQ6MCN6FTAysGgQf56Q3TYoJMoWvdiXVAiEAw3e3rc+VJpOz
-rHuDo6bgpjUAAXM+v3fcpsfZSNO6V7kCIQCtbVjanpUwvZkMI9by02oUk9taki3b
-PzPfAfNPYAbCJQIhAJXNQDWyqwn/lGmR11cqY2y9nZ1+5w3yHGatLrcDnQHxAiEA
-vnlEGo8K85u+KwIOimM48ZG8oTk7iFdkqLJR1utT3aU=
------END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
+-----BEGIN CERTIFICATE-----
+MIID5zCCAs+gAwIBAgIJALnu1NlVpZ6zMA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZDELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxGTAXBgNVBAMMEFRlc3QgU2VydmVyIENlcnQw
+ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDzhPOSNtyyRspmeuUpxfNJ
+KCLTuf7g3uQ4zu4iHOmRO5TQci+HhVlLZrHF9XqFXcIP0y4pWDbMSGuiorUmzmfi
+R7bfSdI/+qIQt8KXRH6HNG1t8ou0VSvWId5TS5Dq/er5ODUr9OaaDva7EquHIcMv
+vPQGuI+OEAcnleVCy9HVEIySrO4P3CNIicnGkwwiAud05yUAq/gPXBC1hTtmlPD7
+TVcGVSEiJdvzqqlgv02qedGrkki6GY4S7GjZxrrf7Foc2EP+51LJzwLQx3/JfrCU
+41NEWAsu/Sl0tQabXESN+zJ1pDqoZ3uHMgpQjeGiE0olr+YcsSW/tJmiU9OiAr8R
+AgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJYIZI
+AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW
+BBSCvM8AABPR9zklmifnr9LvIBturDAfBgNVHSMEGDAWgBQ2w2yI55X+sL3szj49
+hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEAqb1NV0B0/pbpK9Z4/bNjzPQLTRLK
+WnSNm/Jh5v0GEUOE/Beg7GNjNrmeNmqxAlpqWz9qoeoFZax+QBpIZYjROU3TS3fp
+yLsrnlr0CDQ5R7kCCDGa8dkXxemmpZZLbUCpW2Uoy8sAA4JjN9OtsZY7dvUXFgJ7
+vVNTRnI01ghknbtD+2SxSQd3CWF6QhcRMAzZJ1z1cbbwGDDzfvGFPzJ+Sq+zEPds
+xoVLLSetCiBc+40ZcDS5dV98h9XD7JMTQfxzA7mNGv73JoZJA6nFgj+ADSlJsY/t
+JBv+z1iQRueoh9Qeee+ZbRifPouCB8FDx+AltvHTANdAq0t/K3o+pplMVA==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
+MIIEpAIBAAKCAQEA84TzkjbcskbKZnrlKcXzSSgi07n+4N7kOM7uIhzpkTuU0HIv
+h4VZS2axxfV6hV3CD9MuKVg2zEhroqK1Js5n4ke230nSP/qiELfCl0R+hzRtbfKL
+tFUr1iHeU0uQ6v3q+Tg1K/Tmmg72uxKrhyHDL7z0BriPjhAHJ5XlQsvR1RCMkqzu
+D9wjSInJxpMMIgLndOclAKv4D1wQtYU7ZpTw+01XBlUhIiXb86qpYL9NqnnRq5JI
+uhmOEuxo2ca63+xaHNhD/udSyc8C0Md/yX6wlONTRFgLLv0pdLUGm1xEjfsydaQ6
+qGd7hzIKUI3hohNKJa/mHLElv7SZolPTogK/EQIDAQABAoIBAADq9FwNtuE5IRQn
+zGtO4q7Y5uCzZ8GDNYr9RKp+P2cbuWDbvVAecYq2NV9QoIiWJOAYZKklOvekIju3
+r0UZLA0PRiIrTg6NrESx3JrjWDK8QNlUO7CPTZ39/K+FrmMkV9lem9yxjJjyC34D
+AQB+YRTx+l14HppjdxNwHjAVQpIx/uO2F5xAMuk32+3K+pq9CZUtrofe1q4Agj9R
+5s8mSy9pbRo9kW9wl5xdEotz1LivFOEiqPUJTUq5J5PeMKao3vdK726XI4Z455Nm
+W2/MA0YV0ug2FYinHcZdvKM6dimH8GLfa3X8xKRfzjGjTiMSwsdjgMa4awY3tEHH
+674jhAECgYEA/zqMrc0zsbNk83sjgaYIug5kzEpN4ic020rSZsmQxSCerJTgNhmg
+utKSCt0Re09Jt3LqG48msahX8ycqDsHNvlEGPQSbMu9IYeO3Wr3fAm75GEtFWePY
+BhM73I7gkRt4s8bUiUepMG/wY45c5tRF23xi8foReHFFe9MDzh8fJFECgYEA9EFX
+4qAik1pOJGNei9BMwmx0I0gfVEIgu0tzeVqT45vcxbxr7RkTEaDoAG6PlbWP6D9a
+WQNLp4gsgRM90ZXOJ4up5DsAWDluvaF4/omabMA+MJJ5kGZ0gCj5rbZbKqUws7x8
+bp+6iBfUPJUbcqNqFmi/08Yt7vrDnMnyMw2A/sECgYEAiiuRMxnuzVm34hQcsbhH
+6ymVqf7j0PW2qK0F4H1ocT9qhzWFd+RB3kHWrCjnqODQoI6GbGr/4JepHUpre1ex
+4UEN5oSS3G0ru0rC3U4C59dZ5KwDHFm7ffZ1pr52ljfQDUsrjjIMRtuiwNK2OoRa
+WSsqiaL+SDzSB+nBmpnAizECgYBdt/y6rerWUx4MhDwwtTnel7JwHyo2MDFS6/5g
+n8qC2Lj6/fMDRE22w+CA2esp7EJNQJGv+b27iFpbJEDh+/Lf5YzIT4MwVskQ5bYB
+JFcmRxUVmf4e09D7o705U/DjCgMH09iCsbLmqQ38ONIRSHZaJtMDtNTHD1yi+jF+
+OT43gQKBgQC/2OHZoko6iRlNOAQ/tMVFNq7fL81GivoQ9F1U0Qr+DH3ZfaH8eIkX
+xT0ToMPJUzWAn8pZv0snA0um6SIgvkCuxO84OkANCVbttzXImIsL7pFzfcwV/ERK
+UM6j0ZuSMFOCr/lGPAoOQU0fskidGEHi1/kW+suSr28TqsyYZpwBDQ==
-----END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8=
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----
diff --git a/apps/server2.pem b/apps/server2.pem
index 8bb6641..a3927cf 100644
--- a/apps/server2.pem
+++ b/apps/server2.pem
@@ -1,376 +1,52 @@
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Server test cert (1024 bit)
------BEGIN CERTIFICATE-----
-MIICLjCCAZcCAQEwDQYJKoZIhvcNAQEEBQAwWzELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYD
-VQQDExJUZXN0IENBICgxMDI0IGJpdCkwHhcNOTcwNjA5MTM1NzU0WhcNOTgwNjA5
-MTM1NzU0WjBkMQswCQYDVQQGEwJBVTETMBEGA1UECBMKUXVlZW5zbGFuZDEaMBgG
-A1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxJDAiBgNVBAMTG1NlcnZlciB0ZXN0IGNl
-cnQgKDEwMjQgYml0KTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAsxH1PBPm
-RkxrR11eV4bzNi4N9n11CI8nV29+ARlT1+qDe/mjVUvXlmsr1v/vf71G9GgqopSa
-6RXrICLVdk/FYYYzhPvl1M+OrjaXDFO8BzBAF1Lnz6c7aRZvGRJNrRSr2nZEkqDf
-JW9dY7r2VZEpD5QeuaRYUnuECkqeieB65GMCAwEAATANBgkqhkiG9w0BAQQFAAOB
-gQCWsOta6C0wiVzXz8wPmJKyTrurMlgUss2iSuW9366iwofZddsNg7FXniMzkIf6
-dp7jnmWZwKZ9cXsNUS2o4OL07qOk2HOywC0YsNZQsOBu1CBTYYkIefDiKFL1zQHh
-8lwwNd4NP+OE3NzUNkCfh4DnFfg9WHkXUlD5UpxNRJ4gJA==
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQCzEfU8E+ZGTGtHXV5XhvM2Lg32fXUIjydXb34BGVPX6oN7+aNV
-S9eWayvW/+9/vUb0aCqilJrpFesgItV2T8VhhjOE++XUz46uNpcMU7wHMEAXUufP
-pztpFm8ZEk2tFKvadkSSoN8lb11juvZVkSkPlB65pFhSe4QKSp6J4HrkYwIDAQAB
-AoGBAKy8jvb0Lzby8q11yNLf7+78wCVdYi7ugMHcYA1JVFK8+zb1WfSm44FLQo/0
-dSChAjgz36TTexeLODPYxleJndjVcOMVzsLJjSM8dLpXsTS4FCeMbhw2s2u+xqKY
-bbPWfk+HOTyJjfnkcC5Nbg44eOmruq0gSmBeUXVM5UntlTnxAkEA7TGCA3h7kx5E
-Bl4zl2pc3gPAGt+dyfk5Po9mGJUUXhF5p2zueGmYWW74TmOWB1kzt4QRdYMzFePq
-zfDNXEa1CwJBAMFErdY0xp0UJ13WwBbUTk8rujqQdHtjw0klhpbuKkjxu2hN0wwM
-6p0D9qxF7JHaghqVRI0fAW/EE0OzdHMR9QkCQQDNR26dMFXKsoPu+vItljj/UEGf
-QG7gERiQ4yxaFBPHgdpGo0kT31eh9x9hQGDkxTe0GNG/YSgCRvm8+C3TMcKXAkBD
-dhGn36wkUFCddMSAM4NSJ1VN8/Z0y5HzCmI8dM3VwGtGMUQlxKxwOl30LEQzdS5M
-0SWojNYXiT2gOBfBwtbhAkEAhafl5QEOIgUz+XazS/IlZ8goNKdDVfYgK3mHHjvv
-nY5G+AuGebdNkXJr4KSWxDcN+C2i47zuj4QXA16MAOandA==
------END RSA PRIVATE KEY-----
-subject=/C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-issuer= /C=US/O=AT&T Bell Laboratories/OU=Prototype Research CA
-notBefore=950413210656Z
-notAfter =970412210656Z
------BEGIN X509 CERTIFICATE-----
-
-MIICCDCCAXECAQAwDQYJKoZIhvcNAQEEBQAwTjELMAkGA1UEBhMCVVMxHzAdBgNV
-BAoUFkFUJlQgQmVsbCBMYWJvcmF0b3JpZXMxHjAcBgNVBAsUFVByb3RvdHlwZSBS
-ZXNlYXJjaCBDQTAeFw05NTA0MTMyMTA2NTZaFw05NzA0MTIyMTA2NTZaME4xCzAJ
-BgNVBAYTAlVTMR8wHQYDVQQKFBZBVCZUIEJlbGwgTGFib3JhdG9yaWVzMR4wHAYD
-VQQLFBVQcm90b3R5cGUgUmVzZWFyY2ggQ0EwgZwwDQYJKoZIhvcNAQEBBQADgYoA
-MIGGAoGAebOmgtSCl+wCYZc86UGYeTLY8cjmW2P0FN8ToT/u2pECCoFdrlycX0OR
-3wt0ZhpFXLVNeDnHwEE9veNUih7pCL2ZBFqoIoQkB1lZmXRiVtjGonz8BLm/qrFM
-YHb0lme/Ol+s118mwKVxnn6bSAeI/OXKhLaVdYZWk+aEaxEDkVkCAQ8wDQYJKoZI
-hvcNAQEEBQADgYEAAZMG14lZmZ8bahkaHaTV9dQf4p2FZiQTFwHP9ZyGsXPC+LT5
-dG5iTaRmyjNIJdPWohZDl97kAci79aBndvuEvRKOjLHs3WRGBIwERnAcnY9Mz8u/
-zIHK23PjYVxGGaZd669OJwD0CYyqH22HH9nFUGaoJdsv39ChW0NRdLE9+y8=
------END X509 CERTIFICATE-----
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test CA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJjCCAY8CAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTc0M1oXDTAxMDYw
-OTEzNTc0M1owWzELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRswGQYDVQQDExJUZXN0IENBICgxMDI0
-IGJpdCkwgZ8wDQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBAKO7o8t116VP6cgybTsZ
-DCZhr95nYlZuya3aCi1IKoztqwWnjbmDFIriOqGFPrZQ+moMETC9D59iRW/dFXSv
-1F65ka/XY2hLh9exCCo7XuUcDs53Qp3bI3AmMqHjgzE8oO3ajyJAzJkTTOUecQU2
-mw/gI4tMM0LqWMQS7luTy4+xAgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAM7achv3v
-hLQJcv/65eGEpBXM40ZDVoFQFFJWaY5p883HTqLB1x4FdzsXHH0QKBTcKpWwqyu4
-YDm3fb8oDugw72bCzfyZK/zVZPR/hVlqI/fvU109Qoc+7oPvIXWky71HfcK6ZBCA
-q30KIqGM/uoM60INq97qjDmCJapagcNBGQs=
------END CERTIFICATE-----
------BEGIN RSA PRIVATE KEY-----
-MIICXQIBAAKBgQCju6PLddelT+nIMm07GQwmYa/eZ2JWbsmt2gotSCqM7asFp425
-gxSK4jqhhT62UPpqDBEwvQ+fYkVv3RV0r9ReuZGv12NoS4fXsQgqO17lHA7Od0Kd
-2yNwJjKh44MxPKDt2o8iQMyZE0zlHnEFNpsP4COLTDNC6ljEEu5bk8uPsQIDAQAB
-AoGAVZmpFZsDZfr0l2S9tLLwpjRWNOlKATQkno6q2WesT0eGLQufTciY+c8ypfU6
-hyio8r5iUl/VhhdjhAtKx1mRpiotftHo/eYf8rtsrnprOnWG0bWjLjtIoMbcxGn2
-J3bN6LJmbJMjDs0eJ3KnTu646F3nDUw2oGAwmpzKXA1KAP0CQQDRvQhxk2D3Pehs
-HvG665u2pB5ipYQngEFlZO7RHJZzJOZEWSLuuMqaF/7pTfA5jiBvWqCgJeCRRInL
-21ru4dlPAkEAx9jj7BgKn5TYnMoBSSe0afjsV9oApVpN1Nacb1YDtCwy+scp3++s
-nFxlv98wxIlSdpwMUn+AUWfjiWR7Tu/G/wJBAJ/KjwZIrFVxewP0x2ILYsTRYLzz
-MS4PDsO7FB+I0i7DbBOifXS2oNSpd3I0CNMwrxFnUHzynpbOStVfN3ZL5w0CQQCa
-pwFahxBRhkJKsxhjoFJBX9yl75JoY4Wvm5Tbo9ih6UJaRx3kqfkN14L2BKYcsZgb
-KY9vmDOYy6iNfjDeWTfJAkBkfPUb8oTJ/nSP5zN6sqGxSY4krc4xLxpRmxoJ8HL2
-XfhqXkTzbU13RX9JJ/NZ8vQN9Vm2NhxRGJocQkmcdVtJ
------END RSA PRIVATE KEY-----
------BEGIN X509 CERTIFICATE-----
-MIICYDCCAiACAgEoMAkGBSsOAwINBQAwfDELMAkGA1UEBhMCVVMxNjA0BgNVBAoT
-LU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFuZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZ
-MBcGA1UECxMQVGVzdCBFbnZpcm9ubWVudDEaMBgGA1UECxMRRFNTLU5BU0EtUGls
-b3QtQ0EwHhcNOTYwMjI2MTYzMjQ1WhcNOTcwMjI1MTYzMjQ1WjB8MQswCQYDVQQG
-EwJVUzE2MDQGA1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFk
-bWluaXN0cmF0aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MRowGAYDVQQL
-ExFEU1MtTkFTQS1QaWxvdC1DQTCB8jAJBgUrDgMCDAUAA4HkADCB4AJBAMA/ssKb
-hPNUG7ZlASfVwEJU21O5OyF/iyBzgHI1O8eOhJGUYO8cc8wDMjR508Mr9cp6Uhl/
-ZB7FV5GkLNEnRHYCQQDUEaSg45P2qrDwixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLb
-bn3QK74T2IxY1yY+kCNq8XrIqf5fJJzIH0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3
-fVd0geUCQQCzCFUQAh+ZkEmp5804cs6ZWBhrUAfnra8lJItYo9xPcXgdIfLfibcX
-R71UsyO77MRD7B0+Ag2tq794IleCVcEEMAkGBSsOAwINBQADLwAwLAIUUayDfreR
-Yh2WeU86/pHNdkUC1IgCFEfxe1f0oMpxJyrJ5XIxTi7vGdoK
------END X509 CERTIFICATE-----
------BEGIN X509 CERTIFICATE-----
-
-MIICGTCCAdgCAwCqTDAJBgUrDgMCDQUAMHwxCzAJBgNVBAYTAlVTMTYwNAYDVQQK
-Ey1OYXRpb25hbCBBZXJvbmF1dGljcyBhbmQgU3BhY2UgQWRtaW5pc3RyYXRpb24x
-GTAXBgNVBAsTEFRlc3QgRW52aXJvbm1lbnQxGjAYBgNVBAsTEURTUy1OQVNBLVBp
-bG90LUNBMB4XDTk2MDUxNDE3MDE0MVoXDTk3MDUxNDE3MDE0MVowMzELMAkGA1UE
-BhMCQVUxDzANBgNVBAoTBk1pbmNvbTETMBEGA1UEAxMKRXJpYyBZb3VuZzCB8jAJ
-BgUrDgMCDAUAA4HkADCB4AJBAKbfHz6vE6pXXMTpswtGUec2tvnfLJUsoxE9qs4+
-ObZX7LmLvragNPUeiTJx7UOWZ5DfBj6bXLc8eYne0lP1g3ACQQDUEaSg45P2qrDw
-ixTRhFhmWz5Nvc4lRFQ/42XPcchiJBLbbn3QK74T2IxY1yY+kCNq8XrIqf5fJJzI
-H0J/xUP3AhUAsg2wsQHfDGYk/BOSulX3fVd0geUCQQCzCFUQAh+ZkEmp5804cs6Z
-WBhrUAfnra8lJItYo9xPcXgdIfLfibcXR71UsyO77MRD7B0+Ag2tq794IleCVcEE
-MAkGBSsOAwINBQADMAAwLQIUWsuuJRE3VT4ueWkWMAJMJaZjj1ECFQCYY0zX4bzM
-LC7obsrHD8XAHG+ZRG==
------END X509 CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICTTCCAbagAwIBAgIBADANBgkqhkiG9w0BAQQFADBMMQswCQYDVQQGEwJHQjEM
-MAoGA1UEChMDVUNMMRgwFgYDVQQLEw9JQ0UtVEVMIFByb2plY3QxFTATBgNVBAMT
-DFRydXN0RmFjdG9yeTAeFw05NzA0MjIxNDM5MTRaFw05ODA0MjIxNDM5MTRaMEwx
-CzAJBgNVBAYTAkdCMQwwCgYDVQQKEwNVQ0wxGDAWBgNVBAsTD0lDRS1URUwgUHJv
-amVjdDEVMBMGA1UEAxMMVHJ1c3RGYWN0b3J5MIGcMAoGBFUIAQECAgQAA4GNADCB
-iQKBgQCEieR8NcXkUW1f0G6aC6u0i8q/98JqS6RxK5YmHIGKCkuTWAUjzLfUa4dt
-U9igGCjTuxaDqlzEim+t/02pmiBZT9HaX++35MjQPUWmsChcYU5WyzGErXi+rQaw
-zlwS73zM8qiPj/97lXYycWhgL0VaiDSPxRXEUdWoaGruom4mNQIDAQABo0IwQDAd
-BgNVHQ4EFgQUHal1LZr7oVg5z6lYzrhTgZRCmcUwDgYDVR0PAQH/BAQDAgH2MA8G
-A1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQEEBQADgYEAfaggfl6FZoioecjv0dq8
-/DXo/u11iMZvXn08gjX/zl2b4wtPbShOSY5FhkSm8GeySasz+/Nwb/uzfnIhokWi
-lfPZHtlCWtXbIy/TN51eJyq04ceDCQDWvLC2enVg9KB+GJ34b5c5VaPRzq8MBxsA
-S7ELuYGtmYgYm9NZOIr7yU0=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIIB6jCCAZQCAgEtMA0GCSqGSIb3DQEBBAUAMIGAMQswCQYDVQQGEwJVUzE2MDQG
-A1UEChMtTmF0aW9uYWwgQWVyb25hdXRpY3MgYW5kIFNwYWNlIEFkbWluaXN0cmF0
-aW9uMRkwFwYDVQQLExBUZXN0IEVudmlyb25tZW50MR4wHAYDVQQLExVNRDUtUlNB
-LU5BU0EtUGlsb3QtQ0EwHhcNOTYwNDMwMjIwNTAwWhcNOTcwNDMwMjIwNTAwWjCB
-gDELMAkGA1UEBhMCVVMxNjA0BgNVBAoTLU5hdGlvbmFsIEFlcm9uYXV0aWNzIGFu
-ZCBTcGFjZSBBZG1pbmlzdHJhdGlvbjEZMBcGA1UECxMQVGVzdCBFbnZpcm9ubWVu
-dDEeMBwGA1UECxMVTUQ1LVJTQS1OQVNBLVBpbG90LUNBMFkwCgYEVQgBAQICAgAD
-SwAwSAJBALmmX5+GqAvcrWK13rfDrNX9UfeA7f+ijyBgeFQjYUoDpFqapw4nzQBL
-bAXug8pKkRwa2Zh8YODhXsRWu2F/UckCAwEAATANBgkqhkiG9w0BAQQFAANBAH9a
-OBA+QCsjxXgnSqHx04gcU8S49DVUb1f2XVoLnHlIb8RnX0k5O6mpHT5eti9bLkiW
-GJNMJ4L0AJ/ac+SmHZc=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIICajCCAdMCBDGA0QUwDQYJKoZIhvcNAQEEBQAwfTELMAkGA1UEBhMCQ2ExDzAN
-BgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmlsaXR5IEFjY2VwdGVkMR8w
-HQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRwwGgYDVQQDExNFbnRydXN0
-IERlbW8gV2ViIENBMB4XDTk2MDQyNjEzMzUwMVoXDTA2MDQyNjEzMzUwMVowfTEL
-MAkGA1UEBhMCQ2ExDzANBgNVBAcTBk5lcGVhbjEeMBwGA1UECxMVTm8gTGlhYmls
-aXR5IEFjY2VwdGVkMR8wHQYDVQQKExZGb3IgRGVtbyBQdXJwb3NlcyBPbmx5MRww
-GgYDVQQDExNFbnRydXN0IERlbW8gV2ViIENBMIGdMA0GCSqGSIb3DQEBAQUAA4GL
-ADCBhwKBgQCaroS7O1DA0hm4IefNYU1cx/nqOmzEnk291d1XqznDeF4wEgakbkCc
-zTKxK791yNpXG5RmngqH7cygDRTHZJ6mfCRn0wGC+AI00F2vYTGqPGRQL1N3lZT0
-YDKFC0SQeMMjFIZ1aeQigroFQnHo0VB3zWIMpNkka8PY9lxHZAmWwQIBAzANBgkq
-hkiG9w0BAQQFAAOBgQBAx0UMVA1s54lMQyXjMX5kj99FJN5itb8bK1Rk+cegPQPF
-cWO9SEWyEjjBjIkjjzAwBkaEszFsNGxemxtXvwjIm1xEUMTVlPEWTs2qnDvAUA9W
-YqhWbhH0toGT36236QAsqCZ76rbTRVSSX2BHyJwJMG2tCRv7kRJ//NIgxj3H4w==
------END CERTIFICATE-----
-
-issuer= /C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
-subject=/C=AU/ST=Queensland/O=CryptSoft Pty Ltd/CN=Test PCA (1024 bit)
------BEGIN CERTIFICATE-----
-MIICJzCCAZACAQAwDQYJKoZIhvcNAQEEBQAwXDELMAkGA1UEBhMCQVUxEzARBgNV
-BAgTClF1ZWVuc2xhbmQxGjAYBgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYD
-VQQDExNUZXN0IFBDQSAoMTAyNCBiaXQpMB4XDTk3MDYwOTEzNTczN1oXDTAxMDYw
-OTEzNTczN1owXDELMAkGA1UEBhMCQVUxEzARBgNVBAgTClF1ZWVuc2xhbmQxGjAY
-BgNVBAoTEUNyeXB0U29mdCBQdHkgTHRkMRwwGgYDVQQDExNUZXN0IFBDQSAoMTAy
-NCBiaXQpMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCdoWk/3+WcMlfjIrkg
-40ketmnQaEogQe1LLcuOJV6rKfUSAsPgwgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp
-22Jp85PmemiDzyUIStwk72qhp1imbANZvlmlCFKiQrjUyuDfu4TABmn+kkt3vR1Y
-BEOGt+IFye1UBVSATVdRJ2UVhwIDAQABMA0GCSqGSIb3DQEBBAUAA4GBABNA1u/S
-Cg/LJZWb7GliiKJsvuhxlE4E5JxQF2zMub/CSNbF97//tYSyj96sxeFQxZXbcjm9
-xt6mr/xNLA4szNQMJ4P+L7b5e/jC5DSqlwS+CUYJgaFs/SP+qJoCSu1bR3IM9XWO
-cRBpDmcBbYLkSyB92WURvsZ1LtjEcn+cdQVI
+subject= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = Test Server Cert #2
+issuer= C = UK, O = OpenSSL Group, OU = FOR TESTING PURPOSES ONLY, CN = OpenSSL Test Intermediate CA
+-----BEGIN CERTIFICATE-----
+MIID6jCCAtKgAwIBAgIJALnu1NlVpZ60MA0GCSqGSIb3DQEBBQUAMHAxCzAJBgNV
+BAYTAlVLMRYwFAYDVQQKDA1PcGVuU1NMIEdyb3VwMSIwIAYDVQQLDBlGT1IgVEVT
+VElORyBQVVJQT1NFUyBPTkxZMSUwIwYDVQQDDBxPcGVuU1NMIFRlc3QgSW50ZXJt
+ZWRpYXRlIENBMB4XDTExMTIwODE0MDE0OFoXDTIxMTAxNjE0MDE0OFowZzELMAkG
+A1UEBhMCVUsxFjAUBgNVBAoMDU9wZW5TU0wgR3JvdXAxIjAgBgNVBAsMGUZPUiBU
+RVNUSU5HIFBVUlBPU0VTIE9OTFkxHDAaBgNVBAMME1Rlc3QgU2VydmVyIENlcnQg
+IzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDrdi7j9yctG+L4EjBy
+gjPmEqZzOJEQba26MoQGzglU7e5Xf59Rb/hgVQuKAoiZe7/R8rK4zJ4W7iXdXw0L
+qBpyG8B5aGKeI32w+A9TcBApoXXL2CrYQEQjZwUIpLlYBIi2NkJj3nVkq5dgl1gO
+ALiQ+W8jg3kzg5Ec9rimp9r93N8wsSL3awsafurmYCvOf7leHaMP1WJ/zDRGUNHG
+/WtDjXc8ZUG1+6EXU9Jc2Fs+2Omf7fcN0l00AK/wPg8OaNS0rKyGq9JdIT9FRGV1
+bXe/rx58FaE5CItdwCSYhJvF/O95LWQoxJXye5bCFLmvDTEyVq9FMSCptfsmbXjE
+ZGsXAgMBAAGjgY8wgYwwDAYDVR0TAQH/BAIwADAOBgNVHQ8BAf8EBAMCBeAwLAYJ
+YIZIAYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1Ud
+DgQWBBR52UaWWTKzZGDH/X4mWNcuqeQVazAfBgNVHSMEGDAWgBQ2w2yI55X+sL3s
+zj49hqshgYfa2jANBgkqhkiG9w0BAQUFAAOCAQEANBW+XYLlHBqVY/31ie+3gRlS
+LPfy4SIqn0t3RJjagT29MXprblBO2cbMO8VGjkQdKGpmMXjxbht2arOOUXRHX4n/
+XTyn/QHEf0bcwIITMReO3DZUPAEw8hSjn9xEOM0IRVOCP+mH5fi74QzzQaZVCyYg
+5VtLKdww/+sc0nCbKl2KWgDluriH0nfVx95qgW3mg9dhXRr0zmf1w2zkBHYpARYL
+Dew6Z8EE4tS3HJu8/qM6meWzNtrfonQ3eiiMxjZBxzV46jchBwa2z9XYhP6AmpPb
+oeTSzcQNbWsxaGYzWo46oLDUZmJOwSBawbS31bZNMCoPIY6ukoesCzFSsUKZww==
-----END CERTIFICATE-----
-----BEGIN RSA PRIVATE KEY-----
-MIICXAIBAAKBgQCdoWk/3+WcMlfjIrkg40ketmnQaEogQe1LLcuOJV6rKfUSAsPg
-wgsabJ/wn8TxA1yy3eKJbFl3OiUXMRsp22Jp85PmemiDzyUIStwk72qhp1imbANZ
-vlmlCFKiQrjUyuDfu4TABmn+kkt3vR1YBEOGt+IFye1UBVSATVdRJ2UVhwIDAQAB
-AoGAba4fTtuap5l7/8ZsbE7Z1O32KJY4ZcOZukLOLUUhXxXduT+FTgGWujc0/rgc
-z9qYCLlNZHOouMYTgtSfYvuMuLZ11VIt0GYH+nRioLShE59Yy+zCRyC+gPigS1kz
-xvo14AsOIPYV14Tk/SsHyq6E0eTk7VzaIE197giiINUERPECQQDSKmtPTh/lRKw7
-HSZSM0I1mFWn/1zqrAbontRQY5w98QWIOe5qmzYyFbPXYT3d9BzlsMyhgiRNoBbD
-yvohSHXJAkEAwAHx6ezAZeWWzD5yXD36nyjpkVCw7Tk7TSmOceLJMWt1QcrCfqlS
-xA5jjpQ6Z8suU5DdtWAryM2sAir1WisYzwJAd6Zcx56jvAQ3xcPXsE6scBTVFzrj
-7FqZ6E+cclPzfLQ+QQsyOBE7bpI6e/FJppY26XGZXo3YGzV8IGXrt40oOQJALETG
-h86EFXo3qGOFbmsDy4pdP5nBERCu8X1xUCSfintiD4c2DInxgS5oGclnJeMcjTvL
-QjQoJCX3UJCi/OUO1QJBAKgcDHWjMvt+l1pjJBsSEZ0HX9AAIIVx0RQmbFGS+F2Q
-hhu5l77WnnZOQ9vvhV5u7NPCUF9nhU3jh60qWWO8mkc=
+MIIEowIBAAKCAQEA63Yu4/cnLRvi+BIwcoIz5hKmcziREG2tujKEBs4JVO3uV3+f
+UW/4YFULigKImXu/0fKyuMyeFu4l3V8NC6gachvAeWhiniN9sPgPU3AQKaF1y9gq
+2EBEI2cFCKS5WASItjZCY951ZKuXYJdYDgC4kPlvI4N5M4ORHPa4pqfa/dzfMLEi
+92sLGn7q5mArzn+5Xh2jD9Vif8w0RlDRxv1rQ413PGVBtfuhF1PSXNhbPtjpn+33
+DdJdNACv8D4PDmjUtKyshqvSXSE/RURldW13v68efBWhOQiLXcAkmISbxfzveS1k
+KMSV8nuWwhS5rw0xMlavRTEgqbX7Jm14xGRrFwIDAQABAoIBAHLsTPihIfLnYIE5
+x4GsQQ5zXeBw5ITDM37ktwHnQDC+rIzyUl1aLD1AZRBoKinXd4lOTqLZ4/NHKx4A
+DYr58mZtWyUmqLOMmQVuHXTZBlp7XtYuXMMNovQwjQlp9LicBeoBU6gQ5PVMtubD
+F4xGF89Sn0cTHW3iMkqTtQ5KcR1j57OcJO0FEb1vPvk2MXI5ZyAatUYE7YacbEzd
+rg02uIwx3FqNSkuSI79uz4hMdV5TPtuhxx9nTwj9aLUhXFeZ0mn2PVgVzEnnMoJb
++znlsZDgzDlJqdaD744YGWh8Z3OEssB35KfzFcdOeO6yH8lmv2Zfznk7pNPT7LTb
+Lae9VgkCgYEA92p1qnAB3NtJtNcaW53i0S5WJgS1hxWKvUDx3lTB9s8X9fHpqL1a
+E94fDfWzp/hax6FefUKIvBOukPLQ6bYjTMiFoOHzVirghAIuIUoMI5VtLhwD1hKs
+Lr7l/dptMgKb1nZHyXoKHRBthsy3K4+udsPi8TzMvYElgEqyQIe/Rk0CgYEA86GL
+8HC6zLszzKERDPBxrboRmoFvVUCTQDhsfj1M8aR3nQ8V5LkdIJc7Wqm/Ggfk9QRf
+rJ8M2WUMlU5CNnCn/KCrKzCNZIReze3fV+HnKdbcXGLvgbHPrhnz8yYehUFG+RGq
+bVyDWRU94T38izy2s5qMYrMJWZEYyXncSPbfcPMCgYAtaXfxcZ+V5xYPQFARMtiX
+5nZfggvDoJuXgx0h3tK/N2HBfcaSdzbaYLG4gTmZggc/jwnl2dl5E++9oSPhUdIG
+3ONSFUbxsOsGr9PBvnKd8WZZyUCXAVRjPBzAzF+whzQNWCZy/5htnz9LN7YDI9s0
+5113Q96cheDZPFydZY0hHQKBgQDVbEhNukM5xCiNcu+f2SaMnLp9EjQ4h5g3IvaP
+5B16daw/Dw8LzcohWboqIxeAsze0GD/D1ZUJAEd0qBjC3g+a9BjefervCjKOzXng
+38mEUm+6EwVjJSQcjSmycEs+Sr/kwr/8i5WYvU32+jk4tFgMoC+o6tQe/Uesf68k
+z/dPVwKBgGbF7Vv1/3SmhlOy+zYyvJ0CrWtKxH9QP6tLIEgEpd8x7YTSuCH94yok
+kToMXYA3sWNPt22GbRDZ+rcp4c7HkDx6I6vpdP9aQEwJTp0EPy0sgWr2XwYmreIQ
+NFmkk8Itn9EY2R9VBaP7GLv5kvwxDdLAnmwGmzVtbmaVdxCaBwUk
-----END RSA PRIVATE KEY-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Commercial Certification Authority
-notBefore=941104185834Z
-notAfter =991103185834Z
------BEGIN X509 CERTIFICATE-----
-
-MIICIzCCAZACBQJBAAAWMA0GCSqGSIb3DQEBAgUAMFwxCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVy
-Y2lhbCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDQxODU4MzRaFw05
-OTExMDMxODU4MzRaMFwxCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0YSBT
-ZWN1cml0eSwgSW5jLjErMCkGA1UECxMiQ29tbWVyY2lhbCBDZXJ0aWZpY2F0aW9u
-IEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCk+4Fie84QJ93o
-975sbsZwmdu41QUDaSiCnHJ/lj+O7Kwpkj+KFPhCdr69XQO5kNTQvAayUTNfxMK/
-touPmbZiImDd298ggrTKoi8tUO2UMt7gVY3UaOLgTNLNBRYulWZcYVI4HlGogqHE
-7yXpCuaLK44xZtn42f29O2nZ6wIDAQABMA0GCSqGSIb3DQEBAgUAA34AdrW2EP4j
-9/dZYkuwX5zBaLxJu7NJbyFHXSudVMQAKD+YufKKg5tgf+tQx6sFEC097TgCwaVI
-0v5loMC86qYjFmZsGySp8+x5NRhPJsjjr1BKx6cxa9B8GJ1Qv6km+iYrRpwUqbtb
-MJhCKLVLU7tDCZJAuqiqWqTGtotXTcU=
------END X509 CERTIFICATE-----
-subject=/C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-issuer= /C=US/O=RSA Data Security, Inc./OU=Secure Server Certification Authority
-notBefore=941109235417Z
-notAfter =991231235417Z
------BEGIN X509 CERTIFICATE-----
-
-MIICKTCCAZYCBQJBAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMSAw
-HgYDVQQKExdSU0EgRGF0YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJl
-IFNlcnZlciBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NDExMDkyMzU0MTda
-Fw05OTEyMzEyMzU0MTdaMF8xCzAJBgNVBAYTAlVTMSAwHgYDVQQKExdSU0EgRGF0
-YSBTZWN1cml0eSwgSW5jLjEuMCwGA1UECxMlU2VjdXJlIFNlcnZlciBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBmzANBgkqhkiG9w0BAQEFAAOBiQAwgYUCfgCSznrB
-roM+WqqJg1esJQF2DK2ujiw3zus1eGRUA+WEQFHJv48I4oqCCNIWhjdV6bEhAq12
-aIGaBaJLyUslZiJWbIgHj/eBWW2EB2VwE3F2Ppt3TONQiVaYSLkdpykaEy5KEVmc
-HhXVSVQsczppgrGXOZxtcGdI5d0t1sgeewIDAQABMA0GCSqGSIb3DQEBAgUAA34A
-iNHReSHO4ovo+MF9NFM/YYPZtgs4F7boviGNjwC4i1N+RGceIr2XJ+CchcxK9oU7
-suK+ktPlDemvXA4MRpX/oRxePug2WHpzpgr4IhFrwwk4fia7c+8AvQKk8xQNMD9h
-cHsg/jKjn7P0Z1LctO6EjJY2IN6BCINxIYoPnqk=
------END X509 CERTIFICATE-----
-subject=/C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
-issuer= /C=ZA/SP=Western Cape/L=Cape Town/O=Thawte Consulting cc
- /OU=Certification Services Division/CN=Thawte Server CA
- /Email=server-certs@thawte.com
------BEGIN CERTIFICATE-----
-MIIC+TCCAmICAQAwDQYJKoZIhvcNAQEEBQAwgcQxCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkq
-hkiG9w0BCQEWF3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMB4XDTk2MDcyNzE4MDc1
-N1oXDTk4MDcyNzE4MDc1N1owgcQxCzAJBgNVBAYTAlpBMRUwEwYDVQQIEwxXZXN0
-ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMUVGhhd3RlIENv
-bnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2VydmljZXMgRGl2
-aXNpb24xGTAXBgNVBAMTEFRoYXd0ZSBTZXJ2ZXIgQ0ExJjAkBgkqhkiG9w0BCQEW
-F3NlcnZlci1jZXJ0c0B0aGF3dGUuY29tMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCB
-iQKBgQDTpFBuyP9Wa+bPXbbqDGh1R6KqwtqEJfyo9EdR2oW1IHSUhh4PdcnpCGH1
-Bm0wbhUZAulSwGLbTZme4moMRDjN/r7jZAlwxf6xaym2L0nIO9QnBCUQly/nkG3A
-KEKZ10xD3sP1IW1Un13DWOHA5NlbsLjctHvfNjrCtWYiEtaHDQIDAQABMA0GCSqG
-SIb3DQEBBAUAA4GBAIsvn7ifX3RUIrvYXtpI4DOfARkTogwm6o7OwVdl93yFhDcX
-7h5t0XZ11MUAMziKdde3rmTvzUYIUCYoY5b032IwGMTvdiclK+STN6NP2m5nvFAM
-qJT5gC5O+j/jBuZRQ4i0AMYQr5F4lT8oBJnhgafw6PL8aDY2vMHGSPl9+7uf
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIIDDTCCAnYCAQAwDQYJKoZIhvcNAQEEBQAwgc4xCzAJBgNVBAYTAlpBMRUwEwYD
-VQQIEwxXZXN0ZXJuIENhcGUxEjAQBgNVBAcTCUNhcGUgVG93bjEdMBsGA1UEChMU
-VGhhd3RlIENvbnN1bHRpbmcgY2MxKDAmBgNVBAsTH0NlcnRpZmljYXRpb24gU2Vy
-dmljZXMgRGl2aXNpb24xITAfBgNVBAMTGFRoYXd0ZSBQcmVtaXVtIFNlcnZlciBD
-QTEoMCYGCSqGSIb3DQEJARYZcHJlbWl1bS1zZXJ2ZXJAdGhhd3RlLmNvbTAeFw05
-NjA3MjcxODA3MTRaFw05ODA3MjcxODA3MTRaMIHOMQswCQYDVQQGEwJaQTEVMBMG
-A1UECBMMV2VzdGVybiBDYXBlMRIwEAYDVQQHEwlDYXBlIFRvd24xHTAbBgNVBAoT
-FFRoYXd0ZSBDb25zdWx0aW5nIGNjMSgwJgYDVQQLEx9DZXJ0aWZpY2F0aW9uIFNl
-cnZpY2VzIERpdmlzaW9uMSEwHwYDVQQDExhUaGF3dGUgUHJlbWl1bSBTZXJ2ZXIg
-Q0ExKDAmBgkqhkiG9w0BCQEWGXByZW1pdW0tc2VydmVyQHRoYXd0ZS5jb20wgZ8w
-DQYJKoZIhvcNAQEBBQADgY0AMIGJAoGBANI2NmqL18JbntqBQWKPOO5JBFXW0O8c
-G5UWR+8YSDU6UvQragaPOy/qVuOvho2eF/eetGV1Ak3vywmiIVHYm9Bn0LoNkgYU
-c9STy5cqAJxcTgy8+hVS/PJEbtoRSm4Iny8t4/mqOoZztkZTWMiJBb2DEbhzP6oH
-jfRCTedAnRw3AgMBAAEwDQYJKoZIhvcNAQEEBQADgYEAutFIgTRZVYerIZfL9lvR
-w9Eifvvo5KTZ3h+Bj+VzNnyw4Qc/IyXkPOu6SIiH9LQ3sCmWBdxpe+qr4l77rLj2
-GYuMtESFfn1XVALzkYgC7JcPuTOjMfIiMByt+uFf8AV8x0IW/Qkuv+hEQcyM9vxK
-3VZdLbCVIhNoEsysrxCpxcI=
------END CERTIFICATE-----
-Tims test GCI CA
-
------BEGIN CERTIFICATE-----
-MIIB8DCCAZoCAQAwDQYJKoZIhvcNAQEEBQAwgYIxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2RldmVsb3BtZW50MRkwFwYDVQQDExBD
-cnlwdFNvZnQgRGV2IENBMB4XDTk3MDMyMjEzMzQwNFoXDTk4MDMyMjEzMzQwNFow
-gYIxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhC
-cmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxFDASBgNVBAsTC2Rl
-dmVsb3BtZW50MRkwFwYDVQQDExBDcnlwdFNvZnQgRGV2IENBMFwwDQYJKoZIhvcN
-AQEBBQADSwAwSAJBAOAOAqogG5QwAmLhzyO4CoRnx/wVy4NZP4dxJy83O1EnL0rw
-OdsamJKvPOLHgSXo3gDu9uVyvCf/QJmZAmC5ml8CAwEAATANBgkqhkiG9w0BAQQF
-AANBADRRS/GVdd7rAqRW6SdmgLJduOU2yq3avBu99kRqbp9A/dLu6r6jU+eP4oOA
-TfdbFZtAAD2Hx9jUtY3tfdrJOb8=
------END CERTIFICATE-----
-
------BEGIN CERTIFICATE-----
-MIICVjCCAgACAQAwDQYJKoZIhvcNAQEEBQAwgbUxCzAJBgNVBAYTAkFVMRMwEQYD
-VQQIEwpRdWVlbnNsYW5kMREwDwYDVQQHEwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5
-cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsTI1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9O
-IEFVVEhPUklUSUVTMTQwMgYDVQQDEytaRVJPIFZBTFVFIENBIC0gREVNT05TVFJB
-VElPTiBQVVJQT1NFUyBPTkxZMB4XDTk3MDQwMzEzMjI1NFoXDTk4MDQwMzEzMjI1
-NFowgbUxCzAJBgNVBAYTAkFVMRMwEQYDVQQIEwpRdWVlbnNsYW5kMREwDwYDVQQH
-EwhCcmlzYmFuZTEaMBgGA1UEChMRQ3J5cHRTb2Z0IFB0eSBMdGQxLDAqBgNVBAsT
-I1dPUlRITEVTUyBDRVJUSUZJQ0FUSU9OIEFVVEhPUklUSUVTMTQwMgYDVQQDEyta
-RVJPIFZBTFVFIENBIC0gREVNT05TVFJBVElPTiBQVVJQT1NFUyBPTkxZMFwwDQYJ
-KoZIhvcNAQEBBQADSwAwSAJBAOZ7T7yqP/tyspcko3yPY1y0Cm2EmwNvzW4QgVXR
-Fjs3HmJ4xtSpXdo6mwcGezL3Abt/aQXaxv9PU8xt+Jr0OFUCAwEAATANBgkqhkiG
-9w0BAQQFAANBAOQpYmGgyCqCy1OljgJhCqQOu627oVlHzK1L+t9vBaMfn40AVUR4
-WzQVWO31KTgi5vTK1U+3h46fgUWqQ0h+6rU=
------END CERTIFICATE-----
------BEGIN CERTIFICATE-----
-MIAwgKADAgECAgEAMA0GCSqGSIb3DQEBBAUAMGIxETAPBgNVBAcTCEludGVybmV0
-MRcwFQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xh
-c3MgMSBDQSAtIEluZGl2aWR1YWwgU3Vic2NyaWJlcjAeFw05NjA0MDgxMDIwMjda
-Fw05NzA0MDgxMDIwMjdaMGIxETAPBgNVBAcTCEludGVybmV0MRcwFQYDVQQKEw5W
-ZXJpU2lnbiwgSW5jLjE0MDIGA1UECxMrVmVyaVNpZ24gQ2xhc3MgMSBDQSAtIElu
-ZGl2aWR1YWwgU3Vic2NyaWJlcjCAMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC2
-FKbPTdAFDdjKI9BvqrQpkmOOLPhvltcunXZLEbE2jVfJw/0cxrr+Hgi6M8qV6r7j
-W80GqLd5HUQq7XPysVKDaBBwZJHXPmv5912dFEObbpdFmIFH0S3L3bty10w/cari
-QPJUObwW7s987LrbP2wqsxaxhhKdrpM01bjV0Pc+qQIDAQABAAAAADANBgkqhkiG
-9w0BAQQFAAOBgQA+1nJryNt8VBRjRr07ArDAV/3jAH7GjDc9jsrxZS68ost9v06C
-TvTNKGL+LISNmFLXl+JXhgGB0JZ9fvyYzNgHQ46HBUng1H6voalfJgS2KdEo50wW
-8EFZYMDkT1k4uynwJqkVN2QJK/2q4/A/VCov5h6SlM8Affg2W+1TLqvqkwAA
------END CERTIFICATE-----
-
- subject=/L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
- issuer= /L=Internet/O=VeriSign, Inc./OU=VeriSign Class 2 CA - Individual Subscriber
-
------BEGIN CERTIFICATE-----
-MIIEkzCCA/ygAwIBAgIRANDTUpSRL3nTFeMrMayFSPAwDQYJKoZIhvcNAQECBQAw
-YjERMA8GA1UEBxMISW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQw
-MgYDVQQLEytWZXJpU2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3Jp
-YmVyMB4XDTk2MDYwNDAwMDAwMFoXDTk4MDYwNDIzNTk1OVowYjERMA8GA1UEBxMI
-SW50ZXJuZXQxFzAVBgNVBAoTDlZlcmlTaWduLCBJbmMuMTQwMgYDVQQLEytWZXJp
-U2lnbiBDbGFzcyAyIENBIC0gSW5kaXZpZHVhbCBTdWJzY3JpYmVyMIGfMA0GCSqG
-SIb3DQEBAQUAA4GNADCBiQKBgQC6A+2czKGRcYMfm8gdnk+0de99TDDzsqo0v5nb
-RsbUmMcdRQ7nsMbRWe0SAb/9QoLTZ/cJ0iOBqdrkz7UpqqKarVoTSdlSMVM92tWp
-3bJncZHQD1t4xd6lQVdI1/T6R+5J0T1ukOdsI9Jmf+F28S6g3R3L1SFwiHKeZKZv
-z+793wIDAQABo4ICRzCCAkMwggIpBgNVHQMBAf8EggIdMIICGTCCAhUwggIRBgtg
-hkgBhvhFAQcBATCCAgAWggGrVGhpcyBjZXJ0aWZpY2F0ZSBpbmNvcnBvcmF0ZXMg
-YnkgcmVmZXJlbmNlLCBhbmQgaXRzIHVzZSBpcyBzdHJpY3RseSBzdWJqZWN0IHRv
-LCB0aGUgVmVyaVNpZ24gQ2VydGlmaWNhdGlvbiBQcmFjdGljZSBTdGF0ZW1lbnQg
-KENQUyksIGF2YWlsYWJsZSBhdDogaHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL0NQ
-Uy0xLjA7IGJ5IEUtbWFpbCBhdCBDUFMtcmVxdWVzdHNAdmVyaXNpZ24uY29tOyBv
-ciBieSBtYWlsIGF0IFZlcmlTaWduLCBJbmMuLCAyNTkzIENvYXN0IEF2ZS4sIE1v
-dW50YWluIFZpZXcsIENBIDk0MDQzIFVTQSBUZWwuICsxICg0MTUpIDk2MS04ODMw
-IENvcHlyaWdodCAoYykgMTk5NiBWZXJpU2lnbiwgSW5jLiAgQWxsIFJpZ2h0cyBS
-ZXNlcnZlZC4gQ0VSVEFJTiBXQVJSQU5USUVTIERJU0NMQUlNRUQgYW5kIExJQUJJ
-TElUWSBMSU1JVEVELqAOBgxghkgBhvhFAQcBAQGhDgYMYIZIAYb4RQEHAQECMC8w
-LRYraHR0cHM6Ly93d3cudmVyaXNpZ24uY29tL3JlcG9zaXRvcnkvQ1BTLTEuMDAU
-BglghkgBhvhCAQEBAf8EBAMCAgQwDQYJKoZIhvcNAQECBQADgYEApRJRkNBqLLgs
-53IR/d18ODdLOWMTZ+QOOxBrq460iBEdUwgF8vmPRX1ku7UiDeNzaLlurE6eFqHq
-2zPyK5j60zfTLVJMWKcQWwTJLjHtXrW8pxhNtFc6Fdvy5ZkHnC/9NIl7/t4U6WqB
-p4y+p7SdMIkEwIZfds0VbnQyX5MRUJY=
------END CERTIFICATE-----
-
- subject=/C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 3 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKhAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgMyBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAyVxZ
-nvIbigEUtBDfBEDb41evakVAj4QMC9Ez2dkRz+4CWB8l9yqoRAWq7AMfeH+ek7ma
-AKojfdashaJjRcdyJ8z0TMZ1cdI5709C8HXfCpDGjiBvmA/4rCNfcCk2pMmG57Ga
-IMtTpYXnPb59mv4kRTPcdhXtD6JxZExlLoFoRacCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQB1Zmw+0c2B27X4LzZRtvdCvM1Cr9wO+hVs+GeTVzrrtpLotgHKjLeOQ7RJ
-Zfk+7r11Ri7J/CVdqMcvi5uPaM+0nJcYwE3vH9mvgrPmZLiEXIqaB1JDYft0nls6
-NvxMsvwaPxUupVs8G5DsiCnkWRb5zget7Ond2tIxik/W2O8XjQ==
------END CERTIFICATE-----
- subject=/C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
- issuer= /C=US/O=VeriSign, Inc./OU=Class 4 Public Primary Certification Authority
------BEGIN CERTIFICATE-----
-MIICMTCCAZoCBQKmAAABMA0GCSqGSIb3DQEBAgUAMF8xCzAJBgNVBAYTAlVTMRcw
-FQYDVQQKEw5WZXJpU2lnbiwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMg
-UHJpbWFyeSBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eTAeFw05NjAxMjkwMDAwMDBa
-Fw05OTEyMzEyMzU5NTlaMF8xCzAJBgNVBAYTAlVTMRcwFQYDVQQKEw5WZXJpU2ln
-biwgSW5jLjE3MDUGA1UECxMuQ2xhc3MgNCBQdWJsaWMgUHJpbWFyeSBDZXJ0aWZp
-Y2F0aW9uIEF1dGhvcml0eTCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEA0LJ1
-9njQrlpQ9OlQqZ+M1++RlHDo0iSQdomF1t+s5gEXMoDwnZNHvJplnR+Xrr/phnVj
-IIm9gFidBAydqMEk6QvlMXi9/C0MN2qeeIDpRnX57aP7E3vIwUzSo+/1PLBij0pd
-O92VZ48TucE81qcmm+zDO3rZTbxtm+gVAePwR6kCAwEAATANBgkqhkiG9w0BAQIF
-AAOBgQBT3dPwnCR+QKri/AAa19oM/DJhuBUNlvP6Vxt/M3yv6ZiaYch6s7f/sdyZ
-g9ysEvxwyR84Qu1E9oAuW2szaayc01znX1oYx7EteQSWQZGZQbE8DbqEOcY7l/Am
-yY7uvcxClf8exwI/VAx49byqYHwCaejcrOICdmHEPgPq0ook0Q==
------END CERTIFICATE-----
diff --git a/apps/sess_id.c b/apps/sess_id.c
index b99179f..b16686c 100644
--- a/apps/sess_id.c
+++ b/apps/sess_id.c
@@ -90,6 +90,7 @@ int MAIN(int, char **);
int MAIN(int argc, char **argv)
{
SSL_SESSION *x=NULL;
+ X509 *peer = NULL;
int ret=1,i,num,badops=0;
BIO *out=NULL;
int informat,outformat;
@@ -163,16 +164,17 @@ int MAIN(int argc, char **argv)
ERR_load_crypto_strings();
x=load_sess_id(infile,informat);
if (x == NULL) { goto end; }
+ peer = SSL_SESSION_get0_peer(x);
if(context)
{
- x->sid_ctx_length=strlen(context);
- if(x->sid_ctx_length > SSL_MAX_SID_CTX_LENGTH)
+ size_t ctx_len = strlen(context);
+ if(ctx_len > SSL_MAX_SID_CTX_LENGTH)
{
BIO_printf(bio_err,"Context too long\n");
goto end;
}
- memcpy(x->sid_ctx,context,x->sid_ctx_length);
+ SSL_SESSION_set1_id_context(x, (unsigned char *)context, ctx_len);
}
#ifdef undef
@@ -231,10 +233,10 @@ int MAIN(int argc, char **argv)
if (cert)
{
- if (x->peer == NULL)
+ if (peer == NULL)
BIO_puts(out,"No certificate present\n");
else
- X509_print(out,x->peer);
+ X509_print(out,peer);
}
}
@@ -253,12 +255,12 @@ int MAIN(int argc, char **argv)
goto end;
}
}
- else if (!noout && (x->peer != NULL)) /* just print the certificate */
+ else if (!noout && (peer != NULL)) /* just print the certificate */
{
if (outformat == FORMAT_ASN1)
- i=(int)i2d_X509_bio(out,x->peer);
+ i=(int)i2d_X509_bio(out,peer);
else if (outformat == FORMAT_PEM)
- i=PEM_write_bio_X509(out,x->peer);
+ i=PEM_write_bio_X509(out,peer);
else {
BIO_printf(bio_err,"bad output format specified for outfile\n");
goto end;
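
The sess_id changes above stop reaching into SSL_SESSION internals
(x->peer, x->sid_ctx) and use the SSL_SESSION_get0_peer() and
SSL_SESSION_set1_id_context() accessors instead; the tool behaves the
same. A quick way to exercise the modified paths (host and file names
are examples only):

  $ openssl s_client -connect example.com:443 -sess_out sess.pem </dev/null
  $ openssl sess_id -in sess.pem -text -noout -cert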
diff --git a/apps/speed.c b/apps/speed.c
index b3c5442..9c251eb 100644
--- a/apps/speed.c
+++ b/apps/speed.c
@@ -108,8 +108,14 @@
#include <signal.h>
#endif
-#ifdef _WIN32
+#if defined(_WIN32) || defined(__CYGWIN__)
#include <windows.h>
+# if defined(__CYGWIN__) && !defined(_WIN32)
+ /* <windows.h> should define _WIN32, which normally is mutually
+ * exclusive with __CYGWIN__, but if it didn't... */
+# define _WIN32
+ /* this is done because Cygwin alarm() fails sometimes. */
+# endif
#endif
#include <openssl/bn.h>
@@ -183,6 +189,25 @@
#ifndef OPENSSL_NO_ECDH
#include <openssl/ecdh.h>
#endif
+#include <openssl/modes.h>
+
+#ifdef OPENSSL_FIPS
+#ifdef OPENSSL_DOING_MAKEDEPEND
+#undef AES_set_encrypt_key
+#undef AES_set_decrypt_key
+#undef DES_set_key_unchecked
+#endif
+#define BF_set_key private_BF_set_key
+#define CAST_set_key private_CAST_set_key
+#define idea_set_encrypt_key private_idea_set_encrypt_key
+#define SEED_set_key private_SEED_set_key
+#define RC2_set_key private_RC2_set_key
+#define RC4_set_key private_RC4_set_key
+#define DES_set_key_unchecked private_DES_set_key_unchecked
+#define AES_set_encrypt_key private_AES_set_encrypt_key
+#define AES_set_decrypt_key private_AES_set_decrypt_key
+#define Camellia_set_key private_Camellia_set_key
+#endif
#ifndef HAVE_FORK
# if defined(OPENSSL_SYS_VMS) || defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_MACINTOSH_CLASSIC) || defined(OPENSSL_SYS_OS2) || defined(OPENSSL_SYS_NETWARE)
@@ -214,7 +239,7 @@ static void print_result(int alg,int run_no,int count,double time_used);
static int do_multi(int multi);
#endif
-#define ALGOR_NUM 29
+#define ALGOR_NUM 30
#define SIZE_NUM 5
#define RSA_NUM 4
#define DSA_NUM 3
@@ -229,7 +254,7 @@ static const char *names[ALGOR_NUM]={
"aes-128 cbc","aes-192 cbc","aes-256 cbc",
"camellia-128 cbc","camellia-192 cbc","camellia-256 cbc",
"evp","sha256","sha512","whirlpool",
- "aes-128 ige","aes-192 ige","aes-256 ige"};
+ "aes-128 ige","aes-192 ige","aes-256 ige","ghash" };
static double results[ALGOR_NUM][SIZE_NUM];
static int lengths[SIZE_NUM]={16,64,256,1024,8*1024};
#ifndef OPENSSL_NO_RSA
@@ -273,9 +298,12 @@ static SIGRETTYPE sig_done(int sig)
#if defined(_WIN32)
-#define SIGALRM
+#if !defined(SIGALRM)
+# define SIGALRM
+#endif
static unsigned int lapse,schlock;
-static void alarm(unsigned int secs) { lapse = secs*1000; }
+static void alarm_win32(unsigned int secs) { lapse = secs*1000; }
+#define alarm alarm_win32
static DWORD WINAPI sleepy(VOID *arg)
{
@@ -469,6 +497,7 @@ int MAIN(int argc, char **argv)
#define D_IGE_128_AES 26
#define D_IGE_192_AES 27
#define D_IGE_256_AES 28
+#define D_GHASH 29
double d=0.0;
long c[ALGOR_NUM][SIZE_NUM];
#define R_DSA_512 0
@@ -894,6 +923,10 @@ int MAIN(int argc, char **argv)
doit[D_CBC_192_AES]=1;
doit[D_CBC_256_AES]=1;
}
+ else if (strcmp(*argv,"ghash") == 0)
+ {
+ doit[D_GHASH]=1;
+ }
else
#endif
#ifndef OPENSSL_NO_CAMELLIA
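
The hunks above add a "ghash" benchmark to speed: ALGOR_NUM grows to 30
and the new D_GHASH slot times GHASH, the GF(2^128) universal hash that
AES-GCM uses for authentication, through the interface declared in
openssl/modes.h. Once built, the new mode is exercised with:

  $ openssl speed ghash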
@@ -1264,6 +1297,7 @@ int MAIN(int argc, char **argv)
c[D_IGE_128_AES][0]=count;
c[D_IGE_192_AES][0]=count;
c[D_IGE_256_AES][0]=count;
+ c[D_GHASH][0]=count;
+ for (i=1; i<SIZE_NUM; i++)
diff --git a/apps/srp.c b/apps/srp.c
new file mode 100644
--- /dev/null
+++ b/apps/srp.c
+#include <openssl/opensslconf.h>
+
+#ifndef OPENSSL_NO_SRP
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <openssl/conf.h>
+#include <openssl/bio.h>
+#include <openssl/err.h>
+#include <openssl/txt_db.h>
+#include <openssl/buffer.h>
+#include <openssl/srp.h>
+
+#include "apps.h"
+
+#undef PROG
+#define PROG srp_main
+
+#define BASE_SECTION "srp"
+#define CONFIG_FILE "openssl.cnf"
+
+#define ENV_RANDFILE "RANDFILE"
+
+#define ENV_DATABASE "srpvfile"
+#define ENV_DEFAULT_SRP "default_srp"
+
+static char *srp_usage[]={
+"usage: srp [args] [user] \n",
+"\n",
+" -verbose Talk alot while doing things\n",
+" -config file A config file\n",
+" -name arg The particular srp definition to use\n",
+" -srpvfile arg The srp verifier file name\n",
+" -add add an user and srp verifier\n",
+" -modify modify the srp verifier of an existing user\n",
+" -delete delete user from verifier file\n",
+" -list list user\n",
+" -gn arg g and N values to be used for new verifier\n",
+" -userinfo arg additional info to be set for user\n",
+" -passin arg input file pass phrase source\n",
+" -passout arg output file pass phrase source\n",
+#ifndef OPENSSL_NO_ENGINE
+" -engine e - use engine e, possibly a hardware device.\n",
+#endif
+NULL
+};
+
+#ifdef EFENCE
+extern int EF_PROTECT_FREE;
+extern int EF_PROTECT_BELOW;
+extern int EF_ALIGNMENT;
+#endif
+
+static CONF *conf=NULL;
+static char *section=NULL;
+
+#define VERBOSE if (verbose)
+#define VVERBOSE if (verbose>1)
+
+
+int MAIN(int, char **);
+
+static int get_index(CA_DB *db, char* id, char type)
+ {
+ char ** pp;
+ int i;
+ if (id == NULL) return -1;
+ if (type == DB_SRP_INDEX)
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data,i);
+ if (pp[DB_srptype][0] == DB_SRP_INDEX && !strcmp(id,pp[DB_srpid]))
+ return i;
+ }
+ else for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data,i);
+
+ if (pp[DB_srptype][0] != DB_SRP_INDEX && !strcmp(id,pp[DB_srpid]))
+ return i;
+ }
+
+ return -1 ;
+ }
+
+static void print_entry(CA_DB *db, BIO *bio, int indx, int verbose, char *s)
+ {
+ if (indx >= 0 && verbose)
+ {
+ int j;
+ char **pp = sk_OPENSSL_PSTRING_value(db->db->data, indx);
+ BIO_printf(bio, "%s \"%s\"\n", s, pp[DB_srpid]);
+ for (j = 0; j < DB_NUMBER; j++)
+ {
+ BIO_printf(bio_err," %d = \"%s\"\n", j, pp[j]);
+ }
+ }
+ }
+
+static void print_index(CA_DB *db, BIO *bio, int indexindex, int verbose)
+ {
+ print_entry(db, bio, indexindex, verbose, "g N entry") ;
+ }
+
+static void print_user(CA_DB *db, BIO *bio, int userindex, int verbose)
+ {
+ if (verbose > 0)
+ {
+ char **pp = sk_OPENSSL_PSTRING_value(db->db->data,userindex);
+
+ if (pp[DB_srptype][0] != 'I')
+ {
+ print_entry(db, bio, userindex, verbose, "User entry");
+ print_entry(db, bio, get_index(db, pp[DB_srpgN], 'I'), verbose, "g N entry");
+ }
+
+ }
+ }
+
+static int update_index(CA_DB *db, BIO *bio, char **row)
+ {
+ char ** irow;
+ int i;
+
+ if ((irow=(char **)OPENSSL_malloc(sizeof(char *)*(DB_NUMBER+1))) == NULL)
+ {
+ BIO_printf(bio_err,"Memory allocation failure\n");
+ return 0;
+ }
+
+ for (i=0; i<DB_NUMBER; i++)
+ irow[i]=row[i];
+ irow[DB_NUMBER]=NULL;
+
+ if (!TXT_DB_insert(db->db,irow))
+ {
+ BIO_printf(bio,"failed to update srpvfile\n");
+ BIO_printf(bio,"TXT_DB error number %ld\n",db->db->error);
+ OPENSSL_free(irow);
+ return 0;
+ }
+ return 1;
+ }
+
+static void lookup_fail(const char *name, char *tag)
+ {
+ BIO_printf(bio_err,"variable lookup failed for %s::%s\n",name,tag);
+ }
+
+
+static char *srp_verify_user(const char *user, const char *srp_verifier,
+ char *srp_usersalt, const char *g, const char *N,
+ const char *passin, BIO *bio, int verbose)
+ {
+ char password[1024];
+ PW_CB_DATA cb_tmp;
+ char *verifier = NULL;
+ char *gNid = NULL;
+
+ cb_tmp.prompt_info = user;
+ cb_tmp.password = passin;
+
+ if (password_callback(password, 1024, 0, &cb_tmp) >0)
+ {
+ VERBOSE BIO_printf(bio,"Validating\n user=\"%s\"\n srp_verifier=\"%s\"\n srp_usersalt=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,srp_verifier,srp_usersalt, g, N);
+ BIO_printf(bio, "Pass %s\n", password);
+
+ if (!(gNid=SRP_create_verifier(user, password, &srp_usersalt, &verifier, N, g)))
+ {
+ BIO_printf(bio, "Internal error validating SRP verifier\n");
+ }
+ else
+ {
+ if (strcmp(verifier, srp_verifier))
+ gNid = NULL;
+ OPENSSL_free(verifier);
+ }
+ }
+ return gNid;
+ }
+
+static char *srp_create_user(char *user, char **srp_verifier,
+ char **srp_usersalt, char *g, char *N,
+ char *passout, BIO *bio, int verbose)
+ {
+ char password[1024];
+ PW_CB_DATA cb_tmp;
+ char *gNid = NULL;
+ char *salt = NULL;
+ cb_tmp.prompt_info = user;
+ cb_tmp.password = passout;
+
+ if (password_callback(password,1024,1,&cb_tmp) >0)
+ {
+ VERBOSE BIO_printf(bio,"Creating\n user=\"%s\"\n g=\"%s\"\n N=\"%s\"\n",user,g,N);
+ if (!(gNid =SRP_create_verifier(user, password, &salt, srp_verifier, N, g)))
+ {
+ BIO_printf(bio,"Internal error creating SRP verifier\n");
+ }
+ else
+ *srp_usersalt = salt;
+ VVERBOSE BIO_printf(bio,"gNid=%s salt =\"%s\"\n verifier =\"%s\"\n", gNid,salt, *srp_verifier);
+
+ }
+ return gNid;
+ }
+
+int MAIN(int argc, char **argv)
+ {
+ int add_user = 0;
+ int list_user= 0;
+ int delete_user= 0;
+ int modify_user= 0;
+ char * user = NULL;
+
+ char *passargin = NULL, *passargout = NULL;
+ char *passin = NULL, *passout = NULL;
+ char * gN = NULL;
+ int gNindex = -1;
+ char ** gNrow = NULL;
+ int maxgN = -1;
+
+ char * userinfo = NULL;
+
+ int badops=0;
+ int ret=1;
+ int errors=0;
+ int verbose=0;
+ int doupdatedb=0;
+ char *configfile=NULL;
+ char *dbfile=NULL;
+ CA_DB *db=NULL;
+ char **pp ;
+ int i;
+ long errorline = -1;
+ char *randfile=NULL;
+#ifndef OPENSSL_NO_ENGINE
+ char *engine = NULL;
+#endif
+ char *tofree=NULL;
+ DB_ATTR db_attr;
+
+#ifdef EFENCE
+EF_PROTECT_FREE=1;
+EF_PROTECT_BELOW=1;
+EF_ALIGNMENT=0;
+#endif
+
+ apps_startup();
+
+ conf = NULL;
+ section = NULL;
+
+ if (bio_err == NULL)
+ if ((bio_err=BIO_new(BIO_s_file())) != NULL)
+ BIO_set_fp(bio_err,stderr,BIO_NOCLOSE|BIO_FP_TEXT);
+
+ argc--;
+ argv++;
+ while (argc >= 1 && badops == 0)
+ {
+ if (strcmp(*argv,"-verbose") == 0)
+ verbose++;
+ else if (strcmp(*argv,"-config") == 0)
+ {
+ if (--argc < 1) goto bad;
+ configfile= *(++argv);
+ }
+ else if (strcmp(*argv,"-name") == 0)
+ {
+ if (--argc < 1) goto bad;
+ section= *(++argv);
+ }
+ else if (strcmp(*argv,"-srpvfile") == 0)
+ {
+ if (--argc < 1) goto bad;
+ dbfile= *(++argv);
+ }
+ else if (strcmp(*argv,"-add") == 0)
+ add_user=1;
+ else if (strcmp(*argv,"-delete") == 0)
+ delete_user=1;
+ else if (strcmp(*argv,"-modify") == 0)
+ modify_user=1;
+ else if (strcmp(*argv,"-list") == 0)
+ list_user=1;
+ else if (strcmp(*argv,"-gn") == 0)
+ {
+ if (--argc < 1) goto bad;
+ gN= *(++argv);
+ }
+ else if (strcmp(*argv,"-userinfo") == 0)
+ {
+ if (--argc < 1) goto bad;
+ userinfo= *(++argv);
+ }
+ else if (strcmp(*argv,"-passin") == 0)
+ {
+ if (--argc < 1) goto bad;
+ passargin= *(++argv);
+ }
+ else if (strcmp(*argv,"-passout") == 0)
+ {
+ if (--argc < 1) goto bad;
+ passargout= *(++argv);
+ }
+#ifndef OPENSSL_NO_ENGINE
+ else if (strcmp(*argv,"-engine") == 0)
+ {
+ if (--argc < 1) goto bad;
+ engine= *(++argv);
+ }
+#endif
+
+ else if (**argv == '-')
+ {
+bad:
+ BIO_printf(bio_err,"unknown option %s\n",*argv);
+ badops=1;
+ break;
+ }
+ else
+ break;
+
+ argc--;
+ argv++;
+ }
+
+ if (dbfile && configfile)
+ {
+ BIO_printf(bio_err,"-dbfile and -configfile cannot be specified together.\n");
+ badops = 1;
+ }
+ if (add_user+delete_user+modify_user+list_user != 1)
+ {
+ BIO_printf(bio_err,"Exactly one of the options -add, -delete, -modify -list must be specified.\n");
+ badops = 1;
+ }
+ if (add_user+delete_user+modify_user == 1 && argc <= 0)
+ {
+ BIO_printf(bio_err,"Need at least one user for options -add, -delete, -modify. \n");
+ badops = 1;
+ }
+ if ((passin || passout) && argc != 1 )
+ {
+ BIO_printf(bio_err,"-passin, -passout arguments only valid with one user.\n");
+ badops = 1;
+ }
+
+ if (badops)
+ {
+ for (pp=srp_usage; (*pp != NULL); pp++)
+ BIO_printf(bio_err,"%s",*pp);
+
+ BIO_printf(bio_err," -rand file%cfile%c...\n", LIST_SEPARATOR_CHAR, LIST_SEPARATOR_CHAR);
+ BIO_printf(bio_err," load the file (or the files in the directory) into\n");
+ BIO_printf(bio_err," the random number generator\n");
+ goto err;
+ }
+
+ ERR_load_crypto_strings();
+
+#ifndef OPENSSL_NO_ENGINE
+ setup_engine(bio_err, engine, 0);
+#endif
+
+ if(!app_passwd(bio_err, passargin, passargout, &passin, &passout))
+ {
+ BIO_printf(bio_err, "Error getting passwords\n");
+ goto err;
+ }
+
+ if (!dbfile)
+ {
+
+
+ /*****************************************************************/
+ tofree=NULL;
+ if (configfile == NULL) configfile = getenv("OPENSSL_CONF");
+ if (configfile == NULL) configfile = getenv("SSLEAY_CONF");
+ if (configfile == NULL)
+ {
+ const char *s=X509_get_default_cert_area();
+ size_t len;
+
+#ifdef OPENSSL_SYS_VMS
+ len = strlen(s)+sizeof(CONFIG_FILE);
+ tofree=OPENSSL_malloc(len);
+ strcpy(tofree,s);
+#else
+ len = strlen(s)+sizeof(CONFIG_FILE)+1;
+ tofree=OPENSSL_malloc(len);
+ BUF_strlcpy(tofree,s,len);
+ BUF_strlcat(tofree,"/",len);
+#endif
+ BUF_strlcat(tofree,CONFIG_FILE,len);
+ configfile=tofree;
+ }
+
+ VERBOSE BIO_printf(bio_err,"Using configuration from %s\n",configfile);
+ conf = NCONF_new(NULL);
+ if (NCONF_load(conf,configfile,&errorline) <= 0)
+ {
+ if (errorline <= 0)
+ BIO_printf(bio_err,"error loading the config file '%s'\n",
+ configfile);
+ else
+ BIO_printf(bio_err,"error on line %ld of config file '%s'\n"
+ ,errorline,configfile);
+ goto err;
+ }
+ if(tofree)
+ {
+ OPENSSL_free(tofree);
+ tofree = NULL;
+ }
+
+ if (!load_config(bio_err, conf))
+ goto err;
+
+ /* Lets get the config section we are using */
+ if (section == NULL)
+ {
+ VERBOSE BIO_printf(bio_err,"trying to read " ENV_DEFAULT_SRP " in \" BASE_SECTION \"\n");
+
+ section=NCONF_get_string(conf,BASE_SECTION,ENV_DEFAULT_SRP);
+ if (section == NULL)
+ {
+ lookup_fail(BASE_SECTION,ENV_DEFAULT_SRP);
+ goto err;
+ }
+ }
+
+ if (randfile == NULL && conf)
+ randfile = NCONF_get_string(conf, BASE_SECTION, "RANDFILE");
+
+
+ VERBOSE BIO_printf(bio_err,"trying to read " ENV_DATABASE " in section \"%s\"\n",section);
+
+ if ((dbfile=NCONF_get_string(conf,section,ENV_DATABASE)) == NULL)
+ {
+ lookup_fail(section,ENV_DATABASE);
+ goto err;
+ }
+
+ }
+ if (randfile == NULL)
+ ERR_clear_error();
+ else
+ app_RAND_load_file(randfile, bio_err, 0);
+
+ VERBOSE BIO_printf(bio_err,"Trying to read SRP verifier file \"%s\"\n",dbfile);
+
+ db = load_index(dbfile, &db_attr);
+ if (db == NULL) goto err;
+
+ /* Lets check some fields */
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data, i);
+
+ if (pp[DB_srptype][0] == DB_SRP_INDEX)
+ {
+ maxgN = i;
+ if (gNindex < 0 && gN != NULL && !strcmp(gN, pp[DB_srpid]))
+ gNindex = i;
+
+ print_index(db, bio_err, i, verbose > 1);
+ }
+ }
+
+ VERBOSE BIO_printf(bio_err, "Database initialised\n");
+
+ if (gNindex >= 0)
+ {
+ gNrow = sk_OPENSSL_PSTRING_value(db->db->data,gNindex);
+ print_entry(db, bio_err, gNindex, verbose > 1, "Default g and N");
+ }
+ else if (maxgN > 0 && !SRP_get_default_gN(gN))
+ {
+ BIO_printf(bio_err, "No g and N value for index \"%s\"\n", gN);
+ goto err;
+ }
+ else
+ {
+ VERBOSE BIO_printf(bio_err, "Database has no g N information.\n");
+ gNrow = NULL;
+ }
+
+
+ VVERBOSE BIO_printf(bio_err,"Starting user processing\n");
+
+ if (argc > 0)
+ user = *(argv++) ;
+
+ while (list_user || user)
+ {
+ int userindex = -1;
+ if (user)
+ VVERBOSE BIO_printf(bio_err, "Processing user \"%s\"\n", user);
+ if ((userindex = get_index(db, user, 'U')) >= 0)
+ {
+ print_user(db, bio_err, userindex, (verbose > 0) || list_user);
+ }
+
+ if (list_user)
+ {
+ if (user == NULL)
+ {
+ BIO_printf(bio_err,"List all users\n");
+
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ print_user(db,bio_err, i, 1);
+ }
+ list_user = 0;
+ }
+ else if (userindex < 0)
+ {
+ BIO_printf(bio_err, "user \"%s\" does not exist, ignored. t\n",
+ user);
+ errors++;
+ }
+ }
+ else if (add_user)
+ {
+ if (userindex >= 0)
+ {
+ /* reactivation of a new user */
+ char **row = sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+ BIO_printf(bio_err, "user \"%s\" reactivated.\n", user);
+ row[DB_srptype][0] = 'V';
+
+ doupdatedb = 1;
+ }
+ else
+ {
+ char *row[DB_NUMBER] ; char *gNid;
+ row[DB_srpverifier] = NULL;
+ row[DB_srpsalt] = NULL;
+ row[DB_srpinfo] = NULL;
+ if (!(gNid = srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:gN,gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose)))
+ {
+ BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned .\n", user);
+ errors++;
+ goto err;
+ }
+ row[DB_srpid] = BUF_strdup(user);
+ row[DB_srptype] = BUF_strdup("v");
+ row[DB_srpgN] = BUF_strdup(gNid);
+
+ if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] ||
+ (userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo)))) ||
+ !update_index(db, bio_err, row))
+ {
+ if (row[DB_srpid]) OPENSSL_free(row[DB_srpid]);
+ if (row[DB_srpgN]) OPENSSL_free(row[DB_srpgN]);
+ if (row[DB_srpinfo]) OPENSSL_free(row[DB_srpinfo]);
+ if (row[DB_srptype]) OPENSSL_free(row[DB_srptype]);
+ if (row[DB_srpverifier]) OPENSSL_free(row[DB_srpverifier]);
+ if (row[DB_srpsalt]) OPENSSL_free(row[DB_srpsalt]);
+ goto err;
+ }
+ doupdatedb = 1;
+ }
+ }
+ else if (modify_user)
+ {
+ if (userindex < 0)
+ {
+ BIO_printf(bio_err,"user \"%s\" does not exist, operation ignored.\n",user);
+ errors++;
+ }
+ else
+ {
+
+ char **row = sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+ char type = row[DB_srptype][0];
+ if (type == 'v')
+ {
+ BIO_printf(bio_err,"user \"%s\" already updated, operation ignored.\n",user);
+ errors++;
+ }
+ else
+ {
+ char *gNid;
+
+ if (row[DB_srptype][0] == 'V')
+ {
+ int user_gN;
+ char **irow = NULL;
+ VERBOSE BIO_printf(bio_err,"Verifying password for user \"%s\"\n",user);
+ if ( (user_gN = get_index(db, row[DB_srpgN], DB_SRP_INDEX)) >= 0)
+ irow = (char **)sk_OPENSSL_PSTRING_value(db->db->data, userindex);
+
+ if (!srp_verify_user(user, row[DB_srpverifier], row[DB_srpsalt], irow ? irow[DB_srpsalt] : row[DB_srpgN], irow ? irow[DB_srpverifier] : NULL, passin, bio_err, verbose))
+ {
+ BIO_printf(bio_err, "Invalid password for user \"%s\", operation abandoned.\n", user);
+ errors++;
+ goto err;
+ }
+ }
+ VERBOSE BIO_printf(bio_err,"Password for user \"%s\" ok.\n",user);
+
+ if (!(gNid=srp_create_user(user,&(row[DB_srpverifier]), &(row[DB_srpsalt]),gNrow?gNrow[DB_srpsalt]:NULL, gNrow?gNrow[DB_srpverifier]:NULL, passout, bio_err,verbose)))
+ {
+ BIO_printf(bio_err, "Cannot create srp verifier for user \"%s\", operation abandoned.\n", user);
+ errors++;
+ goto err;
+ }
+
+ row[DB_srptype][0] = 'v';
+ row[DB_srpgN] = BUF_strdup(gNid);
+
+ if (!row[DB_srpid] || !row[DB_srpgN] || !row[DB_srptype] || !row[DB_srpverifier] || !row[DB_srpsalt] ||
+ (userinfo && (!(row[DB_srpinfo] = BUF_strdup(userinfo)))))
+ goto err;
+
+ doupdatedb = 1;
+ }
+ }
+ }
+ else if (delete_user)
+ {
+ if (userindex < 0)
+ {
+ BIO_printf(bio_err, "user \"%s\" does not exist, operation ignored. t\n", user);
+ errors++;
+ }
+ else
+ {
+ char **xpp = sk_OPENSSL_PSTRING_value(db->db->data,userindex);
+ BIO_printf(bio_err, "user \"%s\" revoked. t\n", user);
+
+ xpp[DB_srptype][0] = 'R';
+
+ doupdatedb = 1;
+ }
+ }
+ if (--argc > 0)
+ user = *(argv++) ;
+ else
+ {
+ user = NULL;
+ list_user = 0;
+ }
+ }
+
+ VERBOSE BIO_printf(bio_err,"User procession done.\n");
+
+
+ if (doupdatedb)
+ {
+ /* Lets check some fields */
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(db->db->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(db->db->data,i);
+
+ if (pp[DB_srptype][0] == 'v')
+ {
+ pp[DB_srptype][0] = 'V';
+ print_user(db, bio_err, i, verbose);
+ }
+ }
+
+ VERBOSE BIO_printf(bio_err, "Trying to update srpvfile.\n");
+ if (!save_index(dbfile, "new", db)) goto err;
+
+ VERBOSE BIO_printf(bio_err, "Temporary srpvfile created.\n");
+ if (!rotate_index(dbfile, "new", "old")) goto err;
+
+ VERBOSE BIO_printf(bio_err, "srpvfile updated.\n");
+ }
+
+ ret = (errors != 0);
+err:
+ if (errors != 0)
+ VERBOSE BIO_printf(bio_err,"User errors %d.\n",errors);
+
+ VERBOSE BIO_printf(bio_err,"SRP terminating with code %d.\n",ret);
+ if(tofree)
+ OPENSSL_free(tofree);
+ if (ret) ERR_print_errors(bio_err);
+ if (randfile) app_RAND_write_file(randfile, bio_err);
+ if (conf) NCONF_free(conf);
+ if (db) free_index(db);
+
+ OBJ_cleanup();
+ apps_shutdown();
+ OPENSSL_EXIT(ret);
+ }
+
+
+
+#endif
+
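
The new srp app added above maintains the SRP verifier file consumed by
the TLS-SRP cipher suites. A possible session, assuming an existing
(possibly empty) verifier file passwd.srpv; the file name and user are
examples only:

  $ openssl srp -srpvfile passwd.srpv -gn 1024 -add alice
  $ openssl srp -srpvfile passwd.srpv -list alice
  $ openssl srp -srpvfile passwd.srpv -delete alice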
diff --git a/apps/verify.c b/apps/verify.c
index 9163997..893670f 100644
--- a/apps/verify.c
+++ b/apps/verify.c
@@ -222,25 +222,37 @@ int MAIN(int argc, char **argv)
goto end;
}
- if (argc < 1) check(cert_ctx, NULL, untrusted, trusted, crls, e);
+ ret = 0;
+ if (argc < 1)
+ {
+ if (1 != check(cert_ctx, NULL, untrusted, trusted, crls, e))
+ ret = -1;
+ }
else
+ {
+ for (i=0; i<argc; i++)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
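
With the verify change above, the exit status now tracks the
verification result instead of always being 0, so scripts can branch on
it; the certificate name below is an example only:

  $ openssl verify untrusted-cert.pem || echo "verification failed"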
diff --git a/check-all-builds.sh b/check-all-builds.sh
new file mode 100755
index 0000000..98dc391
--- /dev/null
+++ b/check-all-builds.sh
@@ -0,0 +1,628 @@
+#!/bin/sh
+#
+
+set -e
+export LANG=C
+export LC_ALL=C
+
+PROGDIR=$(dirname "$0")
+PROGNAME=$(basename "$0")
+
+panic () {
+ echo "ERROR: $@"
+ exit 1
+}
+
+VERBOSE=1
+
+# Dump message if $VERBOSE >= $1.
+# $1+: message.
+dump_n () {
+ local LOG_LEVEL=$1
+ shift
+ if [ "$VERBOSE" -ge "$LOG_LEVEL" ]; then
+ printf "%s\n" "$@"
+ fi
+}
+
+# Dump a message unless --quiet is used.
+# $1+: message.
+dump () {
+ dump_n 1 "$@"
+}
+
+# Dump a message if --verbose is used only.
+# $1+: message.
+log () {
+ dump_n 2 "$@"
+}
+
+# Run a command silently, unless --verbose or '--verbose --verbose'
+# is used.
+# $1+: Command
+# Return: command status.
+run () {
+ log "COMMAND: $*"
+ case $VERBOSE in
+ 0)
+ "$@" >/dev/null 2>&1 || return $?
+ ;;
+ 1)
+ "$@" >/dev/null || return $?
+ ;;
+ *)
+ "$@" || return $?
+ ;;
+ esac
+}
+
+# $1: string
+# Out: input string, with capital letters replaced by small ones.
+tolower () {
+ echo "$1" | tr '[A-Z]' '[a-z]'
+}
+
+# Return value of a given variable.
+# $1: Variable name
+var_value () {
+ eval printf \"%s\" \"\$$1\"
+}
+
+# Remove some items from a list
+# $1: input space-separated list
+# $2: space-separated list of items to remove from 1
+# Out: items of $1 without items of $2
+filter_out () {
+ local TMP=$(mktemp)
+ local RESULT
+ printf "" > $TMP
+ echo "$2" | tr ' ' '\n' > $TMP
+ RESULT=$(echo "$1" | tr ' ' '\n' | fgrep -x -v -f $TMP | tr '\n' ' ')
+ rm -f $TMP
+ echo "$RESULT"
+}
+
+src_to_obj () {
+ case $1 in
+ *.c)
+ echo ${1%%.c}.o
+ ;;
+ *.S)
+ echo ${1%%.S}.o
+ ;;
+ *)
+ echo $1
+ ;;
+ esac
+}
+
+# Determine host operating system.
+HOST_OS=$(uname -s)
+case $HOST_OS in
+ Linux)
+ HOST_OS=linux
+ ;;
+ Darwin)
+ HOST_OS=darwin
+ ;;
+esac
+
+# Determine host architecture
+HOST_ARCH=$(uname -m)
+case $HOST_ARCH in
+ i?86)
+ HOST_ARCH=x86
+ ;;
+esac
+
+ANDROID_HOST_TAG=$HOST_OS-$HOST_ARCH
+
+case $ANDROID_HOST_TAG in
+ linux-x86_64|darwin-x86_64)
+ ANDROID_HOST_TAG=$HOST_OS-x86
+ ;;
+ *)
+ panic "Sorry, this script can only run on 64-bit Linux or Darwin"
+esac
+
+# Determine number of cores
+case $HOST_OS in
+ linux)
+ NUM_CORES=$(grep -c "processor" /proc/cpuinfo)
+ ;;
+ darwin)
+ NUM_CORES=$(sysctl -n hw.ncpu)
+ ;;
+ *)
+ NUM_CORES=1
+ ;;
+esac
+
+# The list of supported Android target architectures.
+ANDROID_ARCHS="arm x86 mips"
+
+BUILD_TYPES=
+for ARCH in $ANDROID_ARCHS; do
+ BUILD_TYPES="$BUILD_TYPES android-$ARCH"
+done
+ANDROID_BUILD_TYPES=$BUILD_TYPES
+
+# NOTE: The $HOST_OS-x86_64 build is currently broken because the single
+# generated configuration header is tailored for 32-bit builds.
+HOST_BUILD_TYPES="$HOST_OS-x86 $HOST_OS-generic32 $HOST_OS-generic64"
+
+BUILD_TYPES="$ANDROID_BUILD_TYPES $HOST_BUILD_TYPES"
+
+# Parse command-line
+DO_HELP=
+SRC_DIR=$(cd $PROGDIR && pwd)
+OUT_DIR=out
+BUILD_DIR=
+BUILD_TYPES=
+NUM_JOBS=$NUM_CORES
+ANDROID_BUILD_TOP=$(cd $PROGDIR/../.. && pwd)
+for OPT; do
+ case $OPT in
+ --help|-h|-?)
+ DO_HELP=true
+ ;;
+ --build-dir=*)
+ BUILD_DIR=${OPT##--build-dir=}
+ ;;
+ --verbose)
+ VERBOSE=$(( $VERBOSE + 1 ))
+ ;;
+ --jobs=*)
+ NUM_JOBS=${OPT##--jobs=}
+ ;;
+ --quiet)
+ VERBOSE=$(( $VERBOSE - 1 ))
+ ;;
+ -j*)
+ NUM_JOBS=${OPT##-j}
+ ;;
+ -*)
+ panic "Unknown option '$OPT', see --help for details."
+ ;;
+ *)
+ BUILD_TYPES="$BUILD_TYPES $OPT"
+ ;;
+ esac
+done
+
+# Print help when needed.
+if [ "$DO_HELP" ]; then
+ echo \
+"Usage: $PROGNAME [options] [ ...]
+
+This script is used to ensure that all OpenSSL build variants compile
+properly. It can be used after modifying external/openssl/openssl.config
+and re-running import_openssl.sh to check that any changes didn't break
+the build.
+
+A <build-type> is a description of a given build of the library and its
+program. Its format is:
+
+   <compiler>-<os>-<arch>
+
+Where: <compiler> is either 'gcc' or 'clang'.
+       <os>       is 'android', 'linux' or 'darwin'.
+       <arch>     is 'arm', 'x86' or 'mips'.
+
+By default, it rebuilds the sources for the following build types:
+"
+ for BUILD_TYPE in $BUILD_TYPES; do
+ echo " $BUILD_TYPE"
+ done
+
+ echo \
+"However, you can pass custom values on the command-line instead.
+
+This script generates a custom Makefile in a temporary directory, then
+launches 'make' in it to build all binaries in parallel. In case of
+problems, you can use the --build-dir=<path> option to specify a custom
+build directory, which will _not_ be removed when the script exits.
+
+For example, to better see why a build fails:
+
+ ./$PROGNAME --build-dir=/tmp/mydir
+ make -C /tmp/mydir V=1
+
+Valid options:
+
+ --help|-h|-? Print this message.
+  --build-dir=<path>  Specify build directory.
+  --jobs=<count>      Run <count> parallel build jobs [$NUM_JOBS].
+  -j<count>           Same as --jobs=<count>.
+ --verbose Increase verbosity.
+ --quiet Decrease verbosity.
+"
+ exit 0
+fi
+
+log "Host OS: $HOST_OS"
+log "Host arch: $HOST_ARCH"
+log "Host CPU count: $NUM_CORES"
+
+if [ -z "$BUILD_TYPES" ]; then
+ BUILD_TYPES="$ANDROID_BUILD_TYPES $HOST_BUILD_TYPES"
+fi
+log "Build types: $BUILD_TYPES"
+
+if [ -z "$BUILD_DIR" ]; then
+ # Create a temporary directory, ensure it gets destroyed properly
+ # when the script exits.
+ BUILD_DIR=$(mktemp -d)
+ clean_build_dir () {
+ log "Cleaning up temporary directory: $BUILD_DIR"
+ rm -rf "$BUILD_DIR"
+ exit $1
+ }
+ trap "clean_build_dir 0" EXIT
+ trap "clean_build_dir \$?" INT HUP QUIT TERM
+ log "Using temporary build directory: $BUILD_DIR"
+else
+ log "Using user build directory: $BUILD_DIR"
+fi
+
+mkdir -p "$BUILD_DIR" && rm -rf "$BUILD_DIR"/*
+
+MAKEFILE=$BUILD_DIR/GNUmakefile
+
+# Return source files for a given module and architecture.
+# $1: module prefix (e.g. CRYPTO)
+# $2: build arch.
+get_module_src_files_for_arch () {
+ local prefix=$1
+ local arch=$2
+ local src_files="$(var_value OPENSSL_${prefix}_SOURCES)"
+ src_files="$src_files $(var_value OPENSSL_${prefix}_SOURCES_${arch})"
+ local exclude_files="$(var_value OPENSSL_${prefix}_SOURCES_EXCLUDES_${arch})"
+ src_files=$(filter_out "$src_files" "$exclude_files")
+ echo "$src_files"
+}
+
+# Return the compiler defines for a given module and architecture
+# $1: module prefix (e.g. CRYPTO)
+# $2: build arch.
+get_module_defines_for_arch () {
+ local prefix=$1
+ local arch=$2
+ local defines="$(var_value OPENSSL_${prefix}_DEFINES)"
+ defines="$defines $(var_value OPENSSL_${prefix}_DEFINES_${arch})"
+ echo "$defines"
+}
+
+# $1: module prefix (e.g. CRYPTO)
+get_module_c_includes () {
+ var_value OPENSSL_$1_INCLUDES
+}
+
+# $1: build type (e.g. gcc-android-arm)
+# Out: build arch.
+get_build_arch () {
+ echo "$1" | cut -d- -f3
+}
+
+# $1: build arch
+# Out: GNU configuration target (e.g. arm-linux-androideabi)
+get_build_arch_target () {
+ case $1 in
+ arm)
+ echo "arm-linux-androideabi"
+ ;;
+ x86)
+ echo "i686-linux-android"
+ ;;
+ mips)
+ echo "mipsel-linux-android"
+ ;;
+ *)
+ echo "$1-linux-android"
+ ;;
+ esac
+}
+
+GCC_VERSION=4.7
+CLANG_VERSION=3.1
+
+get_prebuilt_gcc_dir_for_arch () {
+ local arch=$1
+ local target=$(get_build_arch_target $arch)
+ echo "$ANDROID_BUILD_TOP/prebuilts/gcc/$ANDROID_HOST_TAG/$arch/$target-$GCC_VERSION"
+}
+
+get_prebuilt_clang () {
+ echo "$ANDROID_BUILD_TOP/prebuilts/clang/$ANDROID_HOST_TAG/$CLANG_VERSION/clang"
+}
+
+get_prebuilt_ndk_sysroot_for_arch () {
+ echo "$ANDROID_BUILD_TOP/prebuilts/ndk/current/platforms/android-9/arch-$1"
+}
+
+get_c_runtime_file () {
+ local build_type=$1
+ local arch=$(get_build_arch $build_type)
+ local filename=$2
+ echo "$(get_prebuilt_ndk_sysroot_for_arch $arch)/usr/lib/$filename"
+}
+
+# $1: build type (e.g. gcc-android-arm)
+get_build_compiler () {
+ local arch=$(get_build_arch $1)
+ local target=$(get_build_arch_target $arch)
+ local gcc_dir=$(get_prebuilt_gcc_dir_for_arch $arch);
+ local result
+
+ # Get the toolchain binary.
+ case $1 in
+ gcc-android-*)
+ result="$gcc_dir/bin/$target-gcc"
+ ;;
+ clang-android-*)
+ result="$(get_prebuilt_clang) -target $target -B$gcc_dir/$target/bin -I$gcc_dir/lib/gcc/$target/$GCC_VERSION/include"
+ ;;
+ gcc-*)
+ result=gcc
+ ;;
+ clang-*) # Must have host clang compiler.
+ result=clang
+ ;;
+ esac
+
+ compiler_check=$(which $result 2>/dev/null || echo "")
+ if [ -z "$compiler_check" ]; then
+ panic "Could not find compiler: $result"
+ fi
+
+ # Get the Android sysroot if needed.
+ case $1 in
+ *-android-*)
+ result="$result --sysroot=$(get_prebuilt_ndk_sysroot_for_arch $arch)"
+ ;;
+ esac
+
+ # Force -m32 flag when needed for 32-bit builds.
+ case $1 in
+ *-linux-x86|*-darwin-x86|*-generic32)
+ result="$result -m32"
+ ;;
+ esac
+ echo "$result"
+}
+
+# $1: build type.
+# Out: common compiler flags for this build.
+get_build_c_flags () {
+ local result="-O2 -fPIC"
+ case $1 in
+ *-android-arm)
+ result="$result -march=armv7-a -mfpu=vfpv3-d16"
+ ;;
+ esac
+
+ case $1 in
+ *-generic32|*-generic64)
+ # Generic builds do not compile without this flag.
+ result="$result -DOPENSSL_NO_ASM"
+ ;;
+ esac
+ echo "$result"
+}
+
+# $1: build type.
+# Out: linker for this build.
+get_build_linker () {
+ get_build_compiler $1
+}
+
+clear_sources () {
+ g_all_objs=""
+}
+
+# Generate build instructions to compile source files.
+# Also update g_all_objs.
+# $1: module prefix (e.g. CRYPTO)
+# $2: build type
+build_sources () {
+ local prefix=$1
+ local build_type=$2
+ echo "## build_sources prefix='$prefix' build_type='$build_type'"
+ local arch=$(get_build_arch $build_type)
+ local src_files=$(get_module_src_files_for_arch $prefix $arch)
+ local c_defines=$(get_module_defines_for_arch $prefix $arch)
+ local c_includes=$(get_module_c_includes $prefix "$SRC_DIR")
+ local build_cc=$(get_build_compiler $build_type)
+ local build_cflags=$(get_build_c_flags $build_type)
+ local build_linker=$(get_build_linker $build_type)
+ local src obj def inc
+
+ printf "OUT_DIR := $OUT_DIR/$build_type\n\n"
+ printf "BUILD_CC := $build_cc\n\n"
+ printf "BUILD_LINKER := $build_linker\n\n"
+ printf "BUILD_CFLAGS := $build_cflags"
+ for inc in $c_includes; do
+ printf " -I\$(SRC_DIR)/$inc"
+ done
+ for def in $c_defines; do
+ printf " -D$def"
+ done
+ printf "\n\n"
+ printf "BUILD_OBJECTS :=\n\n"
+
+ case $build_type in
+ clang-android-*)
+ # The version of clang that comes with the platform build doesn't
+ # support simple linking of shared libraries and executables. One
+ # has to provide the C runtime files explicitly.
+ local crtbegin_so=$(get_c_runtime_file $build_type crtbegin_so.o)
+ local crtend_so=$(get_c_runtime_file $build_type crtend_so.o)
+ local crtbegin_exe=$(get_c_runtime_file $build_type crtbegin_dynamic.o)
+ local crtend_exe=$(get_c_runtime_file $build_type crtend_android.o)
+ printf "CRTBEGIN_SO := $crtbegin_so\n"
+ printf "CRTEND_SO := $crtend_so\n"
+ printf "CRTBEGIN_EXE := $crtbegin_exe\n"
+ printf "CRTEND_EXE := $crtend_exe\n"
+ printf "\n"
+ ;;
+ esac
+
+ for src in $src_files; do
+ obj=$(src_to_obj $src)
+ g_all_objs="$g_all_objs $obj"
+ printf "OBJ := \$(OUT_DIR)/$obj\n"
+ printf "BUILD_OBJECTS += \$(OBJ)\n"
+ printf "\$(OBJ): PRIVATE_CC := \$(BUILD_CC)\n"
+ printf "\$(OBJ): PRIVATE_CFLAGS := \$(BUILD_CFLAGS)\n"
+ printf "\$(OBJ): \$(SRC_DIR)/$src\n"
+ printf "\t@echo [$build_type] CC $src\n"
+ printf "\t@mkdir -p \$\$(dirname \$@)\n"
+ printf "\t\$(hide) \$(PRIVATE_CC) \$(PRIVATE_CFLAGS) -c -o \$@ \$<\n"
+ printf "\n"
+ done
+ printf "\n"
+}
+
+# $1: library name (e.g. crypto).
+# $2: module prefix (e.g. CRYPTO).
+# $3: build type.
+# $4: source directory.
+# $5: output directory.
+build_shared_library () {
+ local name=$1
+ local prefix=$2
+ local build_type=$3
+ local src_dir="$4"
+ local out_dir="$5"
+ local shlib="lib${name}.so"
+ local build_linker=$(get_build_linker $build_type)
+ clear_sources
+ build_sources $prefix $build_type
+
+ # TODO(digit): Make the clang build link properly.
+ printf "SHLIB=\$(OUT_DIR)/$shlib\n"
+ printf "\$(SHLIB): PRIVATE_LINKER := \$(BUILD_LINKER)\n"
+ case $build_type in
+ clang-android-*)
+ printf "\$(SHLIB): PRIVATE_CRTBEGIN := \$(CRTBEGIN_SO)\n"
+ printf "\$(SHLIB): PRIVATE_CRTEND := \$(CRTEND_SO)\n"
+ ;;
+ esac
+ printf "\$(SHLIB): \$(BUILD_OBJECTS)\n"
+ printf "\t@echo [$build_type] SHARED_LIBRARY $(basename $shlib)\n"
+ printf "\t@mkdir -p \$\$(dirname \$@)\n"
+ case $build_type in
+ clang-android-*)
+ printf "\t\$(hide) \$(PRIVATE_LINKER) -nostdlib -shared -o \$@ \$(PRIVATE_CRTBEGIN) \$^ \$(PRIVATE_CRTEND)\n"
+ ;;
+ *)
+ printf "\t\$(hide) \$(PRIVATE_LINKER) -shared -o \$@ \$^\n"
+ ;;
+ esac
+ printf "\n"
+}
+
+# $1: executable name.
+# $2: module prefix (e.g. APPS).
+# $3: build type.
+# $4: source directory.
+# $5: output directory.
+# $6: dependent shared libraries (e.g. 'crypto ssl')
+build_executable () {
+ local name=$1
+ local prefix=$2
+ local build_type=$3
+ local src_dir="$4"
+ local out_dir="$5"
+ local shlibs="$6"
+ local build_linker=$(get_build_linker $build_type)
+ clear_sources
+ build_sources $prefix $build_type
+
+ # TODO(digit): Make the clang build link properly.
+ exec=$name
+ all_shlibs=
+ printf "EXEC := \$(OUT_DIR)/$name\n"
+ printf "openssl_all: \$(EXEC)\n"
+ printf "\$(EXEC): PRIVATE_LINKER := \$(BUILD_LINKER)\n"
+ printf "\$(EXEC): \$(BUILD_OBJECTS)"
+ for lib in $shlibs; do
+ printf " \$(OUT_DIR)/lib${lib}.so"
+ done
+ printf "\n"
+ printf "\t@echo [$build_type] EXECUTABLE $name\n"
+ printf "\t@mkdir -p \$\$(dirname \$@)\n"
+ printf "\t\$(hide) \$(PRIVATE_LINKER) -o \$@ \$^\n"
+ printf "\n"
+}
+
+ALL_BUILDS=
+
+generate_openssl_build () {
+ local build_type=$1
+ local out="$OUT_DIR/$build_type"
+ ALL_BUILDS="$ALL_BUILDS $build_type"
+ echo "# Build type: $build_type"
+ build_shared_library crypto CRYPTO $build_type "$SRC_DIR" "$out"
+ build_shared_library ssl SSL $build_type "$SRC_DIR" "$out"
+ build_executable openssl APPS $build_type "$SRC_DIR" "$out" "crypto ssl"
+}
+
+generate_makefile () {
+ echo \
+"# Auto-generated by $PROGDIR - do not edit
+
+.PHONY: openssl_all
+
+all: openssl_all
+
+# Use 'make V=1' to print build commands.
+ifeq (1,\$(V))
+hide :=
+else
+hide := @
+endif
+
+SRC_DIR=$SRC_DIR
+OUT_DIR=$OUT_DIR
+"
+
+ for BUILD_TYPE in $BUILD_TYPES; do
+ generate_openssl_build gcc-$BUILD_TYPE
+ done
+
+# TODO(digit): Make the Clang build run.
+# for BUILD_TYPE in $ANDROID_BUILD_TYPES; do
+# generate_openssl_build clang-$BUILD_TYPE
+# done
+}
+
+. $SRC_DIR/openssl.config
+
+
+
+dump "Generating Makefile"
+log "Makefile path: $MAKEFILE"
+generate_makefile > $MAKEFILE
+
+dump "Building libraries with $NUM_JOBS jobs"
+dump "For the following builds:"
+for BUILD in $ALL_BUILDS; do
+ dump " $BUILD"
+done
+MAKE_FLAGS="-j$NUM_JOBS"
+if [ "$VERBOSE" -gt 2 ]; then
+ MAKE_FLAGS="$MAKE_FLAGS V=1"
+fi
+run make $MAKE_FLAGS -f "$MAKEFILE" -C "$BUILD_DIR"
+case $? in
+ 0)
+ dump "All OK, congratulations!"
+ ;;
+ *)
+ dump "Error, try doing the following to inspect the issues:"
+ dump " $PROGNAME --build-dir=/tmp/mybuild"
+ dump " make -C /tmp/mybuild V=1"
+ dump ""
+ ;;
+esac
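
Typical invocations of check-all-builds.sh, following its own help text:

  $ ./check-all-builds.sh --jobs=8 android-arm linux-generic32
  $ ./check-all-builds.sh --build-dir=/tmp/mydir
  $ make -C /tmp/mydir V=1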
diff --git a/crypto/Android.mk b/crypto/Android.mk
deleted file mode 100644
index a5bfe3c..0000000
--- a/crypto/Android.mk
+++ /dev/null
@@ -1,589 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-
-arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
-mips_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM
-
-arm_src_files := \
- aes/asm/aes-armv4.s \
- bn/asm/armv4-mont.s \
- bn/bn_asm.c \
- sha/asm/sha1-armv4-large.s \
- sha/asm/sha256-armv4.s \
- sha/asm/sha512-armv4.s
-
-mips_src_files := \
- aes/asm/aes-mips.s \
- bn/asm/bn-mips.s \
- bn/asm/mips-mont.s \
- sha/asm/sha1-mips.s \
- sha/asm/sha256-mips.s
-
-other_arch_src_files := \
- aes/aes_core.c \
- bn/bn_asm.c
-
-local_src_files := \
- cryptlib.c \
- mem.c \
- mem_clr.c \
- mem_dbg.c \
- cversion.c \
- ex_data.c \
- cpt_err.c \
- ebcdic.c \
- uid.c \
- o_time.c \
- o_str.c \
- o_dir.c \
- aes/aes_cbc.c \
- aes/aes_cfb.c \
- aes/aes_ctr.c \
- aes/aes_ecb.c \
- aes/aes_misc.c \
- aes/aes_ofb.c \
- aes/aes_wrap.c \
- asn1/a_bitstr.c \
- asn1/a_bool.c \
- asn1/a_bytes.c \
- asn1/a_d2i_fp.c \
- asn1/a_digest.c \
- asn1/a_dup.c \
- asn1/a_enum.c \
- asn1/a_gentm.c \
- asn1/a_i2d_fp.c \
- asn1/a_int.c \
- asn1/a_mbstr.c \
- asn1/a_object.c \
- asn1/a_octet.c \
- asn1/a_print.c \
- asn1/a_set.c \
- asn1/a_sign.c \
- asn1/a_strex.c \
- asn1/a_strnid.c \
- asn1/a_time.c \
- asn1/a_type.c \
- asn1/a_utctm.c \
- asn1/a_utf8.c \
- asn1/a_verify.c \
- asn1/ameth_lib.c \
- asn1/asn1_err.c \
- asn1/asn1_gen.c \
- asn1/asn1_lib.c \
- asn1/asn1_par.c \
- asn1/asn_mime.c \
- asn1/asn_moid.c \
- asn1/asn_pack.c \
- asn1/bio_asn1.c \
- asn1/bio_ndef.c \
- asn1/d2i_pr.c \
- asn1/d2i_pu.c \
- asn1/evp_asn1.c \
- asn1/f_enum.c \
- asn1/f_int.c \
- asn1/f_string.c \
- asn1/i2d_pr.c \
- asn1/i2d_pu.c \
- asn1/n_pkey.c \
- asn1/nsseq.c \
- asn1/p5_pbe.c \
- asn1/p5_pbev2.c \
- asn1/p8_pkey.c \
- asn1/t_bitst.c \
- asn1/t_crl.c \
- asn1/t_pkey.c \
- asn1/t_req.c \
- asn1/t_spki.c \
- asn1/t_x509.c \
- asn1/t_x509a.c \
- asn1/tasn_dec.c \
- asn1/tasn_enc.c \
- asn1/tasn_fre.c \
- asn1/tasn_new.c \
- asn1/tasn_prn.c \
- asn1/tasn_typ.c \
- asn1/tasn_utl.c \
- asn1/x_algor.c \
- asn1/x_attrib.c \
- asn1/x_bignum.c \
- asn1/x_crl.c \
- asn1/x_exten.c \
- asn1/x_info.c \
- asn1/x_long.c \
- asn1/x_name.c \
- asn1/x_nx509.c \
- asn1/x_pkey.c \
- asn1/x_pubkey.c \
- asn1/x_req.c \
- asn1/x_sig.c \
- asn1/x_spki.c \
- asn1/x_val.c \
- asn1/x_x509.c \
- asn1/x_x509a.c \
- bf/bf_cfb64.c \
- bf/bf_ecb.c \
- bf/bf_enc.c \
- bf/bf_ofb64.c \
- bf/bf_skey.c \
- bio/b_dump.c \
- bio/b_print.c \
- bio/b_sock.c \
- bio/bf_buff.c \
- bio/bf_nbio.c \
- bio/bf_null.c \
- bio/bio_cb.c \
- bio/bio_err.c \
- bio/bio_lib.c \
- bio/bss_acpt.c \
- bio/bss_bio.c \
- bio/bss_conn.c \
- bio/bss_dgram.c \
- bio/bss_fd.c \
- bio/bss_file.c \
- bio/bss_log.c \
- bio/bss_mem.c \
- bio/bss_null.c \
- bio/bss_sock.c \
- bn/bn_add.c \
- bn/bn_blind.c \
- bn/bn_const.c \
- bn/bn_ctx.c \
- bn/bn_div.c \
- bn/bn_err.c \
- bn/bn_exp.c \
- bn/bn_exp2.c \
- bn/bn_gcd.c \
- bn/bn_gf2m.c \
- bn/bn_kron.c \
- bn/bn_lib.c \
- bn/bn_mod.c \
- bn/bn_mont.c \
- bn/bn_mpi.c \
- bn/bn_mul.c \
- bn/bn_nist.c \
- bn/bn_prime.c \
- bn/bn_print.c \
- bn/bn_rand.c \
- bn/bn_recp.c \
- bn/bn_shift.c \
- bn/bn_sqr.c \
- bn/bn_sqrt.c \
- bn/bn_word.c \
- buffer/buf_err.c \
- buffer/buffer.c \
- comp/c_rle.c \
- comp/c_zlib.c \
- comp/comp_err.c \
- comp/comp_lib.c \
- conf/conf_api.c \
- conf/conf_def.c \
- conf/conf_err.c \
- conf/conf_lib.c \
- conf/conf_mall.c \
- conf/conf_mod.c \
- conf/conf_sap.c \
- des/cbc_cksm.c \
- des/cbc_enc.c \
- des/cfb64ede.c \
- des/cfb64enc.c \
- des/cfb_enc.c \
- des/des_enc.c \
- des/des_old.c \
- des/des_old2.c \
- des/ecb3_enc.c \
- des/ecb_enc.c \
- des/ede_cbcm_enc.c \
- des/enc_read.c \
- des/enc_writ.c \
- des/fcrypt.c \
- des/fcrypt_b.c \
- des/ofb64ede.c \
- des/ofb64enc.c \
- des/ofb_enc.c \
- des/pcbc_enc.c \
- des/qud_cksm.c \
- des/rand_key.c \
- des/read2pwd.c \
- des/rpc_enc.c \
- des/set_key.c \
- des/str2key.c \
- des/xcbc_enc.c \
- dh/dh_ameth.c \
- dh/dh_asn1.c \
- dh/dh_check.c \
- dh/dh_depr.c \
- dh/dh_err.c \
- dh/dh_gen.c \
- dh/dh_key.c \
- dh/dh_lib.c \
- dh/dh_pmeth.c \
- dsa/dsa_ameth.c \
- dsa/dsa_asn1.c \
- dsa/dsa_depr.c \
- dsa/dsa_err.c \
- dsa/dsa_gen.c \
- dsa/dsa_key.c \
- dsa/dsa_lib.c \
- dsa/dsa_ossl.c \
- dsa/dsa_pmeth.c \
- dsa/dsa_prn.c \
- dsa/dsa_sign.c \
- dsa/dsa_vrf.c \
- dso/dso_dl.c \
- dso/dso_dlfcn.c \
- dso/dso_err.c \
- dso/dso_lib.c \
- dso/dso_null.c \
- dso/dso_openssl.c \
- ec/ec2_mult.c \
- ec/ec2_smpl.c \
- ec/ec_ameth.c \
- ec/ec_asn1.c \
- ec/ec_check.c \
- ec/ec_curve.c \
- ec/ec_cvt.c \
- ec/ec_err.c \
- ec/ec_key.c \
- ec/ec_lib.c \
- ec/ec_mult.c \
- ec/ec_pmeth.c \
- ec/ec_print.c \
- ec/eck_prn.c \
- ec/ecp_mont.c \
- ec/ecp_nist.c \
- ec/ecp_smpl.c \
- ecdh/ech_err.c \
- ecdh/ech_key.c \
- ecdh/ech_lib.c \
- ecdh/ech_ossl.c \
- ecdsa/ecs_asn1.c \
- ecdsa/ecs_err.c \
- ecdsa/ecs_lib.c \
- ecdsa/ecs_ossl.c \
- ecdsa/ecs_sign.c \
- ecdsa/ecs_vrf.c \
- err/err.c \
- err/err_all.c \
- err/err_prn.c \
- evp/bio_b64.c \
- evp/bio_enc.c \
- evp/bio_md.c \
- evp/bio_ok.c \
- evp/c_all.c \
- evp/c_allc.c \
- evp/c_alld.c \
- evp/digest.c \
- evp/e_aes.c \
- evp/e_bf.c \
- evp/e_des.c \
- evp/e_des3.c \
- evp/e_null.c \
- evp/e_old.c \
- evp/e_rc2.c \
- evp/e_rc4.c \
- evp/e_rc5.c \
- evp/e_xcbc_d.c \
- evp/encode.c \
- evp/evp_acnf.c \
- evp/evp_enc.c \
- evp/evp_err.c \
- evp/evp_key.c \
- evp/evp_lib.c \
- evp/evp_pbe.c \
- evp/evp_pkey.c \
- evp/m_dss.c \
- evp/m_dss1.c \
- evp/m_ecdsa.c \
- evp/m_md4.c \
- evp/m_md5.c \
- evp/m_mdc2.c \
- evp/m_null.c \
- evp/m_ripemd.c \
- evp/m_sha1.c \
- evp/m_sigver.c \
- evp/m_wp.c \
- evp/names.c \
- evp/p5_crpt.c \
- evp/p5_crpt2.c \
- evp/p_dec.c \
- evp/p_enc.c \
- evp/p_lib.c \
- evp/p_open.c \
- evp/p_seal.c \
- evp/p_sign.c \
- evp/p_verify.c \
- evp/pmeth_fn.c \
- evp/pmeth_gn.c \
- evp/pmeth_lib.c \
- hmac/hm_ameth.c \
- hmac/hm_pmeth.c \
- hmac/hmac.c \
- krb5/krb5_asn.c \
- lhash/lh_stats.c \
- lhash/lhash.c \
- md4/md4_dgst.c \
- md4/md4_one.c \
- md5/md5_dgst.c \
- md5/md5_one.c \
- modes/cbc128.c \
- modes/cfb128.c \
- modes/ctr128.c \
- modes/ofb128.c \
- objects/o_names.c \
- objects/obj_dat.c \
- objects/obj_err.c \
- objects/obj_lib.c \
- objects/obj_xref.c \
- ocsp/ocsp_asn.c \
- ocsp/ocsp_cl.c \
- ocsp/ocsp_err.c \
- ocsp/ocsp_ext.c \
- ocsp/ocsp_ht.c \
- ocsp/ocsp_lib.c \
- ocsp/ocsp_prn.c \
- ocsp/ocsp_srv.c \
- ocsp/ocsp_vfy.c \
- pem/pem_all.c \
- pem/pem_err.c \
- pem/pem_info.c \
- pem/pem_lib.c \
- pem/pem_oth.c \
- pem/pem_pk8.c \
- pem/pem_pkey.c \
- pem/pem_seal.c \
- pem/pem_sign.c \
- pem/pem_x509.c \
- pem/pem_xaux.c \
- pem/pvkfmt.c \
- pkcs12/p12_add.c \
- pkcs12/p12_asn.c \
- pkcs12/p12_attr.c \
- pkcs12/p12_crpt.c \
- pkcs12/p12_crt.c \
- pkcs12/p12_decr.c \
- pkcs12/p12_init.c \
- pkcs12/p12_key.c \
- pkcs12/p12_kiss.c \
- pkcs12/p12_mutl.c \
- pkcs12/p12_npas.c \
- pkcs12/p12_p8d.c \
- pkcs12/p12_p8e.c \
- pkcs12/p12_utl.c \
- pkcs12/pk12err.c \
- pkcs7/pk7_asn1.c \
- pkcs7/pk7_attr.c \
- pkcs7/pk7_doit.c \
- pkcs7/pk7_lib.c \
- pkcs7/pk7_mime.c \
- pkcs7/pk7_smime.c \
- pkcs7/pkcs7err.c \
- rand/md_rand.c \
- rand/rand_egd.c \
- rand/rand_err.c \
- rand/rand_lib.c \
- rand/rand_unix.c \
- rand/randfile.c \
- rc2/rc2_cbc.c \
- rc2/rc2_ecb.c \
- rc2/rc2_skey.c \
- rc2/rc2cfb64.c \
- rc2/rc2ofb64.c \
- rc4/rc4_enc.c \
- rc4/rc4_skey.c \
- ripemd/rmd_dgst.c \
- ripemd/rmd_one.c \
- rsa/rsa_ameth.c \
- rsa/rsa_asn1.c \
- rsa/rsa_chk.c \
- rsa/rsa_eay.c \
- rsa/rsa_err.c \
- rsa/rsa_gen.c \
- rsa/rsa_lib.c \
- rsa/rsa_none.c \
- rsa/rsa_null.c \
- rsa/rsa_oaep.c \
- rsa/rsa_pk1.c \
- rsa/rsa_pmeth.c \
- rsa/rsa_prn.c \
- rsa/rsa_pss.c \
- rsa/rsa_saos.c \
- rsa/rsa_sign.c \
- rsa/rsa_ssl.c \
- rsa/rsa_x931.c \
- sha/sha1_one.c \
- sha/sha1dgst.c \
- sha/sha256.c \
- sha/sha512.c \
- sha/sha_dgst.c \
- stack/stack.c \
- ts/ts_err.c \
- txt_db/txt_db.c \
- ui/ui_compat.c \
- ui/ui_err.c \
- ui/ui_lib.c \
- ui/ui_openssl.c \
- ui/ui_util.c \
- x509/by_dir.c \
- x509/by_file.c \
- x509/x509_att.c \
- x509/x509_cmp.c \
- x509/x509_d2.c \
- x509/x509_def.c \
- x509/x509_err.c \
- x509/x509_ext.c \
- x509/x509_lu.c \
- x509/x509_obj.c \
- x509/x509_r2x.c \
- x509/x509_req.c \
- x509/x509_set.c \
- x509/x509_trs.c \
- x509/x509_txt.c \
- x509/x509_v3.c \
- x509/x509_vfy.c \
- x509/x509_vpm.c \
- x509/x509cset.c \
- x509/x509name.c \
- x509/x509rset.c \
- x509/x509spki.c \
- x509/x509type.c \
- x509/x_all.c \
- x509v3/pcy_cache.c \
- x509v3/pcy_data.c \
- x509v3/pcy_lib.c \
- x509v3/pcy_map.c \
- x509v3/pcy_node.c \
- x509v3/pcy_tree.c \
- x509v3/v3_akey.c \
- x509v3/v3_akeya.c \
- x509v3/v3_alt.c \
- x509v3/v3_bcons.c \
- x509v3/v3_bitst.c \
- x509v3/v3_conf.c \
- x509v3/v3_cpols.c \
- x509v3/v3_crld.c \
- x509v3/v3_enum.c \
- x509v3/v3_extku.c \
- x509v3/v3_genn.c \
- x509v3/v3_ia5.c \
- x509v3/v3_info.c \
- x509v3/v3_int.c \
- x509v3/v3_lib.c \
- x509v3/v3_ncons.c \
- x509v3/v3_ocsp.c \
- x509v3/v3_pci.c \
- x509v3/v3_pcia.c \
- x509v3/v3_pcons.c \
- x509v3/v3_pku.c \
- x509v3/v3_pmaps.c \
- x509v3/v3_prn.c \
- x509v3/v3_purp.c \
- x509v3/v3_skey.c \
- x509v3/v3_sxnet.c \
- x509v3/v3_utl.c \
- x509v3/v3err.c
-
-local_c_includes := \
- external/openssl \
- external/openssl/crypto/asn1 \
- external/openssl/crypto/evp \
- external/openssl/include \
- external/openssl/include/openssl \
- external/zlib
-
-local_c_flags := -DNO_WINDOWS_BRAINDEATH
-
-#######################################
-# target static library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-ifeq ($(TARGET_ARCH),arm)
-LOCAL_NDK_VERSION := 5
-LOCAL_SDK_VERSION := 9
-endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-endif
-ifeq ($(TARGET_ARCH),mips)
- ifneq (($TARGET_HAS_BIGENDIAN),true)
- LOCAL_SRC_FILES += $(mips_src_files)
- LOCAL_CFLAGS += $(mips_cflags)
- else
- LOCAL_SRC_FILES += $(other_arch_src_files)
- endif
-endif
-ifeq ($(TARGET_ARCH),x86)
- LOCAL_SRC_FILES += $(other_arch_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto_static
-include $(BUILD_STATIC_LIBRARY)
-
-#######################################
-# target shared library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-ifeq ($(TARGET_ARCH),arm)
-LOCAL_NDK_VERSION := 5
-LOCAL_SDK_VERSION := 9
-# Use the NDK prebuilt libz and libdl.
-LOCAL_LDFLAGS += -lz -ldl
-else
-LOCAL_SHARED_LIBRARIES += libz libdl
-endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-endif
-ifeq ($(TARGET_ARCH),mips)
- ifneq (($TARGET_HAS_BIGENDIAN),true)
- LOCAL_SRC_FILES += $(mips_src_files)
- LOCAL_CFLAGS += $(mips_cflags)
- else
- LOCAL_SRC_FILES += $(other_arch_src_files)
- endif
-endif
-ifeq ($(TARGET_ARCH),x86)
- LOCAL_SRC_FILES += $(other_arch_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto
-include $(BUILD_SHARED_LIBRARY)
-
-#######################################
-# host shared library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-LOCAL_C_INCLUDES += $(local_c_includes)
-LOCAL_SRC_FILES += $(other_arch_src_files)
-LOCAL_STATIC_LIBRARIES += libz
-LOCAL_LDLIBS += -ldl
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto
-include $(BUILD_HOST_SHARED_LIBRARY)
-
-########################################
-# host static library, which is used by some SDK tools.
-
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-LOCAL_C_INCLUDES += $(local_c_includes)
-LOCAL_SRC_FILES += $(other_arch_src_files)
-LOCAL_STATIC_LIBRARIES += libz
-LOCAL_LDLIBS += -ldl
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto_static
-include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/crypto/LPdir_win32.c b/crypto/LPdir_win32.c
new file mode 100644
index 0000000..e39872d
--- /dev/null
+++ b/crypto/LPdir_win32.c
@@ -0,0 +1,30 @@
+/* $LP: LPlib/source/LPdir_win32.c,v 1.3 2004/08/26 13:36:05 _cvs_levitte Exp $ */
+/*
+ * Copyright (c) 2004, Richard Levitte
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define LP_SYS_WIN32
+#define LP_MULTIBYTE_AVAILABLE
+#include "LPdir_win.c"
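The new crypto/LPdir_win32.c is a configuration shim: aside from the license block it only defines two feature macros and then textually #includes the shared LPdir_win.c body, so one directory-listing implementation serves several platform front-ends. A sketch of the same shim pattern, with every name hypothetical and the included body assumed to exist elsewhere (as LPdir_win.c does in this tree):

/* dir_win32_shim.c -- hypothetical analogue of LPdir_win32.c.
 * The shim is pure configuration; all code lives in the shared body
 * that is pasted into this translation unit by the #include. */
#define DIR_SYS_WIN32            /* select the Win32 branches in the body */
#define DIR_MULTIBYTE_AVAILABLE  /* allow multibyte-string API use        */
#include "dir_body.c"            /* hypothetical shared implementation    */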
diff --git a/crypto/aes/aes.h b/crypto/aes/aes.h
index d2c9973..031abf0 100644
--- a/crypto/aes/aes.h
+++ b/crypto/aes/aes.h
@@ -90,6 +90,11 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+
void AES_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key);
void AES_decrypt(const unsigned char *in, unsigned char *out,
diff --git a/crypto/aes/aes_core.c b/crypto/aes/aes_core.c
index a7ec54f..8f5210a 100644
--- a/crypto/aes/aes_core.c
+++ b/crypto/aes/aes_core.c
@@ -625,7 +625,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -726,7 +726,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -734,7 +734,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
@@ -1201,7 +1201,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
int i = 0;
@@ -1301,7 +1301,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -1309,7 +1309,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
diff --git a/crypto/aes/aes_misc.c b/crypto/aes/aes_misc.c
index 4fead1b..f083488 100644
--- a/crypto/aes/aes_misc.c
+++ b/crypto/aes/aes_misc.c
@@ -50,6 +50,7 @@
*/
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
#include <openssl/aes.h>
#include "aes_locl.h"
@@ -62,3 +63,23 @@ const char *AES_options(void) {
return "aes(partial)";
#endif
}
+
+/* FIPS wrapper functions to block low level AES calls in FIPS mode */
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_encrypt_key(userKey, bits, key);
+ }
+
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_decrypt_key(userKey, bits, key);
+ }
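The aes.h, aes_core.c, and aes_misc.c hunks above are one coordinated change: the real key expanders are renamed with a private_ prefix, and the public names become wrappers that call fips_cipher_abort() first, so FIPS builds can block direct low-level AES use. From the caller's side nothing changes, which a minimal sketch can show (assumptions: a non-FIPS build of the patched libcrypto, linked with -lcrypto):

/* Hedged sketch, not part of the patch: the public entry points keep
 * their names and signatures, so existing callers compile and run
 * unchanged against the patched library. */
#include <stdio.h>
#include <string.h>
#include <openssl/aes.h>

int main(void)
{
    unsigned char key[16], pt[16], ct[16], rt[16];
    AES_KEY enc, dec;

    memset(key, 0x2b, sizeof(key));
    memset(pt, 0x55, sizeof(pt));

    /* These now resolve to the aes_misc.c wrappers, which forward to
     * private_AES_set_{en,de}crypt_key after the FIPS-mode check. */
    if (AES_set_encrypt_key(key, 128, &enc) != 0 ||
        AES_set_decrypt_key(key, 128, &dec) != 0)
        return 1;

    AES_encrypt(pt, ct, &enc);
    AES_decrypt(ct, rt, &dec);

    printf("round-trip %s\n", memcmp(pt, rt, sizeof(pt)) == 0 ? "ok" : "FAILED");
    return 0;
}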
diff --git a/crypto/aes/asm/aes-586.S b/crypto/aes/asm/aes-586.S
new file mode 100644
index 0000000..20c0238
--- /dev/null
+++ b/crypto/aes/asm/aes-586.S
@@ -0,0 +1,3239 @@
+.file "aes-586.s"
+.text
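+# _x86_AES_encrypt_compact: rounds that use only the 256-byte S-box
+# (addressed from -128(%ebp)) and compute MixColumns arithmetically,
+# trading speed for a far smaller table footprint and thus less
+# cache-timing exposure. The eight plain loads after the frame setup
+# touch the selected S-box copy to pull it into cache.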
+.type _x86_AES_encrypt_compact,@function
+.align 16
+_x86_AES_encrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L000loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ecx
+ roll $24,%ecx
+ xorl %esi,%ecx
+ rorl $16,%ebp
+ xorl %ebp,%ecx
+ rorl $8,%ebp
+ xorl %ebp,%ecx
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %edx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%edx
+ roll $24,%edx
+ xorl %esi,%edx
+ rorl $16,%ebp
+ xorl %ebp,%edx
+ rorl $8,%ebp
+ xorl %ebp,%edx
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %eax,%ebp
+ xorl %edi,%esi
+ xorl %esi,%eax
+ roll $24,%eax
+ xorl %esi,%eax
+ rorl $16,%ebp
+ xorl %ebp,%eax
+ rorl $8,%ebp
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ebx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ebx
+ roll $24,%ebx
+ xorl %esi,%ebx
+ rorl $16,%ebp
+ xorl %ebp,%ebx
+ rorl $8,%ebp
+ xorl %ebp,%ebx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L000loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact
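+# _sse_AES_encrypt_compact: the same compact-S-box algorithm with the
+# block state held in %mm0/%mm4 and MixColumns done with packed MMX
+# byte operations; entered only when the CPU reports SSE (see the
+# capability test in AES_encrypt below).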
+.type _sse_AES_encrypt_compact,@function
+.align 16
+_sse_AES_encrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L001loop:
+ pshufw $8,%mm0,%mm1
+ pshufw $13,%mm4,%mm5
+ movd %mm1,%eax
+ movd %mm5,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ pshufw $13,%mm0,%mm2
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ shrl $16,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ pshufw $8,%mm4,%mm6
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm2,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ movd %mm6,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm1
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ shrl $16,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ punpckldq %mm1,%mm0
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ andl $255,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $16,%eax
+ orl %eax,%edx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm4
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movd %edx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L002out
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pcmpgtb %mm0,%mm3
+ pcmpgtb %mm4,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ pshufw $177,%mm0,%mm2
+ pshufw $177,%mm4,%mm6
+ paddb %mm0,%mm0
+ paddb %mm4,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pshufw $177,%mm2,%mm3
+ pshufw $177,%mm6,%mm7
+ pxor %mm0,%mm1
+ pxor %mm4,%mm5
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm3,%mm2
+ movq %mm7,%mm6
+ pslld $8,%mm3
+ pslld $8,%mm7
+ psrld $24,%mm2
+ psrld $24,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ psrld $8,%mm1
+ psrld $8,%mm5
+ movl -128(%ebp),%eax
+ pslld $24,%mm3
+ pslld $24,%mm7
+ movl -64(%ebp),%ebx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl (%ebp),%ecx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L001loop
+.align 16
+.L002out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact
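+# _x86_AES_encrypt: full T-table rounds. Every .LAES_Te word below is
+# stored twice, giving an 8-byte stride, so the byte-rotated views of
+# an entry can be fetched directly with the 1/2/3(%ebp,%reg,8)
+# addressing used in the round loop.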
+.type _x86_AES_encrypt,@function
+.align 16
+_x86_AES_encrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L003loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl (%ebp,%esi,8),%esi
+ movzbl %ch,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movzbl %bh,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ movl (%ebp,%edx,8),%edx
+ movzbl %ah,%eax
+ xorl 3(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ xorl 1(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L003loop
+ movl %eax,%esi
+ andl $255,%esi
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %bh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %ch,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %dh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl 2(%ebp,%edx,8),%edx
+ andl $255,%edx
+ movzbl %ah,%eax
+ movl (%ebp,%eax,8),%eax
+ andl $65280,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movl (%ebp,%ebx,8),%ebx
+ andl $16711680,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movl 2(%ebp,%ecx,8),%ecx
+ andl $4278190080,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
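+# .LAES_Te: 256 T-table words, each duplicated for the 8-byte stride,
+# followed by four 256-byte copies of the forward S-box (one copy is
+# picked per call to vary cache-set usage) and the rcon constants.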
+.LAES_Te:
+.long 2774754246,2774754246
+.long 2222750968,2222750968
+.long 2574743534,2574743534
+.long 2373680118,2373680118
+.long 234025727,234025727
+.long 3177933782,3177933782
+.long 2976870366,2976870366
+.long 1422247313,1422247313
+.long 1345335392,1345335392
+.long 50397442,50397442
+.long 2842126286,2842126286
+.long 2099981142,2099981142
+.long 436141799,436141799
+.long 1658312629,1658312629
+.long 3870010189,3870010189
+.long 2591454956,2591454956
+.long 1170918031,1170918031
+.long 2642575903,2642575903
+.long 1086966153,1086966153
+.long 2273148410,2273148410
+.long 368769775,368769775
+.long 3948501426,3948501426
+.long 3376891790,3376891790
+.long 200339707,200339707
+.long 3970805057,3970805057
+.long 1742001331,1742001331
+.long 4255294047,4255294047
+.long 3937382213,3937382213
+.long 3214711843,3214711843
+.long 4154762323,4154762323
+.long 2524082916,2524082916
+.long 1539358875,1539358875
+.long 3266819957,3266819957
+.long 486407649,486407649
+.long 2928907069,2928907069
+.long 1780885068,1780885068
+.long 1513502316,1513502316
+.long 1094664062,1094664062
+.long 49805301,49805301
+.long 1338821763,1338821763
+.long 1546925160,1546925160
+.long 4104496465,4104496465
+.long 887481809,887481809
+.long 150073849,150073849
+.long 2473685474,2473685474
+.long 1943591083,1943591083
+.long 1395732834,1395732834
+.long 1058346282,1058346282
+.long 201589768,201589768
+.long 1388824469,1388824469
+.long 1696801606,1696801606
+.long 1589887901,1589887901
+.long 672667696,672667696
+.long 2711000631,2711000631
+.long 251987210,251987210
+.long 3046808111,3046808111
+.long 151455502,151455502
+.long 907153956,907153956
+.long 2608889883,2608889883
+.long 1038279391,1038279391
+.long 652995533,652995533
+.long 1764173646,1764173646
+.long 3451040383,3451040383
+.long 2675275242,2675275242
+.long 453576978,453576978
+.long 2659418909,2659418909
+.long 1949051992,1949051992
+.long 773462580,773462580
+.long 756751158,756751158
+.long 2993581788,2993581788
+.long 3998898868,3998898868
+.long 4221608027,4221608027
+.long 4132590244,4132590244
+.long 1295727478,1295727478
+.long 1641469623,1641469623
+.long 3467883389,3467883389
+.long 2066295122,2066295122
+.long 1055122397,1055122397
+.long 1898917726,1898917726
+.long 2542044179,2542044179
+.long 4115878822,4115878822
+.long 1758581177,1758581177
+.long 0,0
+.long 753790401,753790401
+.long 1612718144,1612718144
+.long 536673507,536673507
+.long 3367088505,3367088505
+.long 3982187446,3982187446
+.long 3194645204,3194645204
+.long 1187761037,1187761037
+.long 3653156455,3653156455
+.long 1262041458,1262041458
+.long 3729410708,3729410708
+.long 3561770136,3561770136
+.long 3898103984,3898103984
+.long 1255133061,1255133061
+.long 1808847035,1808847035
+.long 720367557,720367557
+.long 3853167183,3853167183
+.long 385612781,385612781
+.long 3309519750,3309519750
+.long 3612167578,3612167578
+.long 1429418854,1429418854
+.long 2491778321,2491778321
+.long 3477423498,3477423498
+.long 284817897,284817897
+.long 100794884,100794884
+.long 2172616702,2172616702
+.long 4031795360,4031795360
+.long 1144798328,1144798328
+.long 3131023141,3131023141
+.long 3819481163,3819481163
+.long 4082192802,4082192802
+.long 4272137053,4272137053
+.long 3225436288,3225436288
+.long 2324664069,2324664069
+.long 2912064063,2912064063
+.long 3164445985,3164445985
+.long 1211644016,1211644016
+.long 83228145,83228145
+.long 3753688163,3753688163
+.long 3249976951,3249976951
+.long 1977277103,1977277103
+.long 1663115586,1663115586
+.long 806359072,806359072
+.long 452984805,452984805
+.long 250868733,250868733
+.long 1842533055,1842533055
+.long 1288555905,1288555905
+.long 336333848,336333848
+.long 890442534,890442534
+.long 804056259,804056259
+.long 3781124030,3781124030
+.long 2727843637,2727843637
+.long 3427026056,3427026056
+.long 957814574,957814574
+.long 1472513171,1472513171
+.long 4071073621,4071073621
+.long 2189328124,2189328124
+.long 1195195770,1195195770
+.long 2892260552,2892260552
+.long 3881655738,3881655738
+.long 723065138,723065138
+.long 2507371494,2507371494
+.long 2690670784,2690670784
+.long 2558624025,2558624025
+.long 3511635870,3511635870
+.long 2145180835,2145180835
+.long 1713513028,1713513028
+.long 2116692564,2116692564
+.long 2878378043,2878378043
+.long 2206763019,2206763019
+.long 3393603212,3393603212
+.long 703524551,703524551
+.long 3552098411,3552098411
+.long 1007948840,1007948840
+.long 2044649127,2044649127
+.long 3797835452,3797835452
+.long 487262998,487262998
+.long 1994120109,1994120109
+.long 1004593371,1004593371
+.long 1446130276,1446130276
+.long 1312438900,1312438900
+.long 503974420,503974420
+.long 3679013266,3679013266
+.long 168166924,168166924
+.long 1814307912,1814307912
+.long 3831258296,3831258296
+.long 1573044895,1573044895
+.long 1859376061,1859376061
+.long 4021070915,4021070915
+.long 2791465668,2791465668
+.long 2828112185,2828112185
+.long 2761266481,2761266481
+.long 937747667,937747667
+.long 2339994098,2339994098
+.long 854058965,854058965
+.long 1137232011,1137232011
+.long 1496790894,1496790894
+.long 3077402074,3077402074
+.long 2358086913,2358086913
+.long 1691735473,1691735473
+.long 3528347292,3528347292
+.long 3769215305,3769215305
+.long 3027004632,3027004632
+.long 4199962284,4199962284
+.long 133494003,133494003
+.long 636152527,636152527
+.long 2942657994,2942657994
+.long 2390391540,2390391540
+.long 3920539207,3920539207
+.long 403179536,403179536
+.long 3585784431,3585784431
+.long 2289596656,2289596656
+.long 1864705354,1864705354
+.long 1915629148,1915629148
+.long 605822008,605822008
+.long 4054230615,4054230615
+.long 3350508659,3350508659
+.long 1371981463,1371981463
+.long 602466507,602466507
+.long 2094914977,2094914977
+.long 2624877800,2624877800
+.long 555687742,555687742
+.long 3712699286,3712699286
+.long 3703422305,3703422305
+.long 2257292045,2257292045
+.long 2240449039,2240449039
+.long 2423288032,2423288032
+.long 1111375484,1111375484
+.long 3300242801,3300242801
+.long 2858837708,2858837708
+.long 3628615824,3628615824
+.long 84083462,84083462
+.long 32962295,32962295
+.long 302911004,302911004
+.long 2741068226,2741068226
+.long 1597322602,1597322602
+.long 4183250862,4183250862
+.long 3501832553,3501832553
+.long 2441512471,2441512471
+.long 1489093017,1489093017
+.long 656219450,656219450
+.long 3114180135,3114180135
+.long 954327513,954327513
+.long 335083755,335083755
+.long 3013122091,3013122091
+.long 856756514,856756514
+.long 3144247762,3144247762
+.long 1893325225,1893325225
+.long 2307821063,2307821063
+.long 2811532339,2811532339
+.long 3063651117,3063651117
+.long 572399164,572399164
+.long 2458355477,2458355477
+.long 552200649,552200649
+.long 1238290055,1238290055
+.long 4283782570,4283782570
+.long 2015897680,2015897680
+.long 2061492133,2061492133
+.long 2408352771,2408352771
+.long 4171342169,4171342169
+.long 2156497161,2156497161
+.long 386731290,386731290
+.long 3669999461,3669999461
+.long 837215959,837215959
+.long 3326231172,3326231172
+.long 3093850320,3093850320
+.long 3275833730,3275833730
+.long 2962856233,2962856233
+.long 1999449434,1999449434
+.long 286199582,286199582
+.long 3417354363,3417354363
+.long 4233385128,4233385128
+.long 3602627437,3602627437
+.long 974525996,974525996
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.long 1,2,4,8
+.long 16,32,64,128
+.long 27,54,0,0
+.long 0,0,0,0
+.size _x86_AES_encrypt,.-_x86_AES_encrypt
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+AES_encrypt:
+.L_AES_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
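+# The arithmetic above 64-byte-aligns the frame and biases it against
+# the key-schedule address in %edi, apparently to limit cache-line
+# aliasing between stack and tables. The call/pop below then leaves
+# the current EIP in %ebp, from which the GOT (for OPENSSL_ia32cap_P)
+# and .LAES_Te are addressed position-independently.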
+ call .L004pic_point
+.L004pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L004pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Te-.L004pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
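+# Bit 25 of OPENSSL_ia32cap_P mirrors CPUID.1:EDX bit 25 (SSE): if
+# set, run the MMX/SSE compact path, otherwise the integer one.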
+ btl $25,(%eax)
+ jnc .L005x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L005x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_encrypt,.-.L_AES_encrypt_begin
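+# Decryption mirrors the encryption half: an integer compact routine,
+# an MMX/SSE compact routine, a full T-table routine over .LAES_Td,
+# and the public AES_decrypt dispatcher.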
+.type _x86_AES_decrypt_compact,@function
+.align 16
+_x86_AES_decrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L006loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%eax
+ subl %edi,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ecx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ecx,%ebx
+ roll $8,%ecx
+ xorl %esi,%ebp
+ xorl %eax,%ecx
+ xorl %ebp,%eax
+ roll $24,%eax
+ xorl %ebx,%ecx
+ xorl %ebp,%ebx
+ roll $16,%ebx
+ xorl %ebp,%ecx
+ roll $8,%ebp
+ xorl %eax,%ecx
+ xorl %ebx,%ecx
+ movl 4(%esp),%eax
+ xorl %ebp,%ecx
+ movl %ecx,12(%esp)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %edx,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %edx,%ecx
+ roll $8,%edx
+ xorl %esi,%ebp
+ xorl %ebx,%edx
+ xorl %ebp,%ebx
+ roll $24,%ebx
+ xorl %ecx,%edx
+ xorl %ebp,%ecx
+ roll $16,%ecx
+ xorl %ebp,%edx
+ roll $8,%ebp
+ xorl %ebx,%edx
+ xorl %ecx,%edx
+ movl 8(%esp),%ebx
+ xorl %ebp,%edx
+ movl %edx,16(%esp)
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %eax,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %eax,%edx
+ roll $8,%eax
+ xorl %esi,%ebp
+ xorl %ecx,%eax
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%eax
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%eax
+ roll $8,%ebp
+ xorl %ecx,%eax
+ xorl %edx,%eax
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ebx,%edx
+ roll $8,%ebx
+ xorl %esi,%ebp
+ xorl %ecx,%ebx
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%ebx
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%ebx
+ roll $8,%ebp
+ xorl %ecx,%ebx
+ xorl %edx,%ebx
+ movl 12(%esp),%ecx
+ xorl %ebp,%ebx
+ movl 16(%esp),%edx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L006loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact
+.type _sse_AES_decrypt_compact,@function
+.align 16
+_sse_AES_decrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L007loop:
+ pshufw $12,%mm0,%mm1
+ movd %mm1,%eax
+ pshufw $9,%mm4,%mm5
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm5,%ebx
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ pshufw $6,%mm0,%mm2
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ pshufw $3,%mm4,%mm6
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movd %mm2,%eax
+ movzbl -128(%ebp,%esi,1),%ecx
+ shll $16,%ecx
+ movzbl %bl,%esi
+ movd %mm6,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%ecx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%edx
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm1
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%edx
+ shll $8,%edx
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ punpckldq %mm1,%mm0
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm4
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ orl %eax,%ecx
+ movd %ecx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L008out
+ movq %mm0,%mm3
+ movq %mm4,%mm7
+ pshufw $228,%mm0,%mm2
+ pshufw $228,%mm4,%mm6
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pshufw $177,%mm0,%mm0
+ pshufw $177,%mm4,%mm4
+ pslld $8,%mm2
+ pslld $8,%mm6
+ psrld $8,%mm3
+ psrld $8,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pslld $16,%mm2
+ pslld $16,%mm6
+ psrld $16,%mm3
+ psrld $16,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movq 8(%esp),%mm3
+ pxor %mm2,%mm2
+ pxor %mm6,%mm6
+ pcmpgtb %mm1,%mm2
+ pcmpgtb %mm5,%mm6
+ pand %mm3,%mm2
+ pand %mm3,%mm6
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm2,%mm1
+ pxor %mm6,%mm5
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq %mm1,%mm2
+ movq %mm5,%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pslld $24,%mm3
+ pslld $24,%mm7
+ psrld $8,%mm2
+ psrld $8,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pshufw $177,%mm1,%mm3
+ pshufw $177,%mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ pshufw $177,%mm1,%mm2
+ pshufw $177,%mm5,%mm6
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pslld $8,%mm1
+ pslld $8,%mm5
+ psrld $8,%mm3
+ psrld $8,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl -128(%ebp),%eax
+ pslld $16,%mm1
+ pslld $16,%mm5
+ movl -64(%ebp),%ebx
+ psrld $16,%mm3
+ psrld $16,%mm7
+ movl (%ebp),%ecx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L007loop
+.align 16
+.L008out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact
+.type _x86_AES_decrypt,@function
+.align 16
+_x86_AES_decrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L009loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %ah,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl (%ebp,%edx,8),%edx
+ movzbl %ch,%ecx
+ xorl 3(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ xorl 1(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L009loop
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+ leal -128(%ebp),%ebp
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl (%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl (%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl (%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl (%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ leal -2048(%ebp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
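+# .LAES_Td: duplicated inverse T-table words, followed by four
+# 256-byte copies of the inverse S-box (no rcon needed here).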
+.LAES_Td:
+.long 1353184337,1353184337
+.long 1399144830,1399144830
+.long 3282310938,3282310938
+.long 2522752826,2522752826
+.long 3412831035,3412831035
+.long 4047871263,4047871263
+.long 2874735276,2874735276
+.long 2466505547,2466505547
+.long 1442459680,1442459680
+.long 4134368941,4134368941
+.long 2440481928,2440481928
+.long 625738485,625738485
+.long 4242007375,4242007375
+.long 3620416197,3620416197
+.long 2151953702,2151953702
+.long 2409849525,2409849525
+.long 1230680542,1230680542
+.long 1729870373,1729870373
+.long 2551114309,2551114309
+.long 3787521629,3787521629
+.long 41234371,41234371
+.long 317738113,317738113
+.long 2744600205,2744600205
+.long 3338261355,3338261355
+.long 3881799427,3881799427
+.long 2510066197,2510066197
+.long 3950669247,3950669247
+.long 3663286933,3663286933
+.long 763608788,763608788
+.long 3542185048,3542185048
+.long 694804553,694804553
+.long 1154009486,1154009486
+.long 1787413109,1787413109
+.long 2021232372,2021232372
+.long 1799248025,1799248025
+.long 3715217703,3715217703
+.long 3058688446,3058688446
+.long 397248752,397248752
+.long 1722556617,1722556617
+.long 3023752829,3023752829
+.long 407560035,407560035
+.long 2184256229,2184256229
+.long 1613975959,1613975959
+.long 1165972322,1165972322
+.long 3765920945,3765920945
+.long 2226023355,2226023355
+.long 480281086,480281086
+.long 2485848313,2485848313
+.long 1483229296,1483229296
+.long 436028815,436028815
+.long 2272059028,2272059028
+.long 3086515026,3086515026
+.long 601060267,601060267
+.long 3791801202,3791801202
+.long 1468997603,1468997603
+.long 715871590,715871590
+.long 120122290,120122290
+.long 63092015,63092015
+.long 2591802758,2591802758
+.long 2768779219,2768779219
+.long 4068943920,4068943920
+.long 2997206819,2997206819
+.long 3127509762,3127509762
+.long 1552029421,1552029421
+.long 723308426,723308426
+.long 2461301159,2461301159
+.long 4042393587,4042393587
+.long 2715969870,2715969870
+.long 3455375973,3455375973
+.long 3586000134,3586000134
+.long 526529745,526529745
+.long 2331944644,2331944644
+.long 2639474228,2639474228
+.long 2689987490,2689987490
+.long 853641733,853641733
+.long 1978398372,1978398372
+.long 971801355,971801355
+.long 2867814464,2867814464
+.long 111112542,111112542
+.long 1360031421,1360031421
+.long 4186579262,4186579262
+.long 1023860118,1023860118
+.long 2919579357,2919579357
+.long 1186850381,1186850381
+.long 3045938321,3045938321
+.long 90031217,90031217
+.long 1876166148,1876166148
+.long 4279586912,4279586912
+.long 620468249,620468249
+.long 2548678102,2548678102
+.long 3426959497,3426959497
+.long 2006899047,2006899047
+.long 3175278768,3175278768
+.long 2290845959,2290845959
+.long 945494503,945494503
+.long 3689859193,3689859193
+.long 1191869601,1191869601
+.long 3910091388,3910091388
+.long 3374220536,3374220536
+.long 0,0
+.long 2206629897,2206629897
+.long 1223502642,1223502642
+.long 2893025566,2893025566
+.long 1316117100,1316117100
+.long 4227796733,4227796733
+.long 1446544655,1446544655
+.long 517320253,517320253
+.long 658058550,658058550
+.long 1691946762,1691946762
+.long 564550760,564550760
+.long 3511966619,3511966619
+.long 976107044,976107044
+.long 2976320012,2976320012
+.long 266819475,266819475
+.long 3533106868,3533106868
+.long 2660342555,2660342555
+.long 1338359936,1338359936
+.long 2720062561,2720062561
+.long 1766553434,1766553434
+.long 370807324,370807324
+.long 179999714,179999714
+.long 3844776128,3844776128
+.long 1138762300,1138762300
+.long 488053522,488053522
+.long 185403662,185403662
+.long 2915535858,2915535858
+.long 3114841645,3114841645
+.long 3366526484,3366526484
+.long 2233069911,2233069911
+.long 1275557295,1275557295
+.long 3151862254,3151862254
+.long 4250959779,4250959779
+.long 2670068215,2670068215
+.long 3170202204,3170202204
+.long 3309004356,3309004356
+.long 880737115,880737115
+.long 1982415755,1982415755
+.long 3703972811,3703972811
+.long 1761406390,1761406390
+.long 1676797112,1676797112
+.long 3403428311,3403428311
+.long 277177154,277177154
+.long 1076008723,1076008723
+.long 538035844,538035844
+.long 2099530373,2099530373
+.long 4164795346,4164795346
+.long 288553390,288553390
+.long 1839278535,1839278535
+.long 1261411869,1261411869
+.long 4080055004,4080055004
+.long 3964831245,3964831245
+.long 3504587127,3504587127
+.long 1813426987,1813426987
+.long 2579067049,2579067049
+.long 4199060497,4199060497
+.long 577038663,577038663
+.long 3297574056,3297574056
+.long 440397984,440397984
+.long 3626794326,3626794326
+.long 4019204898,4019204898
+.long 3343796615,3343796615
+.long 3251714265,3251714265
+.long 4272081548,4272081548
+.long 906744984,906744984
+.long 3481400742,3481400742
+.long 685669029,685669029
+.long 646887386,646887386
+.long 2764025151,2764025151
+.long 3835509292,3835509292
+.long 227702864,227702864
+.long 2613862250,2613862250
+.long 1648787028,1648787028
+.long 3256061430,3256061430
+.long 3904428176,3904428176
+.long 1593260334,1593260334
+.long 4121936770,4121936770
+.long 3196083615,3196083615
+.long 2090061929,2090061929
+.long 2838353263,2838353263
+.long 3004310991,3004310991
+.long 999926984,999926984
+.long 2809993232,2809993232
+.long 1852021992,1852021992
+.long 2075868123,2075868123
+.long 158869197,158869197
+.long 4095236462,4095236462
+.long 28809964,28809964
+.long 2828685187,2828685187
+.long 1701746150,1701746150
+.long 2129067946,2129067946
+.long 147831841,147831841
+.long 3873969647,3873969647
+.long 3650873274,3650873274
+.long 3459673930,3459673930
+.long 3557400554,3557400554
+.long 3598495785,3598495785
+.long 2947720241,2947720241
+.long 824393514,824393514
+.long 815048134,815048134
+.long 3227951669,3227951669
+.long 935087732,935087732
+.long 2798289660,2798289660
+.long 2966458592,2966458592
+.long 366520115,366520115
+.long 1251476721,1251476721
+.long 4158319681,4158319681
+.long 240176511,240176511
+.long 804688151,804688151
+.long 2379631990,2379631990
+.long 1303441219,1303441219
+.long 1414376140,1414376140
+.long 3741619940,3741619940
+.long 3820343710,3820343710
+.long 461924940,461924940
+.long 3089050817,3089050817
+.long 2136040774,2136040774
+.long 82468509,82468509
+.long 1563790337,1563790337
+.long 1937016826,1937016826
+.long 776014843,776014843
+.long 1511876531,1511876531
+.long 1389550482,1389550482
+.long 861278441,861278441
+.long 323475053,323475053
+.long 2355222426,2355222426
+.long 2047648055,2047648055
+.long 2383738969,2383738969
+.long 2302415851,2302415851
+.long 3995576782,3995576782
+.long 902390199,902390199
+.long 3991215329,3991215329
+.long 1018251130,1018251130
+.long 1507840668,1507840668
+.long 1064563285,1064563285
+.long 2043548696,2043548696
+.long 3208103795,3208103795
+.long 3939366739,3939366739
+.long 1537932639,1537932639
+.long 342834655,342834655
+.long 2262516856,2262516856
+.long 2180231114,2180231114
+.long 1053059257,1053059257
+.long 741614648,741614648
+.long 1598071746,1598071746
+.long 1925389590,1925389590
+.long 203809468,203809468
+.long 2336832552,2336832552
+.long 1100287487,1100287487
+.long 1895934009,1895934009
+.long 3736275976,3736275976
+.long 2632234200,2632234200
+.long 2428589668,2428589668
+.long 1636092795,1636092795
+.long 1890988757,1890988757
+.long 1952214088,1952214088
+.long 1113045200,1113045200
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.size _x86_AES_decrypt,.-_x86_AES_decrypt
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+AES_decrypt:
+.L_AES_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
+ call .L010pic_point
+.L010pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L010pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Td-.L010pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ btl $25,(%eax)
+ jnc .L011x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L011x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_decrypt,.-.L_AES_decrypt_begin
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+AES_cbc_encrypt:
+.L_AES_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ecx
+ cmpl $0,%ecx
+ je .L012drop_out
+ call .L013pic_point
+.L013pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L013pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ cmpl $0,40(%esp)
+ leal .LAES_Te-.L013pic_point(%ebp),%ebp
+ jne .L014picked_te
+ leal .LAES_Td-.LAES_Te(%ebp),%ebp
+.L014picked_te:
+ pushfl
+ cld
+ cmpl $512,%ecx
+ jb .L015slow_way
+ testl $15,%ecx
+ jnz .L015slow_way
+ btl $28,(%eax)
+ jc .L015slow_way
+ leal -324(%esp),%esi
+ andl $-64,%esi
+ movl %ebp,%eax
+ leal 2304(%ebp),%ebx
+ movl %esi,%edx
+ andl $4095,%eax
+ andl $4095,%ebx
+ andl $4095,%edx
+ cmpl %ebx,%edx
+ jb .L016tbl_break_out
+ subl %ebx,%edx
+ subl %edx,%esi
+ jmp .L017tbl_ok
+.align 4
+.L016tbl_break_out:
+ subl %eax,%edx
+ andl $4095,%edx
+ addl $384,%edx
+ subl %edx,%esi
+.align 4
+.L017tbl_ok:
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 12(%edx),%edi
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl $0,316(%esp)
+ movl %edi,%ebx
+ movl $61,%ecx
+ subl %ebp,%ebx
+ movl %edi,%esi
+ andl $4095,%ebx
+ leal 76(%esp),%edi
+ cmpl $2304,%ebx
+ jb .L018do_copy
+ cmpl $3852,%ebx
+ jb .L019skip_copy
+.align 4
+.L018do_copy:
+ movl %edi,44(%esp)
+.long 2784229001
+.L019skip_copy:
+ movl $16,%edi
+.align 4
+.L020prefetch_tbl:
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%esi
+ leal 128(%ebp),%ebp
+ subl $1,%edi
+ jnz .L020prefetch_tbl
+ subl $2048,%ebp
+ movl 32(%esp),%esi
+ movl 48(%esp),%edi
+ cmpl $0,%edx
+ je .L021fast_decrypt
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 16
+.L022fast_enc_loop:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ leal 16(%esi),%esi
+ movl 40(%esp),%ecx
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L022fast_enc_loop
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L023skip_ezero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L023skip_ezero:
+ movl 28(%esp),%esp
+ popfl
+.L012drop_out:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L021fast_decrypt:
+ cmpl 36(%esp),%esi
+ je .L024fast_dec_in_place
+ movl %edi,52(%esp)
+.align 4
+.align 16
+.L025fast_dec_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 52(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 36(%esp),%edi
+ movl 32(%esp),%esi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl %esi,52(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edi
+ movl %edi,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L025fast_dec_loop
+ movl 52(%esp),%edi
+ movl 48(%esp),%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ jmp .L026fast_dec_out
+.align 16
+.L024fast_dec_in_place:
+.L027fast_dec_in_place_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 48(%esp),%edi
+ movl 36(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L027fast_dec_in_place_loop
+.align 4
+.L026fast_dec_out:
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L028skip_dzero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L028skip_dzero:
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L015slow_way:
+ movl (%eax),%eax
+ movl 36(%esp),%edi
+ leal -80(%esp),%esi
+ andl $-64,%esi
+ leal -143(%edi),%ebx
+ subl %esi,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esi
+ leal 768(%esi),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl %eax,52(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl %esi,%edi
+ movl %eax,%esi
+ cmpl $0,%edx
+ je .L029slow_decrypt
+ cmpl $16,%ecx
+ movl %ebx,%edx
+ jb .L030slow_enc_tail
+ btl $25,52(%esp)
+ jnc .L031slow_enc_x86
+ movq (%edi),%mm0
+ movq 8(%edi),%mm4
+.align 16
+.L032slow_enc_loop_sse:
+ pxor (%esi),%mm0
+ pxor 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl 40(%esp),%ecx
+ movq %mm0,(%edi)
+ movq %mm4,8(%edi)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L032slow_enc_loop_sse
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L031slow_enc_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 4
+.L033slow_enc_loop_x86:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L033slow_enc_loop_x86
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L030slow_enc_tail:
+ emms
+ movl %edx,%edi
+ movl $16,%ebx
+ subl %ecx,%ebx
+ cmpl %esi,%edi
+ je .L034enc_in_place
+.align 4
+.long 2767451785
+ jmp .L035enc_skip_in_place
+.L034enc_in_place:
+ leal (%edi,%ecx,1),%edi
+.L035enc_skip_in_place:
+ movl %ebx,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2868115081
+ movl 48(%esp),%edi
+ movl %edx,%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl $16,40(%esp)
+ jmp .L033slow_enc_loop_x86
+.align 16
+.L029slow_decrypt:
+ btl $25,52(%esp)
+ jnc .L036slow_dec_loop_x86
+.align 4
+.L037slow_dec_loop_sse:
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_decrypt_compact
+ movl 32(%esp),%esi
+ leal 60(%esp),%eax
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl 48(%esp),%edi
+ movq (%esi),%mm1
+ movq 8(%esi),%mm5
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movq %mm1,(%edi)
+ movq %mm5,8(%edi)
+ subl $16,%ecx
+ jc .L038slow_dec_partial_sse
+ movq %mm0,(%ebx)
+ movq %mm4,8(%ebx)
+ leal 16(%ebx),%ebx
+ movl %ebx,36(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ movl %ecx,40(%esp)
+ jnz .L037slow_dec_loop_sse
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L038slow_dec_partial_sse:
+ movq %mm0,(%eax)
+ movq %mm4,8(%eax)
+ emms
+ addl $16,%ecx
+ movl %ebx,%edi
+ movl %eax,%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L036slow_dec_loop_x86:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt_compact
+ movl 48(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ subl $16,%esi
+ jc .L039slow_dec_partial_x86
+ movl %esi,40(%esp)
+ movl 36(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ jnz .L036slow_dec_loop_x86
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L039slow_dec_partial_x86:
+ leal 60(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 32(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl 36(%esp),%edi
+ leal 60(%esp),%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin
+.type _x86_AES_set_encrypt_key,@function
+.align 16
+_x86_AES_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%esi
+ movl 32(%esp),%edi
+ testl $-1,%esi
+ jz .L040badpointer
+ testl $-1,%edi
+ jz .L040badpointer
+ call .L041pic_point
+.L041pic_point:
+ popl %ebp
+ leal .LAES_Te-.L041pic_point(%ebp),%ebp
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+ movl 28(%esp),%ecx
+ cmpl $128,%ecx
+ je .L04210rounds
+ cmpl $192,%ecx
+ je .L04312rounds
+ cmpl $256,%ecx
+ je .L04414rounds
+ movl $-2,%eax
+ jmp .L045exit
+.L04210rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ xorl %ecx,%ecx
+ jmp .L04610shortcut
+.align 4
+.L04710loop:
+ movl (%edi),%eax
+ movl 12(%edi),%edx
+.L04610shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,16(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,20(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,24(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,28(%edi)
+ incl %ecx
+ addl $16,%edi
+ cmpl $10,%ecx
+ jl .L04710loop
+ movl $10,80(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04312rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%ecx
+ movl 20(%esi),%edx
+ movl %ecx,16(%edi)
+ movl %edx,20(%edi)
+ xorl %ecx,%ecx
+ jmp .L04812shortcut
+.align 4
+.L04912loop:
+ movl (%edi),%eax
+ movl 20(%edi),%edx
+.L04812shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,24(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,28(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,32(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,36(%edi)
+ cmpl $7,%ecx
+ je .L05012break
+ incl %ecx
+ xorl 16(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,44(%edi)
+ addl $24,%edi
+ jmp .L04912loop
+.L05012break:
+ movl $12,72(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04414rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,16(%edi)
+ movl %ebx,20(%edi)
+ movl %ecx,24(%edi)
+ movl %edx,28(%edi)
+ xorl %ecx,%ecx
+ jmp .L05114shortcut
+.align 4
+.L05214loop:
+ movl 28(%edi),%edx
+.L05114shortcut:
+ movl (%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,32(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,36(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,44(%edi)
+ cmpl $6,%ecx
+ je .L05314break
+ incl %ecx
+ movl %eax,%edx
+ movl 16(%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movl %eax,48(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,52(%edi)
+ xorl 24(%edi),%eax
+ movl %eax,56(%edi)
+ xorl 28(%edi),%eax
+ movl %eax,60(%edi)
+ addl $32,%edi
+ jmp .L05214loop
+.L05314break:
+ movl $14,48(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L040badpointer:
+ movl $-1,%eax
+.L045exit:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+.L_private_AES_set_encrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ ret
+.size private_AES_set_encrypt_key,.-.L_private_AES_set_encrypt_key_begin
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+.L_private_AES_set_decrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ cmpl $0,%eax
+ je .L054proceed
+ ret
+.L054proceed:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%esi
+ movl 240(%esi),%ecx
+ leal (,%ecx,4),%ecx
+ leal (%esi,%ecx,4),%edi
+.align 4
+.L055invert:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl (%edi),%ecx
+ movl 4(%edi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,(%esi)
+ movl %edx,4(%esi)
+ movl 8(%esi),%eax
+ movl 12(%esi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,8(%edi)
+ movl %ebx,12(%edi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ addl $16,%esi
+ subl $16,%edi
+ cmpl %edi,%esi
+ jne .L055invert
+ movl 28(%esp),%edi
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,28(%esp)
+ movl 16(%edi),%eax
+.align 4
+.L056permute:
+ addl $16,%edi
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %eax,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ xorl %eax,%ecx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ roll $8,%eax
+ xorl %esi,%edx
+ movl 4(%edi),%ebp
+ xorl %ebx,%eax
+ xorl %edx,%ebx
+ xorl %ecx,%eax
+ roll $24,%ebx
+ xorl %edx,%ecx
+ xorl %edx,%eax
+ roll $16,%ecx
+ xorl %ebx,%eax
+ roll $8,%edx
+ xorl %ecx,%eax
+ movl %ebp,%ebx
+ xorl %edx,%eax
+ movl %eax,(%edi)
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ xorl %ebx,%edx
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ roll $8,%ebx
+ xorl %esi,%eax
+ movl 8(%edi),%ebp
+ xorl %ecx,%ebx
+ xorl %eax,%ecx
+ xorl %edx,%ebx
+ roll $24,%ecx
+ xorl %eax,%edx
+ xorl %eax,%ebx
+ roll $16,%edx
+ xorl %ecx,%ebx
+ roll $8,%eax
+ xorl %edx,%ebx
+ movl %ebp,%ecx
+ xorl %eax,%ebx
+ movl %ebx,4(%edi)
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %ecx,%edx
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ xorl %ecx,%eax
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ roll $8,%ecx
+ xorl %esi,%ebx
+ movl 12(%edi),%ebp
+ xorl %edx,%ecx
+ xorl %ebx,%edx
+ xorl %eax,%ecx
+ roll $24,%edx
+ xorl %ebx,%eax
+ xorl %ebx,%ecx
+ roll $16,%eax
+ xorl %edx,%ecx
+ roll $8,%ebx
+ xorl %eax,%ecx
+ movl %ebp,%edx
+ xorl %ebx,%ecx
+ movl %ecx,8(%edi)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %edx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ xorl %edx,%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ roll $8,%edx
+ xorl %esi,%ecx
+ movl 16(%edi),%ebp
+ xorl %eax,%edx
+ xorl %ecx,%eax
+ xorl %ebx,%edx
+ roll $24,%eax
+ xorl %ecx,%ebx
+ xorl %ecx,%edx
+ roll $16,%ebx
+ xorl %eax,%edx
+ roll $8,%ecx
+ xorl %ebx,%edx
+ movl %ebp,%eax
+ xorl %ecx,%edx
+ movl %edx,12(%edi)
+ cmpl 28(%esp),%edi
+ jb .L056permute
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size private_AES_set_decrypt_key,.-.L_private_AES_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89
+.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114
+.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,8,4
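
The .L056permute loop above turns an encryption key schedule into a decryption one by applying InvMixColumns to each round key, four bytes at a time. Its magic constants 2155905152, 4278124286 and 454761243 are 0x80808080, 0xfefefefe and 0x1b1b1b1b: together they implement a GF(2^8) doubling (xtime) on four packed bytes per 32-bit register. A minimal C sketch of that trick, for orientation only (the helper name is ours, not OpenSSL's):

    #include <stdint.h>

    /* Double four GF(2^8) field elements packed into one 32-bit word.
     * Per byte: xtime(x) = (x << 1) ^ (0x1b if x's high bit was set).
     * This mirrors the and/shr/lea/sub/and/xor sequence in .L056permute. */
    static uint32_t xtime4(uint32_t x)
    {
        uint32_t hi = x & 0x80808080u;       /* high bit of each byte */
        uint32_t lo = (x + x) & 0xfefefefeu; /* shift each byte left by one */
        /* hi - (hi >> 7) turns 0x80 into 0x7f in every byte whose high bit
         * was set; masking with 0x1b1b1b1b leaves the AES reduction
         * polynomial 0x1b exactly in those bytes. */
        return lo ^ ((hi - (hi >> 7)) & 0x1b1b1b1bu);
    }

Three chained applications give multiplication by 2, 4 and 8; the loop then assembles InvMixColumns' 9, 11, 13 and 14 multipliers from those with XORs and the roll-by-8/16/24 rotations visible above.
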
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
old mode 100755
new mode 100644
index aab40e6..51b500d
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -39,13 +39,13 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
@@ -223,7 +223,7 @@
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
- # recent µ-archs], but ~5 times smaller!
+ # recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
@@ -562,7 +562,7 @@ ()
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
@@ -2854,12 +2854,12 @@ ()
&set_label("exit");
&function_end("_x86_AES_set_encrypt_key");
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_encrypt_key");
+&function_begin_B("private_AES_set_encrypt_key");
&call ("_x86_AES_set_encrypt_key");
&ret ();
-&function_end_B("AES_set_encrypt_key");
+&function_end_B("private_AES_set_encrypt_key");
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2916,9 +2916,9 @@ ()
&mov (&DWP(4*$i,$key),$tp1);
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_decrypt_key");
+&function_begin_B("private_AES_set_decrypt_key");
&call ("_x86_AES_set_encrypt_key");
&cmp ("eax",0);
&je (&label("proceed"));
@@ -2974,7 +2974,7 @@ ()
&jb (&label("permute"));
&xor ("eax","eax"); # return success
-&function_end("AES_set_decrypt_key");
+&function_end("private_AES_set_decrypt_key");
&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
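
The hunks above rename the key-schedule entry points to private_AES_set_{en,de}crypt_key. The public names are not lost: in OpenSSL 1.0.1 they survive as thin C wrappers in the generic AES glue, so existing callers are unaffected. A hedged sketch of the wrapper shape (the real tree adds FIPS-mode checks; this is illustrative, not the verbatim source):

    #include <openssl/aes.h>

    /* Declarations of the renamed entry points introduced by this patch
     * (normally provided by the OpenSSL headers in FIPS-capable builds). */
    int private_AES_set_encrypt_key(const unsigned char *userKey,
                                    const int bits, AES_KEY *key);
    int private_AES_set_decrypt_key(const unsigned char *userKey,
                                    const int bits, AES_KEY *key);

    /* Illustrative only: the public entry points forward to the private_
     * ones so FIPS-validated builds can interpose their own versions. */
    int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
                            AES_KEY *key)
    {
        return private_AES_set_encrypt_key(userKey, bits, key);
    }

    int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
                            AES_KEY *key)
    {
        return private_AES_set_decrypt_key(userKey, bits, key);
    }
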
diff --git a/crypto/aes/asm/aes-armv4.s b/crypto/aes/asm/aes-armv4.S
similarity index 93%
rename from crypto/aes/asm/aes-armv4.s
rename to crypto/aes/asm/aes-armv4.S
index 27c681c..2697d4c 100644
--- a/crypto/aes/asm/aes-armv4.s
+++ b/crypto/aes/asm/aes-armv4.S
@@ -1,3 +1,4 @@
+#include "arm_arch.h"
.text
.code 32
@@ -118,7 +119,7 @@ AES_encrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -147,10 +148,33 @@ AES_encrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -179,11 +203,15 @@ AES_encrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -223,11 +251,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r6,r9,ror#8
and r9,lr,r2
- eor r1,r1,r4,ror#24
ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8]
+ eor r1,r1,r4,ror#24
+ ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24]
@@ -236,16 +264,16 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r6,r9,ror#16
and r9,lr,r3,lsr#16 @ i2
- eor r2,r2,r5,ror#16
ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0]
+ eor r2,r2,r5,ror#16
+ ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16]
eor r0,r0,r7,ror#24
- ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
- eor r1,r1,r8,ror#16
ldr r7,[r11],#16
+ eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
eor r2,r2,r9,ror#8
ldr r4,[r11,#-12]
eor r3,r3,r6,ror#8
@@ -285,11 +313,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r2
- eor r1,r4,r1,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8]
+ eor r1,r4,r1,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
mov r2,r2,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0]
eor r0,r7,r0,lsl#8
ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24]
@@ -298,15 +326,15 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r3,lsr#16 @ i2
- eor r2,r5,r2,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0]
+ eor r2,r5,r2,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
mov r3,r3,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16]
eor r0,r7,r0,lsl#8
- ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
ldr r7,[r11,#0]
+ ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
eor r1,r1,r8,lsl#8
ldr r4,[r11,#4]
eor r2,r2,r9,lsl#16
@@ -323,10 +351,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -344,12 +373,13 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub r10,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov r12,r0 @ inp
mov lr,r1 @ bits
mov r11,r2 @ key
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -382,6 +412,22 @@ AES_set_encrypt_key:
orr r3,r3,r6,lsl#24
str r2,[r11,#-8]
str r3,[r11,#-4]
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r11],#16
+ str r1,[r11,#-12]
+ str r2,[r11,#-8]
+ str r3,[r11,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -418,6 +464,7 @@ AES_set_encrypt_key:
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#19]
ldrb r4,[r12,#18]
ldrb r5,[r12,#17]
@@ -434,6 +481,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#16]
+ ldr r9,[r12,#20]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -478,6 +535,7 @@ AES_set_encrypt_key:
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#27]
ldrb r4,[r12,#26]
ldrb r5,[r12,#25]
@@ -494,6 +552,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#24]
+ ldr r9,[r12,#28]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
mov r12,#14
str r12,[r11,#240-32]
@@ -558,14 +626,14 @@ AES_set_encrypt_key:
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -639,11 +707,15 @@ AES_set_decrypt_key:
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -758,7 +830,7 @@ AES_decrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -787,10 +859,33 @@ AES_decrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -819,11 +914,15 @@ AES_decrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -863,11 +962,11 @@ _armv4_AES_decrypt:
and r8,lr,r2 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r2,lsr#16
- eor r1,r1,r4,ror#8
ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8]
+ eor r1,r1,r4,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24]
@@ -876,22 +975,22 @@ _armv4_AES_decrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r3 @ i2
- eor r2,r2,r5,ror#8
ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16]
+ eor r2,r2,r5,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0]
eor r0,r0,r7,ror#8
- ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
+ ldr r7,[r11],#16
eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
eor r2,r2,r9,ror#24
- ldr r7,[r11],#16
- eor r3,r3,r6,ror#8
ldr r4,[r11,#-12]
- ldr r5,[r11,#-8]
eor r0,r0,r7
+ ldr r5,[r11,#-8]
+ eor r3,r3,r6,ror#8
ldr r6,[r11,#-4]
and r7,lr,r0,lsr#16
eor r1,r1,r4
@@ -932,11 +1031,11 @@ _armv4_AES_decrypt:
and r7,lr,r2,lsr#8 @ i0
eor r5,r5,r8,lsl#8
and r8,lr,r2 @ i1
- eor r6,r6,r9,lsl#8
ldrb r7,[r10,r7] @ Td4[s2>>8]
+ eor r6,r6,r9,lsl#8
+ ldrb r8,[r10,r8] @ Td4[s2>>0]
and r9,lr,r2,lsr#16
- ldrb r8,[r10,r8] @ Td4[s2>>0]
ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24]
eor r0,r0,r7,lsl#8
ldrb r9,[r10,r9] @ Td4[s2>>16]
@@ -944,11 +1043,11 @@ _armv4_AES_decrypt:
and r7,lr,r3,lsr#16 @ i0
eor r2,r5,r2,lsl#16
and r8,lr,r3,lsr#8 @ i1
- eor r6,r6,r9,lsl#16
ldrb r7,[r10,r7] @ Td4[s3>>16]
+ eor r6,r6,r9,lsl#16
+ ldrb r8,[r10,r8] @ Td4[s3>>8]
and r9,lr,r3 @ i2
- ldrb r8,[r10,r8] @ Td4[s3>>8]
ldrb r9,[r10,r9] @ Td4[s3>>0]
ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24]
eor r0,r0,r7,lsl#16
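
The recurring #if __ARM_ARCH__<7 hunks in this file keep the old byte-at-a-time, endian-neutral loads and stores for ARMv4/v5, and on ARMv7 switch to whole-word ldr/str plus a rev byte swap under __ARMEL__, since v7-class cores handle the word accesses and have the rev instruction. In C the two strategies look roughly like this (a sketch; __builtin_bswap32 stands in for rev):

    #include <stdint.h>
    #include <string.h>

    /* Pre-ARMv7 path: build a big-endian word from individual bytes.
     * Works for any alignment and either endianness, like the
     * ldrb/orr sequences this patch keeps under __ARM_ARCH__<7. */
    static uint32_t load_be32_bytes(const unsigned char *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
    }

    /* ARMv7-class path: one word load, then a byte swap on little-endian
     * targets -- the ldr + rev pairing the patch adds. */
    static uint32_t load_be32_word(const unsigned char *p)
    {
        uint32_t w;
        memcpy(&w, p, 4);               /* compiles to a single word load */
    #ifdef __ARMEL__
        w = __builtin_bswap32(w);       /* compiles to rev */
    #endif
        return w;
    }
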
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index c51ee1f..86b86c4 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -27,6 +27,11 @@
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~21.5 cycles per byte.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -46,6 +51,7 @@
$rounds="r12";
$code=<<___;
+#include "arm_arch.h"
.text
.code 32
@@ -166,7 +172,7 @@
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -195,10 +201,33 @@
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -227,11 +256,15 @@
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
- eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
+ eor $s1,$s1,$t1,ror#24
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@@ -284,16 +317,16 @@
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
+ eor $s2,$s2,$t2,ror#16
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24
- ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
- eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
+ eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
- eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
+ eor $s1,$t1,$s1,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@@ -346,15 +379,15 @@
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
+ eor $s2,$t2,$s2,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8
- ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
+ ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
@@ -371,10 +404,11 @@
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -392,12 +426,13 @@
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov $rounds,r0 @ inp
mov lr,r1 @ bits
mov $key,r2 @ key
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -430,6 +465,22 @@
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$key],#16
+ str $s1,[$key,#-12]
+ str $s2,[$key,#-8]
+ str $s3,[$key,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -466,6 +517,7 @@
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17]
@@ -482,6 +534,16 @@
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#16]
+ ldr $i3,[$rounds,#20]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -526,6 +588,7 @@
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25]
@@ -542,6 +605,16 @@
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#24]
+ ldr $i3,[$rounds,#28]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
mov $rounds,#14
str $rounds,[$key,#240-32]
@@ -606,14 +679,14 @@
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -692,11 +765,15 @@
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -811,7 +888,7 @@
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -840,10 +917,33 @@
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -872,11 +972,15 @@
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -916,11 +1020,11 @@
and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
- eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
+ eor $s1,$s1,$t1,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@@ -929,22 +1033,22 @@
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
- eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
+ eor $s2,$s2,$t2,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8
- ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
+ ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
- ldr $i1,[$key],#16
- eor $s3,$s3,$t3,ror#8
ldr $t1,[$key,#-12]
- ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
+ ldr $t2,[$key,#-8]
+ eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1
@@ -985,11 +1089,11 @@
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
- eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
+ eor $t3,$t3,$i3,lsl#8
+ ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@@ -997,11 +1101,11 @@
and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
- eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
+ eor $t3,$t3,$i3,lsl#16
+ ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
- ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
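
For all the per-architecture churn in this patch, the public one-block API is untouched; key setup still goes through the public names, which now land in the private_ routines underneath. A minimal round-trip caller (the key and plaintext here are arbitrary test values):

    #include <string.h>
    #include <openssl/aes.h>

    int main(void)
    {
        static const unsigned char key[16] = "0123456789abcdef"; /* test key */
        unsigned char in[16] = "one block of in";                /* 16 bytes */
        unsigned char out[16], back[16];
        AES_KEY enc, dec;

        /* Public entry points; the patch reroutes them internally to
         * private_AES_set_{en,de}crypt_key. Both return 0 on success. */
        if (AES_set_encrypt_key(key, 128, &enc) != 0 ||
            AES_set_decrypt_key(key, 128, &dec) != 0)
            return 1;

        AES_encrypt(in, out, &enc);   /* exactly one 16-byte block */
        AES_decrypt(out, back, &dec);

        return memcmp(in, back, 16) != 0;  /* 0 on a clean round trip */
    }
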
diff --git a/crypto/aes/asm/aes-mips.s b/crypto/aes/asm/aes-mips.S
similarity index 98%
rename from crypto/aes/asm/aes-mips.s
rename to crypto/aes/asm/aes-mips.S
index 0c1d85a..f5750bf 100644
--- a/crypto/aes/asm/aes-mips.s
+++ b/crypto/aes/asm/aes-mips.S
@@ -259,6 +259,7 @@ AES_encrypt:
.frame $29,64,$31
.mask 3237937152,-4
.set noreorder
+ .cpload $25
sub $29,64
sw $31,64-1*4($29)
sw $30,64-2*4($29)
@@ -270,8 +271,6 @@ AES_encrypt:
sw $18,64-8*4($29)
sw $17,64-9*4($29)
sw $16,64-10*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_encrypt
.set reorder
la $7,AES_Te # PIC-ified 'load address'
@@ -567,6 +566,7 @@ AES_decrypt:
.frame $29,64,$31
.mask 3237937152,-4
.set noreorder
+ .cpload $25
sub $29,64
sw $31,64-1*4($29)
sw $30,64-2*4($29)
@@ -578,8 +578,6 @@ AES_decrypt:
sw $18,64-8*4($29)
sw $17,64-9*4($29)
sw $16,64-10*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_decrypt
.set reorder
la $7,AES_Td # PIC-ified 'load address'
@@ -870,17 +868,16 @@ _mips_AES_set_encrypt_key:
nop
.end _mips_AES_set_encrypt_key
-.globl AES_set_encrypt_key
-.ent AES_set_encrypt_key
-AES_set_encrypt_key:
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
.frame $29,32,$31
.mask 3221225472,-4
.set noreorder
+ .cpload $25
sub $29,32
sw $31,32-1*4($29)
sw $30,32-2*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_set_encrypt_key
.set reorder
la $7,AES_Te # PIC-ified 'load address'
@@ -892,19 +889,18 @@ AES_set_encrypt_key:
lw $30,32-2*4($29)
jr $31
add $29,32
-.end AES_set_encrypt_key
+.end private_AES_set_encrypt_key
.align 5
-.globl AES_set_decrypt_key
-.ent AES_set_decrypt_key
-AES_set_decrypt_key:
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
.frame $29,32,$31
.mask 3221225472,-4
.set noreorder
+ .cpload $25
sub $29,32
sw $31,32-1*4($29)
sw $30,32-2*4($29)
- .cplocal $7
- .cpsetup $25,$0,AES_set_decrypt_key
.set reorder
la $7,AES_Te # PIC-ified 'load address'
@@ -1004,7 +1000,7 @@ AES_set_decrypt_key:
lw $30,32-2*4($29)
jr $31
add $29,32
-.end AES_set_decrypt_key
+.end private_AES_set_decrypt_key
.rdata
.align 6
AES_Te:
diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl
index 2ce6def..e523954 100644
--- a/crypto/aes/asm/aes-mips.pl
+++ b/crypto/aes/asm/aes-mips.pl
@@ -1036,9 +1036,9 @@
nop
.end _mips_AES_set_encrypt_key
-.globl AES_set_encrypt_key
-.ent AES_set_encrypt_key
-AES_set_encrypt_key:
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
@@ -1060,7 +1060,7 @@
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Tbl
- .cpsetup $pf,$zero,AES_set_encrypt_key
+ .cpsetup $pf,$zero,private_AES_set_encrypt_key
___
$code.=<<___;
.set reorder
@@ -1083,7 +1083,7 @@
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
-.end AES_set_encrypt_key
+.end private_AES_set_encrypt_key
___
my ($head,$tail)=($inp,$bits);
@@ -1091,9 +1091,9 @@
my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
$code.=<<___;
.align 5
-.globl AES_set_decrypt_key
-.ent AES_set_decrypt_key
-AES_set_decrypt_key:
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
@@ -1115,7 +1115,7 @@
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Tbl
- .cpsetup $pf,$zero,AES_set_decrypt_key
+ .cpsetup $pf,$zero,private_AES_set_decrypt_key
___
$code.=<<___;
.set reorder
@@ -1226,7 +1226,7 @@
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
-.end AES_set_decrypt_key
+.end private_AES_set_decrypt_key
___
}}}
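
The x86 module earlier in this patch also carries AES_cbc_encrypt, with its runtime choice between the fast table path and the compact slow path; again nothing changes for callers. One detail worth showing: AES_cbc_encrypt updates the IV buffer in place, so encrypt and decrypt each need a fresh copy (a sketch with arbitrary test data):

    #include <string.h>
    #include <openssl/aes.h>

    int main(void)
    {
        static const unsigned char key[16] = "0123456789abcdef";
        unsigned char iv0[16] = {0};                 /* fixed IV for the demo */
        unsigned char iv[16];
        unsigned char pt[32] = "two blocks of plaintext here..."; /* 32 bytes */
        unsigned char ct[32], out[32];
        AES_KEY enc, dec;

        AES_set_encrypt_key(key, 128, &enc);
        AES_set_decrypt_key(key, 128, &dec);

        memcpy(iv, iv0, 16);                         /* IV is clobbered... */
        AES_cbc_encrypt(pt, ct, 32, &enc, iv, AES_ENCRYPT);

        memcpy(iv, iv0, 16);                         /* ...so reset it */
        AES_cbc_encrypt(ct, out, 32, &dec, iv, AES_DECRYPT);

        return memcmp(pt, out, 32) != 0;             /* 0 on success */
    }
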
diff --git a/crypto/aes/asm/aes-parisc.pl b/crypto/aes/asm/aes-parisc.pl
new file mode 100644
index 0000000..c36b6a2
--- /dev/null
+++ b/crypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1021 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for PA-RISC.
+#
+# June 2009.
+#
+# The module is mechanical transliteration of aes-sparcv9.pl, but with
+# a twist: S-boxes are compressed even further down to 1K+256B. On
+# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
+# is about 33 cycles per byte processed with 128-bit key. Newer CPUs
+# perform at 16 cycles per byte. It's not faster than code generated
+# by vendor compiler, but recall that it has compressed S-boxes, which
+# requires extra processing.
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+ # [+ argument transfer]
+$inp="%r26"; # arg0
+$out="%r25"; # arg1
+$key="%r24"; # arg2
+
+($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
+($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
+
+($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
+ $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
+("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
+"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
+
+$tbl="%r28";
+$rounds="%r29";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 64
+AES_encrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$enc_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$enc_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$enc_inp_aligned
+ bl _parisc_AES_encrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$enc_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$enc_done
+ stb $s3,15($out)
+
+L\$enc_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$enc_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_encrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 12($key),$t3
+ _srm $s0,24,$acc0
+ xor $t1,$s1,$s1
+ ldw 16($key),$t0
+ _srm $s1,16,$acc1
+ xor $t2,$s2,$s2
+ ldw 20($key),$t1
+ xor $t3,$s3,$s3
+ ldw 24($key),$t2
+ ldw 28($key),$t3
+L\$enc_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$enc_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch te4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch te4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch te4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch te4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch te4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch te4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch te4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch te4
+ _srm $s1,16,$acc1
+ xor $acc14,$s3,$s3
+ b L\$enc_loop
+ xor $acc15,$s3,$s3
+
+ .ALIGN 16
+L\$enc_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t2,16,$acc5
+ _srm $t3,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t3,16,$acc9
+ _srm $t0,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t0,16,$acc13
+ _srm $t1,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t2,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Te
+ .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+ .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+ .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+ .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+ .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+ .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+ .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+ .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+ .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+ .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+ .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+ .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+ .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+ .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+ .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+ .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+ .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+ .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+ .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+ .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+ .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+ .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+ .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+ .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+ .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+ .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+ .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+ .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+ .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+ .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+ .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+ .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+ .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+ .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+ .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+ .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+ .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+ .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+ .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+ .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+ .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+ .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+ .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+ .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+ .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+ .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+ .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+ .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+ .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+ .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+ .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+ .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+ .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+ .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+ .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+ .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+ .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+ .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+ .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+ .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+ .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+ .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+ .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+ .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+ .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+
+$code.=<<___;
+ .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 16
+AES_decrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$dec_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$dec_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$dec_inp_aligned
+ bl _parisc_AES_decrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$dec_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$dec_done
+ stb $s3,15($out)
+
+L\$dec_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$dec_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_decrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ ldw 12($key),$t3
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 16($key),$t0
+ xor $t1,$s1,$s1
+ ldw 20($key),$t1
+ _srm $s0,24,$acc0
+ xor $t2,$s2,$s2
+ ldw 24($key),$t2
+ xor $t3,$s3,$s3
+ ldw 28($key),$t3
+ _srm $s3,16,$acc1
+L\$dec_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$dec_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch td4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch td4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch td4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch td4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch td4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch td4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch td4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch td4
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+ b L\$dec_loop
+ _srm $s3,16,$acc1
+
+ .ALIGN 16
+L\$dec_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t0,16,$acc5
+ _srm $t1,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t1,16,$acc9
+ _srm $t2,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t2,16,$acc13
+ _srm $t3,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t0,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Td
+ .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+ .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+ .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+ .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+ .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+ .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+ .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+ .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+ .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+ .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+ .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+ .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+ .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+ .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+ .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+ .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+ .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+ .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+ .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+ .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+ .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+ .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+ .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+ .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+ .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+ .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+ .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+ .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+ .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+ .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+ .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+ .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+ .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+ .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+ .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+ .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+ .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+ .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+ .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+ .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+ .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+ .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+ .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+ .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+ .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+ .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+ .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+ .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+ .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+ .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+ .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+ .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+ .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+ .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+ .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+ .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+ .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+ .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+ .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+ .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+ .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+ .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+ .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+ .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+ .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+	.STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+	# translate made-up instructions: _ror, _srm
+ s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
+
+ s/_srm(\s+%r[0-9]+),([0-9]+),/
+ $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
+ : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
+
+ s/,\*/,/ if ($SIZE_T==4);
+ print $_,"\n";
+}
+close STDOUT;
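
The foreach pass above is worth a closer look: _ror and _srm are not real PA-RISC mnemonics but shorthands the generator expands at print time, _srm becoming extru (32-bit) or extrd,u (64-bit) with the bit position counted from the opposite end of the word. A minimal standalone sketch of that expansion, using the same substitution; the sample input line is an illustration, not part of the patch:

    #!/usr/bin/perl
    # Expand the made-up _srm pseudo-instruction the same way the
    # translation loop above does; $SIZE_T and the input line are
    # assumptions for this demo.
    my $SIZE_T = 4;                               # pretend 32-bit build
    my $line = "\t_srm\t%r26,24,%r1\n";
    $line =~ s/_srm(\s+%r[0-9]+),([0-9]+),/
        $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
                   : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
    print $line;                                  # -> extru  %r26,7,8,%r1
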
diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl
index f82c5e1..7c52cbe 100644
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl
@@ -7,7 +7,7 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
#
# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
# February 2010
#
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
# performance improvement on the platform in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process a byte in 18 cycles, managing only 23, because it fails to issue
@@ -33,11 +33,13 @@
if ($flavour =~ /64/) {
$SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
+ $LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
@@ -116,15 +118,19 @@ ()
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
- .space `32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
- addi $Tbl0,$Tbl0,`128-8-32+2048+256`
+ addi $Tbl0,$Tbl0,`128-64-8+2048+256`
mtlr r0
blr
- .space `128-32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `128-64-9*4`
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ ()
.globl .AES_encrypt
.align 7
.AES_encrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@ ()
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Lenc_unaligned
+Lenc_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -363,8 +375,80 @@ ()
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Lenc_done
+
+Lenc_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Lenc_xpage
+ andi. $t1,$t1,4096-16
+ bne Lenc_unaligned_ok
+
+Lenc_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Te
+ bl Lppc_AES_encrypt_compact
+
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+Lenc_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,18 +472,21 @@ ()
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_encrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -413,44 +500,44 @@ ()
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s1,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s2,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s3,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s0,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -466,60 +553,60 @@ ()
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s1,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s2,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc06,$s3,`32-16`,24,31
+ lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc07,$s0,`32-16`,24,31
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc10,$s0,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc00,$Tbl2,$acc00
- lbzx $acc01,$Tbl2,$acc01
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc01,$Tbl2,$acc01
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $s1,$acc01,24,0,7
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $s3,$acc03,24,0,7
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc09,$Tbl2,$acc09
rlwimi $s1,$acc05,16,8,15
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc11,$Tbl2,$acc11
rlwimi $s3,$acc07,16,8,15
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc08,8,16,23
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc09,8,16,23
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc10,8,16,23
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
or $s1,$s1,$acc13
@@ -530,29 +617,31 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_encrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
mtctr $acc00
.align 4
Lenc_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@ ()
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -612,12 +701,12 @@ ()
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -633,32 +722,32 @@ ()
and $acc03,$acc03,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc01,$acc01,$acc09
+ rotlwi $acc12,$s0,16 # ROTATE(r0,16)
xor $acc02,$acc02,$acc10
+ rotlwi $acc13,$s1,16
xor $acc03,$acc03,$acc11
+ rotlwi $acc14,$s2,16
- rotlwi $acc12,$s0,16 # ROTATE(r0,16)
- rotlwi $acc13,$s1,16
- rotlwi $acc14,$s2,16
- rotlwi $acc15,$s3,16
xor $s0,$s0,$acc00 # r0^r2
+ rotlwi $acc15,$s3,16
xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
+ xor $s2,$s2,$acc02
rotrwi $s1,$s1,24
+ xor $s3,$s3,$acc03
rotrwi $s2,$s2,24
- rotrwi $s3,$s3,24
xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
+ rotrwi $s3,$s3,24
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
- rotlwi $acc09,$acc13,8
- rotlwi $acc10,$acc14,8
- rotlwi $acc11,$acc15,8
xor $s0,$s0,$acc12 #
+ rotlwi $acc09,$acc13,8
xor $s1,$s1,$acc13
+ rotlwi $acc10,$acc14,8
xor $s2,$s2,$acc14
+ rotlwi $acc11,$acc15,8
xor $s3,$s3,$acc15
xor $s0,$s0,$acc08 #
xor $s1,$s1,$acc09
@@ -673,14 +762,15 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .AES_decrypt
.align 7
.AES_decrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@ ()
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Ldec_unaligned
+
+Ldec_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -712,8 +809,80 @@ ()
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Ldec_done
+
+Ldec_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Ldec_xpage
+ andi. $t1,$t1,4096-16
+ bne Ldec_unaligned_ok
+
+Ldec_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Td
+ bl Lppc_AES_decrypt_compact
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
+
+Ldec_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@ ()
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_decrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -762,44 +934,44 @@ ()
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s3,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s0,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s1,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s2,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -815,56 +987,56 @@ ()
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s3,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s0,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
lbzx $acc00,$Tbl2,$acc00
+ lwz $acc11,`2048+96`($Tbl0)
lbzx $acc01,$Tbl2,$acc01
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc06,$s1,`32-16`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc07,$s2,`32-16`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl2,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl2,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_decrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
___
$code.=<<___ if ($SIZE_T==8);
insrdi $mask80,$mask80,32,0
@@ -904,10 +1078,10 @@ ()
Ldec_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@ ()
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -1030,12 +1204,12 @@ ()
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@ ()
and $acc06,$acc02,$mask80
and $acc07,$acc03,$mask80
srwi $acc08,$acc04,7 # r1>>7
- srwi $acc09,$acc05,7
- srwi $acc10,$acc06,7
- srwi $acc11,$acc07,7
andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
+ srwi $acc09,$acc05,7
andc $acc13,$acc01,$mask80
+ srwi $acc10,$acc06,7
andc $acc14,$acc02,$mask80
+ srwi $acc11,$acc07,7
andc $acc15,$acc03,$mask80
sub $acc04,$acc04,$acc08 # r1-(r1>>7)
sub $acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@ ()
and $acc08,$acc04,$mask80 # r1=r4&0x80808080
and $acc09,$acc05,$mask80
- and $acc10,$acc06,$mask80
- and $acc11,$acc07,$mask80
srwi $acc12,$acc08,7 # r1>>7
+ and $acc10,$acc06,$mask80
srwi $acc13,$acc09,7
+ and $acc11,$acc07,$mask80
srwi $acc14,$acc10,7
- srwi $acc15,$acc11,7
sub $acc08,$acc08,$acc12 # r1-(r1>>7)
+ srwi $acc15,$acc11,7
sub $acc09,$acc09,$acc13
sub $acc10,$acc10,$acc14
sub $acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@ ()
$code.=<<___;
rotrwi $s0,$s0,8 # = ROTATE(r0,8)
rotrwi $s1,$s1,8
- rotrwi $s2,$s2,8
- rotrwi $s3,$s3,8
xor $s0,$s0,$acc00 # ^= r2^r0
+ rotrwi $s2,$s2,8
xor $s1,$s1,$acc01
+ rotrwi $s3,$s3,8
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
xor $acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@ ()
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
xor $s0,$s0,$acc04 # ^= r4^r0
- xor $s1,$s1,$acc05
- xor $s2,$s2,$acc06
- xor $s3,$s3,$acc07
rotrwi $acc00,$acc00,24
+ xor $s1,$s1,$acc05
rotrwi $acc01,$acc01,24
+ xor $s2,$s2,$acc06
rotrwi $acc02,$acc02,24
+ xor $s3,$s3,$acc07
rotrwi $acc03,$acc03,24
xor $acc04,$acc04,$acc08
xor $acc05,$acc05,$acc09
xor $acc06,$acc06,$acc10
xor $acc07,$acc07,$acc11
xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
- xor $s1,$s1,$acc09
- xor $s2,$s2,$acc10
- xor $s3,$s3,$acc11
rotrwi $acc04,$acc04,16
+ xor $s1,$s1,$acc09
rotrwi $acc05,$acc05,16
+ xor $s2,$s2,$acc10
rotrwi $acc06,$acc06,16
+ xor $s3,$s3,$acc11
rotrwi $acc07,$acc07,16
xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
- xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $acc08,$acc08,8
+ xor $s1,$s1,$acc01
rotrwi $acc09,$acc09,8
+ xor $s2,$s2,$acc02
rotrwi $acc10,$acc10,8
+ xor $s3,$s3,$acc03
rotrwi $acc11,$acc11,8
xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
xor $s1,$s1,$acc05
@@ -1179,7 +1353,9 @@ ()
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
-.long 0
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
___
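
The bulk of this PPC change is the new unaligned-I/O path: a misaligned 16-byte block is still read and written word-wise, presumably safe as long as the accesses stay within one page, and falls back to byte loads/stores only when the block would straddle a 4KB page boundary. The subfic/andi. pairs at Lenc_unaligned and Ldec_unaligned implement that test; a hypothetical Perl restatement of the same predicate (helper name and sample addresses are made up for illustration):

    # Restate the cross-page test: fewer than 16 bytes left in the
    # current 4KB page means a 16-byte block spills onto the next page.
    sub crosses_page {
        my ($addr) = @_;
        my $to_page_end = 4096 - ($addr % 4096);   # bytes left in this page
        return $to_page_end < 16;                  # block would spill over
    }
    printf "0x%x: %s\n", $_, crosses_page($_) ? "byte path" : "word path"
        for (0x10001, 0x10ff9);                    # sample addresses
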
diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl
index 7e01889..e75dcd0 100644
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -44,12 +44,57 @@
# Unlike previous version hardware support detection takes place only
# at the moment of key schedule setup, which is denoted in key->rounds.
# This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives incredible performance improvement,
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
# because software implementation was optimized.
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over the "generic" counter mode routine relying
+# on the single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that the exact throughput depends on the current stack
+# frame alignment within a 4KB page. In the worst case you get ~75% of
+# the maximum, but *on average* as much as ~98%, meaning the worst case
+# is unlikely; it's like hitting a ravine on a plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in a 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement over vanilla km-based code
+# at 8KB block size, and ~37% at a more typical 512-byte block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$softonly=0; # allow hardware support
$t0="%r0"; $mask="%r0";
@@ -69,6 +114,8 @@
$ra="%r14";
$sp="%r15";
+$stdframe=16*$SIZE_T+4*8;
+
sub _data_word()
{ my $i;
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@ ()
.Lesoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -220,20 +267,20 @@ ()
larl $tbl,AES_Te
bras $ra,_s390x_AES_encrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_encrypt,.-AES_encrypt
.type _s390x_AES_encrypt,\@function
.align 16
_s390x_AES_encrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -397,7 +444,7 @@ ()
or $s2,$i3
or $s3,$t3
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
xr $s0,$t0
xr $s1,$t2
x $s2,24($key)
@@ -536,7 +583,7 @@ ()
.Ldsoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -546,20 +593,20 @@ ()
larl $tbl,AES_Td
bras $ra,_s390x_AES_decrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_decrypt,.-AES_decrypt
.type _s390x_AES_decrypt,\@function
.align 16
_s390x_AES_decrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -703,7 +750,7 @@ ()
nr $i1,$mask
nr $i2,$mask
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
or $s1,$t1
l $t0,16($key)
l $t1,20($key)
@@ -732,14 +779,15 @@ ()
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
lghi $t0,0
- clgr $inp,$t0
+ cl${g}r $inp,$t0
je .Lminus1
- clgr $key,$t0
+ cl${g}r $key,$t0
je .Lminus1
lghi $t0,128
@@ -789,7 +837,8 @@ ()
je 1f
lg %r1,24($inp)
stg %r1,24($key)
-1: st $bits,236($key) # save bits
+1: st $bits,236($key) # save bits [for debugging purposes]
+ lgr $t0,%r5
st %r5,240($key) # save km code
lghi %r2,0
br %r14
@@ -797,7 +846,7 @@ ()
$code.=<<___;
.align 16
.Lekey_internal:
- stmg %r6,%r13,48($sp) # all non-volatile regs
+ stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
larl $tbl,AES_Te+2048
@@ -857,8 +906,9 @@ ()
la $key,16($key) # key+=4
la $t3,4($t3) # i++
brct $rounds,.L128_loop
+ lghi $t0,10
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -905,8 +955,9 @@ ()
st $s2,32($key)
st $s3,36($key)
brct $rounds,.L192_continue
+ lghi $t0,12
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -967,8 +1018,9 @@ ()
st $s2,40($key)
st $s3,44($key)
brct $rounds,.L256_continue
+ lghi $t0,14
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -1011,42 +1063,34 @@ ()
.Lminus1:
lghi %r2,-1
br $ra
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
# void AES_set_decrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function
.align 16
-AES_set_decrypt_key:
- stg $key,32($sp) # I rely on AES_set_encrypt_key to
- stg $ra,112($sp) # save non-volatile registers!
- bras $ra,AES_set_encrypt_key
- lg $key,32($sp)
- lg $ra,112($sp)
+private_AES_set_decrypt_key:
+ #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
+ st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
+ bras $ra,_s390x_AES_set_encrypt_key
+ #l${g} $key,4*$SIZE_T($sp)
+ l${g} $ra,14*$SIZE_T($sp)
ltgr %r2,%r2
bnzr $ra
___
$code.=<<___ if (!$softonly);
- l $t0,240($key)
+ #l $t0,240($key)
lhi $t1,16
cr $t0,$t1
jl .Lgo
oill $t0,0x80 # set "decrypt" bit
st $t0,240($key)
br $ra
-
-.align 16
-.Ldkey_internal:
- stg $key,32($sp)
- stg $ra,40($sp)
- bras $ra,.Lekey_internal
- lg $key,32($sp)
- lg $ra,40($sp)
___
$code.=<<___;
-
-.Lgo: llgf $rounds,240($key)
+.align 16
+.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
la $i1,0($key)
sllg $i2,$rounds,4
la $i2,0($i2,$key)
@@ -1123,13 +1167,14 @@ ()
la $key,4($key)
brct $rounds,.Lmix
- lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+ lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
lghi %r2,0
br $ra
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
-#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+########################################################################
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivec, const int enc)
{
@@ -1163,7 +1208,7 @@ ()
l %r0,240($key) # load kmc code
lghi $key,15 # res=len%16, len-=res;
ngr $key,$len
- slgr $len,$key
+ sl${g}r $len,$key
la %r1,16($sp) # parameter block - ivec || key
jz .Lkmc_truncated
.long 0xb92f0042 # kmc %r4,%r2
@@ -1181,34 +1226,34 @@ ()
tmll %r0,0x80
jnz .Lkmc_truncated_dec
lghi %r1,0
- stg %r1,128($sp)
- stg %r1,136($sp)
+ stg %r1,16*$SIZE_T($sp)
+ stg %r1,16*$SIZE_T+8($sp)
bras %r1,1f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
1: ex $key,0(%r1)
la %r1,16($sp) # restore parameter block
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
j .Lkmc_done
.align 16
.Lkmc_truncated_dec:
- stg $out,64($sp)
- la $out,128($sp)
+ st${g} $out,4*$SIZE_T($sp)
+ la $out,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
- lg $out,64($sp)
+ l${g} $out,4*$SIZE_T($sp)
bras %r1,2f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
2: ex $key,0(%r1)
j .Lkmc_done
.align 16
.Lcbc_software:
___
$code.=<<___;
- stmg $key,$ra,40($sp)
+ stm${g} $key,$ra,5*$SIZE_T($sp)
lhi %r0,0
- cl %r0,164($sp)
+ cl %r0,`$stdframe+$SIZE_T-4`($sp)
je .Lcbc_decrypt
larl $tbl,AES_Te
@@ -1219,10 +1264,10 @@ ()
llgf $s3,12($ivp)
lghi $t0,16
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
.Lcbc_enc_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
x $s0,0($inp)
x $s1,4($inp)
x $s2,8($inp)
@@ -1231,7 +1276,7 @@ ()
bras $ra,_s390x_AES_encrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
@@ -1240,33 +1285,33 @@ ()
la $inp,16($inp)
la $out,16($out)
lghi $t0,16
- ltgr $len,$len
+ lt${g}r $len,$len
jz .Lcbc_enc_done
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
j .Lcbc_enc_loop
.align 16
.Lcbc_enc_done:
- lg $ivp,48($sp)
+ l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
- lmg %r7,$ra,56($sp)
+ lm${g} %r7,$ra,7*$SIZE_T($sp)
br $ra
.align 16
.Lcbc_enc_tail:
aghi $len,15
lghi $t0,0
- stg $t0,128($sp)
- stg $t0,136($sp)
+ stg $t0,16*$SIZE_T($sp)
+ stg $t0,16*$SIZE_T+8($sp)
bras $t1,3f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
3: ex $len,0($t1)
lghi $len,0
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
j .Lcbc_enc_loop
.align 16
@@ -1275,10 +1320,10 @@ ()
lg $t0,0($ivp)
lg $t1,8($ivp)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
.Lcbc_dec_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
llgf $s2,8($inp)
@@ -1287,7 +1332,7 @@ ()
bras $ra,_s390x_AES_decrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
sllg $s0,$s0,32
sllg $s2,$s2,32
lr $s0,$s1
@@ -1295,15 +1340,15 @@ ()
lg $t0,0($inp)
lg $t1,8($inp)
- xg $s0,128($sp)
- xg $s2,136($sp)
+ xg $s0,16*$SIZE_T($sp)
+ xg $s2,16*$SIZE_T+8($sp)
lghi $s1,16
- slgr $len,$s1
+ sl${g}r $len,$s1
brc 4,.Lcbc_dec_tail # if borrow
brc 2,.Lcbc_dec_done # if zero
stg $s0,0($out)
stg $s2,8($out)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
la $inp,16($inp)
la $out,16($out)
@@ -1313,7 +1358,7 @@ ()
stg $s0,0($out)
stg $s2,8($out)
.Lcbc_dec_exit:
- lmg $ivp,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
stmg $t0,$t1,0($ivp)
br $ra
@@ -1321,19 +1366,872 @@ ()
.align 16
.Lcbc_dec_tail:
aghi $len,15
- stg $s0,128($sp)
- stg $s2,136($sp)
+ stg $s0,16*$SIZE_T($sp)
+ stg $s2,16*$SIZE_T+8($sp)
bras $s1,4f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
4: ex $len,0($s1)
j .Lcbc_dec_exit
.size AES_cbc_encrypt,.-AES_cbc_encrypt
-.comm OPENSSL_s390xcap_P,8,8
+___
+}
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+# size_t blocks, const AES_KEY *key,
+# const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4"; # blocks and out are swapped
+my $len="%r3";
+my $key="%r5"; my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl AES_ctr32_encrypt
+.type AES_ctr32_encrypt,\@function
+.align 16
+AES_ctr32_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+ llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lctr32_software
+
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+ la %r1,0($key) # %r1 is permanent copy of $key
+ lg $iv0,0($ivp) # load ivec
+ lg $ivp,8($ivp)
+
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
+ st${g} $s2,0($sp) # back-chain
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lctr32_hw_switch # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lctr32_hw_switch:
+___
+$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
+ larl $s0,OPENSSL_s390xcap_P
+ lg $s0,8($s0)
+ tmhh $s0,0x0004 # check for message_security-assist-4
+ jz .Lctr32_km_loop
+
+ llgfr $s0,%r0
+ lgr $s1,%r1
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb92d2042 # kmctr %r4,%r2,%r2
+
+ llihh %r0,0x8000 # check if kmctr supports the function code
+ srlg %r0,%r0,0($s0)
+ ng %r0,16($sp)
+ lgr %r0,$s0
+ lgr %r1,$s1
+ jz .Lctr32_km_loop
+
+####### kmctr code
+ algr $out,$inp # restore $out
+ lgr $s1,$len # $s1 undertakes $len
+ j .Lctr32_kmctr_loop
+.align 16
+.Lctr32_kmctr_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_kmctr_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_kmctr_prepare
+
+ #la $inp,0($inp) # inp
+ sllg $len,$fp,4 # len
+ #la $out,0($out) # out
+ la $s2,16($sp) # iv
+ .long 0xb92da042 # kmctr $out,$s2,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+
+ slgr $s1,$fp
+ brc 1,.Lctr32_kmctr_loop # not zero, no borrow
+ algr $fp,$s1
+ lghi $s1,0
+ brc 4+1,.Lctr32_kmctr_loop # not zero
+
+ l${g} $sp,0($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+___
+$code.=<<___;
+.Lctr32_km_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_km_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_km_prepare
+
+ la $s0,16($sp) # inp
+ sllg $s1,$fp,4 # len
+ la $s2,16($sp) # out
+ .long 0xb92e00a8 # km %r10,%r8
+ brc 1,.-4 # pay attention to "partial completion"
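+ # condition code 3 from km means "partial completion" (the operation
+ # was interrupted part-way through); brc 1 branches on CC=3 and
+ # re-issues the instruction until it reports done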
+
+ la $s2,16($sp)
+ lgr $s3,$fp
+ slgr $s2,$inp
+.Lctr32_km_xor:
+ lg $s0,0($inp)
+ lg $s1,8($inp)
+ xg $s0,0($s2,$inp)
+ xg $s1,8($s2,$inp)
+ stg $s0,0($out,$inp)
+ stg $s1,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lctr32_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lctr32_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lctr32_km_loop # not zero
+
+ l${g} $s0,0($sp)
+ l${g} $s1,$SIZE_T($sp)
+ la $s2,16($sp)
+.Lctr32_km_zap:
+ stg $s0,0($s2)
+ stg $s0,8($s2)
+ la $s2,16($s2)
+ brct $s1,.Lctr32_km_zap
+
+ la $sp,0($s0)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lctr32_software:
+___
+$code.=<<___;
+ stm${g} $key,$ra,5*$SIZE_T($sp)
+ sl${g}r $inp,$out
+ larl $tbl,AES_Te
+ llgf $t1,12($ivp)
+
+.Lctr32_loop:
+ stm${g} $inp,$out,2*$SIZE_T($sp)
+ llgf $s0,0($ivp)
+ llgf $s1,4($ivp)
+ llgf $s2,8($ivp)
+ lgr $s3,$t1
+ st $t1,16*$SIZE_T($sp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_encrypt
+
+ lm${g} $inp,$ivp,2*$SIZE_T($sp)
+ llgf $t1,16*$SIZE_T($sp)
+ x $s0,0($inp,$out)
+ x $s1,4($inp,$out)
+ x $s2,8($inp,$out)
+ x $s3,12($inp,$out)
+ stm $s0,$s3,0($out)
+
+ la $out,16($out)
+ ahi $t1,1 # 32-bit increment
+ brct $len,.Lctr32_loop
+
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4"; # len and out are swapped
+my $len="%r3";
+my $key1="%r5"; # $i1
+my $key2="%r6"; # $i2
+my $fp="%r7"; # $i3
+my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type _s390x_xts_km,\@function
+.align 16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+ llgfr $s0,%r0 # put aside the function code
+ lghi $s1,0x7f
+ nr $s1,%r0
+ lghi %r0,0 # query capability vector
+ la %r1,$tweak-16($sp)
+ .long 0xb92e0042 # km %r4,%r2
+ llihh %r1,0x8000
+ srlg %r1,%r1,32($s1) # check for 32+function code
+ ng %r1,$tweak-16($sp)
+ lgr %r0,$s0 # restore the function code
+ la %r1,0($key1) # restore $key1
+ jz .Lxts_km_vanilla
+
+ lmg $i2,$i3,$tweak($sp) # put aside the tweak value
+ algr $out,$inp
+
+ oill %r0,32 # switch to xts function code
+ aghi $s1,-18 #
+ sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
+ la %r1,$tweak-16($sp)
+ slgr %r1,$s1 # parameter block position
+ lmg $s0,$s3,0($key1) # load 256 bits of key material,
+ stmg $s0,$s3,0(%r1) # and copy it to parameter block.
+ # yes, it contains junk and overlaps
+ # with the tweak in 128-bit case.
+ # it's done to avoid conditional
+ # branch.
+ stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
+
+ .long 0xb92e0042 # km %r4,%r2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ lrvg $s0,$tweak+0($sp) # load the last tweak
+ lrvg $s1,$tweak+8($sp)
+ stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
+
+ nill %r0,0xffdf # switch back to original function code
+ la %r1,0($key1) # restore pointer to $key1
+ slgr $out,$inp
+
+ llgc $len,2*$SIZE_T-1($sp)
+ nill $len,0x0f # $len%=16
+ br $ra
+
+.align 16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16 # guarantee at least 256-byte buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+ # buffer size is at least 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ nill $fp,0xfff0 # round to 16*n
+ st${g} $s2,0($sp) # back-chain
+ nill $len,0xfff0 # redundant
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_go # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+ lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
+ lrvg $s1,$tweak+8($s2)
+
+ la $s2,16($sp) # vector of ascending tweak values
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+ j .Lxts_km_start
+
+.Lxts_km_loop:
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_prepare:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
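+ # the five instructions above advance the XTS tweak: the 128-bit
+ # value (byte-reversed into $s1:$s0, $s1 high) is shifted left one
+ # bit via add-with-carry and, if the bit shifted out was set, reduced
+ # by xoring in 0x87, the GF(2^128) feedback polynomial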
+.Lxts_km_start:
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ stg $i1,0($s2,$inp)
+ stg $i2,8($s2,$inp)
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_prepare
+
+ slgr $inp,$fp # rewind $inp
+ la $s2,0($out,$inp)
+ lgr $s3,$fp
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_xor:
+ lg $i1,0($out,$inp)
+ lg $i2,8($out,$inp)
+ xg $i1,0($s2,$inp)
+ xg $i2,8($s2,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lxts_km_loop # not zero
+
+ l${g} $i1,0($sp) # back-chain
+ llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
+ la $i2,16($sp)
+ srlg $fp,$fp,4
+.Lxts_km_zap:
+ stg $i1,0($i2)
+ stg $i1,8($i2)
+ la $i2,16($i2)
+ brct $fp,.Lxts_km_zap
+
+ la $sp,0($i1)
+ llgc $len,2*$SIZE_T-1($i1)
+ nill $len,0x0f # $len%=16
+ bzr $ra
+
+ # generate one more tweak...
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+
+ ltr $len,$len # clear zero flag
+ br $ra
+.size _s390x_xts_km,.-_s390x_xts_km
+
+.globl AES_xts_encrypt
+.type AES_xts_encrypt,\@function
+.align 16
+AES_xts_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ srag $len,$len,4 # formally wrong, because it expands
+ # sign byte, but who can afford asking
+ # to process more than 2^63-1 bytes?
+ # I use it, because it sets condition
+ # code...
+ bcr 8,$ra # abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_enc_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ sllg $len,$len,4 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed anymore
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+ bras $ra,_s390x_xts_km
+ jz .Lxts_enc_km_done
+
+ aghi $inp,-16 # take one step back
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_enc_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_km_steal
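+ # classic XTS ciphertext stealing: the $len%16 tail bytes of input
+ # displace the head of the last full ciphertext block, the displaced
+ # bytes become the short final output block, and the rebuilt block
+ # is encrypted once more below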
+
+ la $s2,0($i3)
+ lghi $s3,16
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($s2)
+ xg $i2,8($s2)
+ stg $i1,0($s2)
+ stg $i2,8($s2)
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($i3)
+ xg $i2,8($i3)
+ stg $i1,0($i3)
+ stg $i2,8($i3)
+
+.Lxts_enc_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_enc_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ j .Lxts_enc_enter
+
+.align 16
+.Lxts_enc_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+ la $inp,16($inp) # $inp+=16
+.Lxts_enc_enter:
+ x $s0,0($inp) # ^=*($inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ brct${g} $len,.Lxts_enc_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_enc_done
+
+ la $i3,0($inp,$out) # put aside real $out
+.Lxts_enc_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_steal
+ la $out,0($i3) # restore real $out
+
+ # generate last tweak...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($out) # ^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,`$tweak+0`($sp) # ^=tweak
+ x $s1,`$tweak+4`($sp)
+ x $s2,`$tweak+8`($sp)
+ x $s3,`$tweak+12`($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+.Lxts_enc_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+$code.=<<___;
+.globl AES_xts_decrypt
+.type AES_xts_decrypt,\@function
+.align 16
+AES_xts_decrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ aghi $len,-16
+ bcr 4,$ra # abort if less than zero. formally
+ # wrong, because $len is unsigned,
+ # but who can afford asking to
+ # process more than 2^63-1 bytes?
+ tmll $len,0x0f
+ jnz .Lxts_dec_proceed
+ aghi $len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_dec_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ nill $len,0xfff0 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed past this point
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+
+ ltgr $len,$len
+ jz .Lxts_dec_km_short
+ bras $ra,_s390x_xts_km
+ jz .Lxts_dec_km_done
+
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+ j .Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%=16
+ lrvg $s0,$tweak+0($sp) # load the tweak
+ lrvg $s1,$tweak+8($sp)
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+
+.Lxts_dec_km_2ndtweak:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $i2,0($out,$inp)
+ lghi $i3,16
+ .long 0xb92e0066 # km $i2,$i2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0
+ lrvgr $i2,$s1
+ xg $i1,0($out,$inp)
+ xg $i2,8($out,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_km_steal
+
+ lgr $s0,$s2
+ lgr $s1,$s3
+ xg $s0,0($i3)
+ xg $s1,8($i3)
+ stg $s0,0($i3)
+ stg $s1,8($i3)
+ la $s0,0($i3)
+ lghi $s1,16
+ .long 0xb92e0088 # km $s0,$s0
+ brc 1,.-4 # can this happen?
+ xg $s2,0($i3)
+ xg $s3,8($i3)
+ stg $s2,0($i3)
+ stg $s3,8($i3)
+.Lxts_dec_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_dec_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ srlg $len,$len,4
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ larl $tbl,AES_Td
+ lt${g}r $len,$len
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ jz .Lxts_dec_short
+ j .Lxts_dec_enter
+
+.align 16
+.Lxts_dec_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+.Lxts_dec_enter:
+ x $s0,0($inp) # tweak^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ la $inp,16($inp)
+ brct${g} $len,.Lxts_dec_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_dec_done
+
+ # generate pair of tweaks...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $i2,$s1 # flip byte order
+ lrvgr $i3,$s3
+ stmg $i2,$i3,$tweak($sp) # save the 1st tweak
+ j .Lxts_dec_2ndtweak
+
+.align 16
+.Lxts_dec_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak-16+0($sp) # save the 2nd tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak-16+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($inp) # tweak_the_2nd^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
+ x $s1,$tweak-16+4($sp)
+ x $s2,$tweak-16+8($sp)
+ x $s3,$tweak-16+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_steal
+ la $out,0($i3) # restore real $out
+
+ lm $s0,$s3,$tweak($sp) # load the 1st tweak
+ x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+ stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
+ stg $sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_decrypt,.-AES_xts_decrypt
___
}
$code.=<<___;
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
+close STDOUT; # force flush
diff --git a/crypto/aes/asm/aes-sparcv9.pl b/crypto/aes/asm/aes-sparcv9.pl
index c57b3a2..403c4d1 100755
--- a/crypto/aes/asm/aes-sparcv9.pl
+++ b/crypto/aes/asm/aes-sparcv9.pl
@@ -1176,6 +1176,7 @@ ()
# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
# undesired effect, so just omit them and sacrifice some portion of
# percent in performance...
-$code =~ s/fmovs.*$//gem;
+$code =~ s/fmovs.*$//gm;
print $code;
+close STDOUT; # ensure flush
diff --git a/crypto/aes/asm/aes-x86_64.S b/crypto/aes/asm/aes-x86_64.S
new file mode 100644
index 0000000..e385566
--- /dev/null
+++ b/crypto/aes/asm/aes-x86_64.S
@@ -0,0 +1,2541 @@
+.text
+.type _x86_64_AES_encrypt,@function
+.align 16
+_x86_64_AES_encrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Lenc_loop
+.align 16
+.Lenc_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ shrl $16,%ecx
+ movzbl %ah,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movl 12(%r15),%edx
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rdi,8),%r12d
+ xorl 1(%r14,%rbp,8),%r8d
+
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Lenc_loop
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl 2(%r14,%rsi,8),%r10d
+ movzbl 2(%r14,%rdi,8),%r11d
+ movzbl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl 2(%r14,%rsi,8),%r8d
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $65280,%edi
+ andl $65280,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%ecx
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ shrl $16,%edx
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+
+ andl $65280,%esi
+ andl $65280,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $16711680,%edi
+ andl $16711680,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 2(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $4278190080,%edi
+ andl $4278190080,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movl 16+12(%r15),%edx
+ movl 2(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 16+0(%r15),%eax
+
+ andl $4278190080,%esi
+ andl $4278190080,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
+.type _x86_64_AES_encrypt_compact,@function
+.align 16
+_x86_64_AES_encrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %dh,%ebp
+ movzbl %ah,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %dh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ah,%edi
+ shrl $8,%ecx
+ shrl $8,%ebx
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rcx,1),%edx
+ movzbl (%r14,%rbx,1),%ecx
+ shll $16,%r9d
+ shll $16,%r13d
+ shll $16,%ebp
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%edi
+ shll $24,%edx
+ xorl %esi,%r10d
+ shll $24,%ecx
+ xorl %edi,%r11d
+ movl %r10d,%eax
+ movl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Lenc_compact_done
+ movl %eax,%esi
+ movl %ebx,%edi
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ movl %esi,%r10d
+ movl %edi,%r11d
+ shrl $7,%r10d
+ leal (%rax,%rax,1),%r8d
+ shrl $7,%r11d
+ leal (%rbx,%rbx,1),%r9d
+ subl %r10d,%esi
+ subl %r11d,%edi
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %eax,%r10d
+ movl %ebx,%r11d
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %ecx,%esi
+ movl %edx,%edi
+ roll $24,%eax
+ roll $24,%ebx
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %esi,%r12d
+ movl %edi,%ebp
+ rorl $16,%r10d
+ rorl $16,%r11d
+ shrl $7,%r12d
+ leal (%rcx,%rcx,1),%r8d
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ shrl $7,%ebp
+ leal (%rdx,%rdx,1),%r9d
+ rorl $8,%r10d
+ rorl $8,%r11d
+ subl %r12d,%esi
+ subl %ebp,%edi
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %ecx,%r12d
+ movl %edx,%ebp
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ roll $24,%ecx
+ roll $24,%edx
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ movl 0(%r14),%esi
+ rorl $16,%r12d
+ rorl $16,%ebp
+ movl 64(%r14),%edi
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ movl 128(%r14),%r8d
+ rorl $8,%r12d
+ rorl $8,%ebp
+ movl 192(%r14),%r9d
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
+AES_encrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
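+ /* the sequence above aligns %rsp to 64 bytes and then biases it by a
+    key-address-dependent multiple of 64 (0..960), apparently so the
+    stack frame and the key schedule map onto different cache sets;
+    %r10 keeps the caller's %rsp for the epilogue */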
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Lenc_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq .LAES_Te+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
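+ /* %r14 now points at one of the four 256-byte Te4 (S-box) copies
+    replicated after the main table; the offset is derived from the
+    stack address so that the copy in use cannot overlap the frame in
+    the same cache sets */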
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lenc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_encrypt,.-AES_encrypt
+.type _x86_64_AES_decrypt,@function
+.align 16
+_x86_64_AES_decrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Ldec_loop
+.align 16
+.Ldec_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl %ch,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ movl 12(%r15),%edx
+ movzbl %ah,%ebp
+ xorl 1(%r14,%rsi,8),%r12d
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rbp,8),%r8d
+
+ xorl %r10d,%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r12d,%ecx
+ xorl %r11d,%ebx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Ldec_loop
+ leaq 2048(%r14),%r14
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl (%r14,%rsi,1),%r10d
+ movzbl (%r14,%rdi,1),%r11d
+ movzbl (%r14,%rbp,1),%r12d
+
+ movzbl %dl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movzbl (%r14,%rsi,1),%r8d
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $8,%edi
+ shll $8,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%edx
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ shrl $16,%eax
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+
+ shll $8,%esi
+ shll $8,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $16,%edi
+ shll $16,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $24,%edi
+ shll $24,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movl 16+12(%r15),%edx
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movl 16+0(%r15),%eax
+
+ shll $24,%esi
+ shll $24,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ leaq -2048(%r14),%r14
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
+.type _x86_64_AES_decrypt_compact,@function
+.align 16
+_x86_64_AES_decrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Ldec_loop_compact
+
+.align 16
+.Ldec_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %bh,%ebp
+ movzbl %ch,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %bh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ch,%edi
+ shll $16,%r9d
+ shll $16,%r13d
+ movzbl (%r14,%rdi,1),%ebx
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+
+ movzbl %dh,%edi
+ shrl $8,%eax
+ shll $16,%ebp
+ movzbl (%r14,%rdi,1),%ecx
+ movzbl (%r14,%rax,1),%edx
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%ebx
+ shll $24,%ecx
+ xorl %esi,%r10d
+ shll $24,%edx
+ xorl %r11d,%ebx
+ movl %r10d,%eax
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Ldec_compact_done
+
+ movq 256+0(%r14),%rsi
+ shlq $32,%rbx
+ shlq $32,%rdx
+ movq 256+8(%r14),%rdi
+ orq %rbx,%rax
+ orq %rdx,%rcx
+ movq 256+16(%r14),%rbp
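+ /* %rsi/%rdi/%rbp are loaded with what appear to be packed
+    0x8080../0xfefe../0x1b1b.. constants stored after the inverse
+    S-box; the shift/mask chains below use them to compute xtime
+    (multiplication by 2 in GF(2^8)) on eight bytes at a time, i.e.
+    InvMixColumns without table lookups */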
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq 0(%r14),%rsi
+ shrq $32,%r8
+ shrq $32,%r11
+ movq 64(%r14),%rdi
+ roll $16,%r9d
+ roll $16,%r12d
+ movq 128(%r14),%rbp
+ roll $16,%r8d
+ roll $16,%r11d
+ movq 192(%r14),%r10
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+ movq 256(%r14),%r13
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ jmp .Ldec_loop_compact
+.align 16
+.Ldec_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
+AES_decrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Ldec_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq .LAES_Td+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+ shrq $3,%rbp
+ addq %rbp,%r14
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Ldec_epilogue:
+ .byte 0xf3,0xc3
+.size AES_decrypt,.-AES_decrypt
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $8,%rsp
+.Lenc_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Lenc_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+
+.type _x86_64_AES_set_encrypt_key,@function
+.align 16
+_x86_64_AES_set_encrypt_key:
+ movl %esi,%ecx
+ movq %rdi,%rsi
+ movq %rdx,%rdi
+
+ testq $-1,%rsi
+ jz .Lbadpointer
+ testq $-1,%rdi
+ jz .Lbadpointer
+
+ leaq .LAES_Te(%rip),%rbp
+ leaq 2048+128(%rbp),%rbp
+
+
+ movl 0-128(%rbp),%eax
+ movl 32-128(%rbp),%ebx
+ movl 64-128(%rbp),%r8d
+ movl 96-128(%rbp),%edx
+ movl 128-128(%rbp),%eax
+ movl 160-128(%rbp),%ebx
+ movl 192-128(%rbp),%r8d
+ movl 224-128(%rbp),%edx
+
+ cmpl $128,%ecx
+ je .L10rounds
+ cmpl $192,%ecx
+ je .L12rounds
+ cmpl $256,%ecx
+ je .L14rounds
+ movq $-2,%rax
+ jmp .Lexit
+
+.L10rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rdx,8(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L10shortcut
+.align 4
+.L10loop:
+ movl 0(%rdi),%eax
+ movl 12(%rdi),%edx
+.L10shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
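+ /* the four byte lookups above apply RotWord+SubWord to the previous
+    round-key word (%rbp points 128 bytes into the S-box so disp8
+    addressing reaches all 256 entries); the xorl just above folds in
+    the round constant, stored past the replicated S-box copies */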
+ movl %eax,16(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,20(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,24(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,28(%rdi)
+ addl $1,%ecx
+ leaq 16(%rdi),%rdi
+ cmpl $10,%ecx
+ jl .L10loop
+
+ movl $10,80(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L12rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rdx,16(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L12shortcut
+.align 4
+.L12loop:
+ movl 0(%rdi),%eax
+ movl 20(%rdi),%edx
+.L12shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,24(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,28(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,32(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,36(%rdi)
+
+ cmpl $7,%ecx
+ je .L12break
+ addl $1,%ecx
+
+ xorl 16(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ leaq 24(%rdi),%rdi
+ jmp .L12loop
+.L12break:
+ movl $12,72(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L14rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rcx
+ movq 24(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L14shortcut
+.align 4
+.L14loop:
+ movl 0(%rdi),%eax
+ movl 28(%rdi),%edx
+.L14shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,32(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,36(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ cmpl $6,%ecx
+ je .L14break
+ addl $1,%ecx
+
+ movl %eax,%edx
+ movl 16(%rdi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movl %eax,48(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,52(%rdi)
+ xorl 24(%rdi),%eax
+ movl %eax,56(%rdi)
+ xorl 28(%rdi),%eax
+ movl %eax,60(%rdi)
+
+ leaq 32(%rdi),%rdi
+ jmp .L14loop
+.L14break:
+ movl $14,48(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.Lbadpointer:
+ movq $-1,%rax
+.Lexit:
+.byte 0xf3,0xc3
+.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdx
+.Ldec_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+ movq (%rsp),%r8
+ cmpl $0,%eax
+ jne .Labort
+
+ movl 240(%r8),%r14d
+ xorq %rdi,%rdi
+ leaq (%rdi,%r14,4),%rcx
+ movq %r8,%rsi
+ leaq (%r8,%rcx,4),%rdi
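+ /* %r14d holds the round count, so %rdi ends up at schedule+rounds*16,
+    the last round key; .Linvert below swaps the round keys
+    end-for-end, the first step of turning the encryption schedule
+    into a decryption one */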
+.align 4
+.Linvert:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 0(%rdi),%rcx
+ movq 8(%rdi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,0(%rsi)
+ movq %rdx,8(%rsi)
+ leaq 16(%rsi),%rsi
+ leaq -16(%rdi),%rdi
+ cmpq %rsi,%rdi
+ jne .Linvert
+
+ leaq .LAES_Te+2048+1024(%rip),%rax
+
+ movq 40(%rax),%rsi
+ movq 48(%rax),%rdi
+ movq 56(%rax),%rbp
+
+ movq %r8,%r15
+ subl $1,%r14d
+.align 4
+.Lpermute:
+ leaq 16(%r15),%r15
+ movq 0(%r15),%rax
+ movq 8(%r15),%rcx
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+
+ shrq $32,%r8
+ shrq $32,%r11
+
+ roll $16,%r9d
+ roll $16,%r12d
+
+ roll $16,%r8d
+ roll $16,%r11d
+
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ subl $1,%r14d
+ jnz .Lpermute
+
+ xorq %rax,%rax
+.Labort:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Ldec_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
+AES_cbc_encrypt:
+ cmpq $0,%rdx
+ je .Lcbc_epilogue
+ pushfq
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.Lcbc_prologue:
+
+ cld
+ movl %r9d,%r9d
+
+ leaq .LAES_Te(%rip),%r14
+ cmpq $0,%r9
+ jne .Lcbc_picked_te
+ leaq .LAES_Td(%rip),%r14
+.Lcbc_picked_te:
+
+ movl OPENSSL_ia32cap_P(%rip),%r10d
+ cmpq $512,%rdx
+ jb .Lcbc_slow_prologue
+ testq $15,%rdx
+ jnz .Lcbc_slow_prologue
+ btl $28,%r10d
+ jc .Lcbc_slow_prologue
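+ /* the fast path (full 2KB tables) is taken only for inputs of at
+    least 512 bytes that are a multiple of 16, and only if ia32cap
+    bit 28 (the CPUID HTT flag) is clear -- with hyper-threading a
+    sibling thread could in principle mount a cache-timing attack on
+    the large tables */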
+
+
+ leaq -88-248(%rsp),%r15
+ andq $-64,%r15
+
+
+ movq %r14,%r10
+ leaq 2304(%r14),%r11
+ movq %r15,%r12
+ andq $4095,%r10
+ andq $4095,%r11
+ andq $4095,%r12
+
+ cmpq %r11,%r12
+ jb .Lcbc_te_break_out
+ subq %r11,%r12
+ subq %r12,%r15
+ jmp .Lcbc_te_ok
+.Lcbc_te_break_out:
+ subq %r10,%r12
+ andq $4095,%r12
+ addq $320,%r12
+ subq %r12,%r15
+.align 4
+.Lcbc_te_ok:
+
+ xchgq %rsp,%r15
+
+ movq %r15,16(%rsp)
+.Lcbc_fast_body:
+ movq %rdi,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rdx,40(%rsp)
+ movq %rcx,48(%rsp)
+ movq %r8,56(%rsp)
+ movl $0,80+240(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+
+ movl 240(%r15),%eax
+
+ movq %r15,%r10
+ subq %r14,%r10
+ andq $4095,%r10
+ cmpq $2304,%r10
+ jb .Lcbc_do_ecopy
+ cmpq $4096-248,%r10
+ jb .Lcbc_skip_ecopy
+.align 4
+.Lcbc_do_ecopy:
+ movq %r15,%rsi
+ leaq 80(%rsp),%rdi
+ leaq 80(%rsp),%r15
+ movl $30,%ecx
+.long 0x90A548F3
+ movl %eax,(%rdi)
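+ /* the .long above encodes f3 48 a5 90, i.e. "rep movsq" followed by
+    a nop: with %ecx=30 it copies the 240-byte key schedule onto the
+    stack, after which the movl stores the round count at offset 240 */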
+.Lcbc_skip_ecopy:
+ movq %r15,0(%rsp)
+
+ movl $18,%ecx
+.align 4
+.Lcbc_prefetch_te:
+ movq 0(%r14),%r10
+ movq 32(%r14),%r11
+ movq 64(%r14),%r12
+ movq 96(%r14),%r13
+ leaq 128(%r14),%r14
+ subl $1,%ecx
+ jnz .Lcbc_prefetch_te
+ leaq -2304(%r14),%r14
+
+ cmpq $0,%rbx
+ je .LFAST_DECRYPT
+
+
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+.align 4
+.Lcbc_fast_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_encrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ movq %r10,40(%rsp)
+ jnz .Lcbc_fast_enc_loop
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_fast_cleanup
+
+
+.align 16
+.LFAST_DECRYPT:
+ cmpq %r8,%r9
+ je .Lcbc_fast_dec_in_place
+
+ movq %rbp,64(%rsp)
+.align 4
+.Lcbc_fast_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 64(%rsp),%rbp
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0(%rbp),%eax
+ xorl 4(%rbp),%ebx
+ xorl 8(%rbp),%ecx
+ xorl 12(%rbp),%edx
+ movq %r8,%rbp
+
+ subq $16,%r10
+ movq %r10,40(%rsp)
+ movq %rbp,64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jnz .Lcbc_fast_dec_loop
+ movq 56(%rsp),%r12
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0(%r12)
+ movq %r11,8(%r12)
+ jmp .Lcbc_fast_cleanup
+
+.align 16
+.Lcbc_fast_dec_in_place:
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0+64(%rsp)
+ movq %r11,8+64(%rsp)
+.align 4
+.Lcbc_fast_dec_in_place_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jz .Lcbc_fast_dec_in_place_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ movq %r10,40(%rsp)
+ jmp .Lcbc_fast_dec_in_place_loop
+.Lcbc_fast_dec_in_place_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+.align 4
+.Lcbc_fast_cleanup:
+ cmpl $0,80+240(%rsp)
+ leaq 80(%rsp),%rdi
+ je .Lcbc_exit
+ movl $30,%ecx
+ xorq %rax,%rax
+.long 0x90AB48F3
+
+ jmp .Lcbc_exit
+
+
+.align 16
+.Lcbc_slow_prologue:
+
+ leaq -88(%rsp),%rbp
+ andq $-64,%rbp
+
+ leaq -88-63(%rcx),%r10
+ subq %rbp,%r10
+ negq %r10
+ andq $960,%r10
+ subq %r10,%rbp
+
+ xchgq %rsp,%rbp
+
+ movq %rbp,16(%rsp)
+.Lcbc_slow_body:
+
+
+
+
+ movq %r8,56(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+ movq %rdx,%r10
+
+ movl 240(%r15),%eax
+ movq %r15,0(%rsp)
+ shll $4,%eax
+ leaq (%r15,%rax,1),%rax
+ movq %rax,8(%rsp)
+
+
+ leaq 2048(%r14),%r14
+ leaq 768-8(%rsp),%rax
+ subq %r14,%rax
+ andq $768,%rax
+ leaq (%r14,%rax,1),%r14
+
+ cmpq $0,%rbx
+ je .LSLOW_DECRYPT
+
+
+ testq $-16,%r10
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+ jz .Lcbc_slow_enc_tail
+
+.align 4
+.Lcbc_slow_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ jnz .Lcbc_slow_enc_loop
+ testq $15,%r10
+ jnz .Lcbc_slow_enc_tail
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_enc_tail:
+ movq %rax,%r11
+ movq %rcx,%r12
+ movq %r10,%rcx
+ movq %r8,%rsi
+ movq %r9,%rdi
+.long 0x9066A4F3
+ movq $16,%rcx
+ subq %r10,%rcx
+ xorq %rax,%rax
+.long 0x9066AAF3
+ movq %r9,%r8
+ movq $16,%r10
+ movq %r11,%rax
+ movq %r12,%rcx
+ jmp .Lcbc_slow_enc_loop
+
+.align 16
+.LSLOW_DECRYPT:
+ shrq $3,%rax
+ addq %rax,%r14
+
+ movq 0(%rbp),%r11
+ movq 8(%rbp),%r12
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+.align 4
+.Lcbc_slow_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jc .Lcbc_slow_dec_partial
+ jz .Lcbc_slow_dec_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jmp .Lcbc_slow_dec_loop
+.Lcbc_slow_dec_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_dec_partial:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0+64(%rsp)
+ movl %ebx,4+64(%rsp)
+ movl %ecx,8+64(%rsp)
+ movl %edx,12+64(%rsp)
+
+ movq %r9,%rdi
+ leaq 64(%rsp),%rsi
+ leaq 16(%r10),%rcx
+.long 0x9066A4F3
+ jmp .Lcbc_exit
+
+.align 16
+.Lcbc_exit:
+ movq 16(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lcbc_popfq:
+ popfq
+.Lcbc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+.align 64
+.LAES_Te:
+.long 0xa56363c6,0xa56363c6
+.long 0x847c7cf8,0x847c7cf8
+.long 0x997777ee,0x997777ee
+.long 0x8d7b7bf6,0x8d7b7bf6
+.long 0x0df2f2ff,0x0df2f2ff
+.long 0xbd6b6bd6,0xbd6b6bd6
+.long 0xb16f6fde,0xb16f6fde
+.long 0x54c5c591,0x54c5c591
+.long 0x50303060,0x50303060
+.long 0x03010102,0x03010102
+.long 0xa96767ce,0xa96767ce
+.long 0x7d2b2b56,0x7d2b2b56
+.long 0x19fefee7,0x19fefee7
+.long 0x62d7d7b5,0x62d7d7b5
+.long 0xe6abab4d,0xe6abab4d
+.long 0x9a7676ec,0x9a7676ec
+.long 0x45caca8f,0x45caca8f
+.long 0x9d82821f,0x9d82821f
+.long 0x40c9c989,0x40c9c989
+.long 0x877d7dfa,0x877d7dfa
+.long 0x15fafaef,0x15fafaef
+.long 0xeb5959b2,0xeb5959b2
+.long 0xc947478e,0xc947478e
+.long 0x0bf0f0fb,0x0bf0f0fb
+.long 0xecadad41,0xecadad41
+.long 0x67d4d4b3,0x67d4d4b3
+.long 0xfda2a25f,0xfda2a25f
+.long 0xeaafaf45,0xeaafaf45
+.long 0xbf9c9c23,0xbf9c9c23
+.long 0xf7a4a453,0xf7a4a453
+.long 0x967272e4,0x967272e4
+.long 0x5bc0c09b,0x5bc0c09b
+.long 0xc2b7b775,0xc2b7b775
+.long 0x1cfdfde1,0x1cfdfde1
+.long 0xae93933d,0xae93933d
+.long 0x6a26264c,0x6a26264c
+.long 0x5a36366c,0x5a36366c
+.long 0x413f3f7e,0x413f3f7e
+.long 0x02f7f7f5,0x02f7f7f5
+.long 0x4fcccc83,0x4fcccc83
+.long 0x5c343468,0x5c343468
+.long 0xf4a5a551,0xf4a5a551
+.long 0x34e5e5d1,0x34e5e5d1
+.long 0x08f1f1f9,0x08f1f1f9
+.long 0x937171e2,0x937171e2
+.long 0x73d8d8ab,0x73d8d8ab
+.long 0x53313162,0x53313162
+.long 0x3f15152a,0x3f15152a
+.long 0x0c040408,0x0c040408
+.long 0x52c7c795,0x52c7c795
+.long 0x65232346,0x65232346
+.long 0x5ec3c39d,0x5ec3c39d
+.long 0x28181830,0x28181830
+.long 0xa1969637,0xa1969637
+.long 0x0f05050a,0x0f05050a
+.long 0xb59a9a2f,0xb59a9a2f
+.long 0x0907070e,0x0907070e
+.long 0x36121224,0x36121224
+.long 0x9b80801b,0x9b80801b
+.long 0x3de2e2df,0x3de2e2df
+.long 0x26ebebcd,0x26ebebcd
+.long 0x6927274e,0x6927274e
+.long 0xcdb2b27f,0xcdb2b27f
+.long 0x9f7575ea,0x9f7575ea
+.long 0x1b090912,0x1b090912
+.long 0x9e83831d,0x9e83831d
+.long 0x742c2c58,0x742c2c58
+.long 0x2e1a1a34,0x2e1a1a34
+.long 0x2d1b1b36,0x2d1b1b36
+.long 0xb26e6edc,0xb26e6edc
+.long 0xee5a5ab4,0xee5a5ab4
+.long 0xfba0a05b,0xfba0a05b
+.long 0xf65252a4,0xf65252a4
+.long 0x4d3b3b76,0x4d3b3b76
+.long 0x61d6d6b7,0x61d6d6b7
+.long 0xceb3b37d,0xceb3b37d
+.long 0x7b292952,0x7b292952
+.long 0x3ee3e3dd,0x3ee3e3dd
+.long 0x712f2f5e,0x712f2f5e
+.long 0x97848413,0x97848413
+.long 0xf55353a6,0xf55353a6
+.long 0x68d1d1b9,0x68d1d1b9
+.long 0x00000000,0x00000000
+.long 0x2cededc1,0x2cededc1
+.long 0x60202040,0x60202040
+.long 0x1ffcfce3,0x1ffcfce3
+.long 0xc8b1b179,0xc8b1b179
+.long 0xed5b5bb6,0xed5b5bb6
+.long 0xbe6a6ad4,0xbe6a6ad4
+.long 0x46cbcb8d,0x46cbcb8d
+.long 0xd9bebe67,0xd9bebe67
+.long 0x4b393972,0x4b393972
+.long 0xde4a4a94,0xde4a4a94
+.long 0xd44c4c98,0xd44c4c98
+.long 0xe85858b0,0xe85858b0
+.long 0x4acfcf85,0x4acfcf85
+.long 0x6bd0d0bb,0x6bd0d0bb
+.long 0x2aefefc5,0x2aefefc5
+.long 0xe5aaaa4f,0xe5aaaa4f
+.long 0x16fbfbed,0x16fbfbed
+.long 0xc5434386,0xc5434386
+.long 0xd74d4d9a,0xd74d4d9a
+.long 0x55333366,0x55333366
+.long 0x94858511,0x94858511
+.long 0xcf45458a,0xcf45458a
+.long 0x10f9f9e9,0x10f9f9e9
+.long 0x06020204,0x06020204
+.long 0x817f7ffe,0x817f7ffe
+.long 0xf05050a0,0xf05050a0
+.long 0x443c3c78,0x443c3c78
+.long 0xba9f9f25,0xba9f9f25
+.long 0xe3a8a84b,0xe3a8a84b
+.long 0xf35151a2,0xf35151a2
+.long 0xfea3a35d,0xfea3a35d
+.long 0xc0404080,0xc0404080
+.long 0x8a8f8f05,0x8a8f8f05
+.long 0xad92923f,0xad92923f
+.long 0xbc9d9d21,0xbc9d9d21
+.long 0x48383870,0x48383870
+.long 0x04f5f5f1,0x04f5f5f1
+.long 0xdfbcbc63,0xdfbcbc63
+.long 0xc1b6b677,0xc1b6b677
+.long 0x75dadaaf,0x75dadaaf
+.long 0x63212142,0x63212142
+.long 0x30101020,0x30101020
+.long 0x1affffe5,0x1affffe5
+.long 0x0ef3f3fd,0x0ef3f3fd
+.long 0x6dd2d2bf,0x6dd2d2bf
+.long 0x4ccdcd81,0x4ccdcd81
+.long 0x140c0c18,0x140c0c18
+.long 0x35131326,0x35131326
+.long 0x2fececc3,0x2fececc3
+.long 0xe15f5fbe,0xe15f5fbe
+.long 0xa2979735,0xa2979735
+.long 0xcc444488,0xcc444488
+.long 0x3917172e,0x3917172e
+.long 0x57c4c493,0x57c4c493
+.long 0xf2a7a755,0xf2a7a755
+.long 0x827e7efc,0x827e7efc
+.long 0x473d3d7a,0x473d3d7a
+.long 0xac6464c8,0xac6464c8
+.long 0xe75d5dba,0xe75d5dba
+.long 0x2b191932,0x2b191932
+.long 0x957373e6,0x957373e6
+.long 0xa06060c0,0xa06060c0
+.long 0x98818119,0x98818119
+.long 0xd14f4f9e,0xd14f4f9e
+.long 0x7fdcdca3,0x7fdcdca3
+.long 0x66222244,0x66222244
+.long 0x7e2a2a54,0x7e2a2a54
+.long 0xab90903b,0xab90903b
+.long 0x8388880b,0x8388880b
+.long 0xca46468c,0xca46468c
+.long 0x29eeeec7,0x29eeeec7
+.long 0xd3b8b86b,0xd3b8b86b
+.long 0x3c141428,0x3c141428
+.long 0x79dedea7,0x79dedea7
+.long 0xe25e5ebc,0xe25e5ebc
+.long 0x1d0b0b16,0x1d0b0b16
+.long 0x76dbdbad,0x76dbdbad
+.long 0x3be0e0db,0x3be0e0db
+.long 0x56323264,0x56323264
+.long 0x4e3a3a74,0x4e3a3a74
+.long 0x1e0a0a14,0x1e0a0a14
+.long 0xdb494992,0xdb494992
+.long 0x0a06060c,0x0a06060c
+.long 0x6c242448,0x6c242448
+.long 0xe45c5cb8,0xe45c5cb8
+.long 0x5dc2c29f,0x5dc2c29f
+.long 0x6ed3d3bd,0x6ed3d3bd
+.long 0xefacac43,0xefacac43
+.long 0xa66262c4,0xa66262c4
+.long 0xa8919139,0xa8919139
+.long 0xa4959531,0xa4959531
+.long 0x37e4e4d3,0x37e4e4d3
+.long 0x8b7979f2,0x8b7979f2
+.long 0x32e7e7d5,0x32e7e7d5
+.long 0x43c8c88b,0x43c8c88b
+.long 0x5937376e,0x5937376e
+.long 0xb76d6dda,0xb76d6dda
+.long 0x8c8d8d01,0x8c8d8d01
+.long 0x64d5d5b1,0x64d5d5b1
+.long 0xd24e4e9c,0xd24e4e9c
+.long 0xe0a9a949,0xe0a9a949
+.long 0xb46c6cd8,0xb46c6cd8
+.long 0xfa5656ac,0xfa5656ac
+.long 0x07f4f4f3,0x07f4f4f3
+.long 0x25eaeacf,0x25eaeacf
+.long 0xaf6565ca,0xaf6565ca
+.long 0x8e7a7af4,0x8e7a7af4
+.long 0xe9aeae47,0xe9aeae47
+.long 0x18080810,0x18080810
+.long 0xd5baba6f,0xd5baba6f
+.long 0x887878f0,0x887878f0
+.long 0x6f25254a,0x6f25254a
+.long 0x722e2e5c,0x722e2e5c
+.long 0x241c1c38,0x241c1c38
+.long 0xf1a6a657,0xf1a6a657
+.long 0xc7b4b473,0xc7b4b473
+.long 0x51c6c697,0x51c6c697
+.long 0x23e8e8cb,0x23e8e8cb
+.long 0x7cdddda1,0x7cdddda1
+.long 0x9c7474e8,0x9c7474e8
+.long 0x211f1f3e,0x211f1f3e
+.long 0xdd4b4b96,0xdd4b4b96
+.long 0xdcbdbd61,0xdcbdbd61
+.long 0x868b8b0d,0x868b8b0d
+.long 0x858a8a0f,0x858a8a0f
+.long 0x907070e0,0x907070e0
+.long 0x423e3e7c,0x423e3e7c
+.long 0xc4b5b571,0xc4b5b571
+.long 0xaa6666cc,0xaa6666cc
+.long 0xd8484890,0xd8484890
+.long 0x05030306,0x05030306
+.long 0x01f6f6f7,0x01f6f6f7
+.long 0x120e0e1c,0x120e0e1c
+.long 0xa36161c2,0xa36161c2
+.long 0x5f35356a,0x5f35356a
+.long 0xf95757ae,0xf95757ae
+.long 0xd0b9b969,0xd0b9b969
+.long 0x91868617,0x91868617
+.long 0x58c1c199,0x58c1c199
+.long 0x271d1d3a,0x271d1d3a
+.long 0xb99e9e27,0xb99e9e27
+.long 0x38e1e1d9,0x38e1e1d9
+.long 0x13f8f8eb,0x13f8f8eb
+.long 0xb398982b,0xb398982b
+.long 0x33111122,0x33111122
+.long 0xbb6969d2,0xbb6969d2
+.long 0x70d9d9a9,0x70d9d9a9
+.long 0x898e8e07,0x898e8e07
+.long 0xa7949433,0xa7949433
+.long 0xb69b9b2d,0xb69b9b2d
+.long 0x221e1e3c,0x221e1e3c
+.long 0x92878715,0x92878715
+.long 0x20e9e9c9,0x20e9e9c9
+.long 0x49cece87,0x49cece87
+.long 0xff5555aa,0xff5555aa
+.long 0x78282850,0x78282850
+.long 0x7adfdfa5,0x7adfdfa5
+.long 0x8f8c8c03,0x8f8c8c03
+.long 0xf8a1a159,0xf8a1a159
+.long 0x80898909,0x80898909
+.long 0x170d0d1a,0x170d0d1a
+.long 0xdabfbf65,0xdabfbf65
+.long 0x31e6e6d7,0x31e6e6d7
+.long 0xc6424284,0xc6424284
+.long 0xb86868d0,0xb86868d0
+.long 0xc3414182,0xc3414182
+.long 0xb0999929,0xb0999929
+.long 0x772d2d5a,0x772d2d5a
+.long 0x110f0f1e,0x110f0f1e
+.long 0xcbb0b07b,0xcbb0b07b
+.long 0xfc5454a8,0xfc5454a8
+.long 0xd6bbbb6d,0xd6bbbb6d
+.long 0x3a16162c,0x3a16162c
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
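+# AES key-schedule round constants (rcon: 0x01..0x36), followed by the
+# 0x80808080/0xfefefefe/0x1b1b1b1b masks used to double four GF(2^8)
+# elements at once (the vectorized "xtime" trick).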
+.long 0x00000001, 0x00000002, 0x00000004, 0x00000008
+.long 0x00000010, 0x00000020, 0x00000040, 0x00000080
+.long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
+.long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
+.align 64
+.LAES_Td:
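+# AES decryption table (Td). As in the tables above, every 32-bit entry
+# is stored twice back to back, which lets byte-offset loads within an
+# 8-byte slot fetch rotated variants of the same table word.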
+.long 0x50a7f451,0x50a7f451
+.long 0x5365417e,0x5365417e
+.long 0xc3a4171a,0xc3a4171a
+.long 0x965e273a,0x965e273a
+.long 0xcb6bab3b,0xcb6bab3b
+.long 0xf1459d1f,0xf1459d1f
+.long 0xab58faac,0xab58faac
+.long 0x9303e34b,0x9303e34b
+.long 0x55fa3020,0x55fa3020
+.long 0xf66d76ad,0xf66d76ad
+.long 0x9176cc88,0x9176cc88
+.long 0x254c02f5,0x254c02f5
+.long 0xfcd7e54f,0xfcd7e54f
+.long 0xd7cb2ac5,0xd7cb2ac5
+.long 0x80443526,0x80443526
+.long 0x8fa362b5,0x8fa362b5
+.long 0x495ab1de,0x495ab1de
+.long 0x671bba25,0x671bba25
+.long 0x980eea45,0x980eea45
+.long 0xe1c0fe5d,0xe1c0fe5d
+.long 0x02752fc3,0x02752fc3
+.long 0x12f04c81,0x12f04c81
+.long 0xa397468d,0xa397468d
+.long 0xc6f9d36b,0xc6f9d36b
+.long 0xe75f8f03,0xe75f8f03
+.long 0x959c9215,0x959c9215
+.long 0xeb7a6dbf,0xeb7a6dbf
+.long 0xda595295,0xda595295
+.long 0x2d83bed4,0x2d83bed4
+.long 0xd3217458,0xd3217458
+.long 0x2969e049,0x2969e049
+.long 0x44c8c98e,0x44c8c98e
+.long 0x6a89c275,0x6a89c275
+.long 0x78798ef4,0x78798ef4
+.long 0x6b3e5899,0x6b3e5899
+.long 0xdd71b927,0xdd71b927
+.long 0xb64fe1be,0xb64fe1be
+.long 0x17ad88f0,0x17ad88f0
+.long 0x66ac20c9,0x66ac20c9
+.long 0xb43ace7d,0xb43ace7d
+.long 0x184adf63,0x184adf63
+.long 0x82311ae5,0x82311ae5
+.long 0x60335197,0x60335197
+.long 0x457f5362,0x457f5362
+.long 0xe07764b1,0xe07764b1
+.long 0x84ae6bbb,0x84ae6bbb
+.long 0x1ca081fe,0x1ca081fe
+.long 0x942b08f9,0x942b08f9
+.long 0x58684870,0x58684870
+.long 0x19fd458f,0x19fd458f
+.long 0x876cde94,0x876cde94
+.long 0xb7f87b52,0xb7f87b52
+.long 0x23d373ab,0x23d373ab
+.long 0xe2024b72,0xe2024b72
+.long 0x578f1fe3,0x578f1fe3
+.long 0x2aab5566,0x2aab5566
+.long 0x0728ebb2,0x0728ebb2
+.long 0x03c2b52f,0x03c2b52f
+.long 0x9a7bc586,0x9a7bc586
+.long 0xa50837d3,0xa50837d3
+.long 0xf2872830,0xf2872830
+.long 0xb2a5bf23,0xb2a5bf23
+.long 0xba6a0302,0xba6a0302
+.long 0x5c8216ed,0x5c8216ed
+.long 0x2b1ccf8a,0x2b1ccf8a
+.long 0x92b479a7,0x92b479a7
+.long 0xf0f207f3,0xf0f207f3
+.long 0xa1e2694e,0xa1e2694e
+.long 0xcdf4da65,0xcdf4da65
+.long 0xd5be0506,0xd5be0506
+.long 0x1f6234d1,0x1f6234d1
+.long 0x8afea6c4,0x8afea6c4
+.long 0x9d532e34,0x9d532e34
+.long 0xa055f3a2,0xa055f3a2
+.long 0x32e18a05,0x32e18a05
+.long 0x75ebf6a4,0x75ebf6a4
+.long 0x39ec830b,0x39ec830b
+.long 0xaaef6040,0xaaef6040
+.long 0x069f715e,0x069f715e
+.long 0x51106ebd,0x51106ebd
+.long 0xf98a213e,0xf98a213e
+.long 0x3d06dd96,0x3d06dd96
+.long 0xae053edd,0xae053edd
+.long 0x46bde64d,0x46bde64d
+.long 0xb58d5491,0xb58d5491
+.long 0x055dc471,0x055dc471
+.long 0x6fd40604,0x6fd40604
+.long 0xff155060,0xff155060
+.long 0x24fb9819,0x24fb9819
+.long 0x97e9bdd6,0x97e9bdd6
+.long 0xcc434089,0xcc434089
+.long 0x779ed967,0x779ed967
+.long 0xbd42e8b0,0xbd42e8b0
+.long 0x888b8907,0x888b8907
+.long 0x385b19e7,0x385b19e7
+.long 0xdbeec879,0xdbeec879
+.long 0x470a7ca1,0x470a7ca1
+.long 0xe90f427c,0xe90f427c
+.long 0xc91e84f8,0xc91e84f8
+.long 0x00000000,0x00000000
+.long 0x83868009,0x83868009
+.long 0x48ed2b32,0x48ed2b32
+.long 0xac70111e,0xac70111e
+.long 0x4e725a6c,0x4e725a6c
+.long 0xfbff0efd,0xfbff0efd
+.long 0x5638850f,0x5638850f
+.long 0x1ed5ae3d,0x1ed5ae3d
+.long 0x27392d36,0x27392d36
+.long 0x64d90f0a,0x64d90f0a
+.long 0x21a65c68,0x21a65c68
+.long 0xd1545b9b,0xd1545b9b
+.long 0x3a2e3624,0x3a2e3624
+.long 0xb1670a0c,0xb1670a0c
+.long 0x0fe75793,0x0fe75793
+.long 0xd296eeb4,0xd296eeb4
+.long 0x9e919b1b,0x9e919b1b
+.long 0x4fc5c080,0x4fc5c080
+.long 0xa220dc61,0xa220dc61
+.long 0x694b775a,0x694b775a
+.long 0x161a121c,0x161a121c
+.long 0x0aba93e2,0x0aba93e2
+.long 0xe52aa0c0,0xe52aa0c0
+.long 0x43e0223c,0x43e0223c
+.long 0x1d171b12,0x1d171b12
+.long 0x0b0d090e,0x0b0d090e
+.long 0xadc78bf2,0xadc78bf2
+.long 0xb9a8b62d,0xb9a8b62d
+.long 0xc8a91e14,0xc8a91e14
+.long 0x8519f157,0x8519f157
+.long 0x4c0775af,0x4c0775af
+.long 0xbbdd99ee,0xbbdd99ee
+.long 0xfd607fa3,0xfd607fa3
+.long 0x9f2601f7,0x9f2601f7
+.long 0xbcf5725c,0xbcf5725c
+.long 0xc53b6644,0xc53b6644
+.long 0x347efb5b,0x347efb5b
+.long 0x7629438b,0x7629438b
+.long 0xdcc623cb,0xdcc623cb
+.long 0x68fcedb6,0x68fcedb6
+.long 0x63f1e4b8,0x63f1e4b8
+.long 0xcadc31d7,0xcadc31d7
+.long 0x10856342,0x10856342
+.long 0x40229713,0x40229713
+.long 0x2011c684,0x2011c684
+.long 0x7d244a85,0x7d244a85
+.long 0xf83dbbd2,0xf83dbbd2
+.long 0x1132f9ae,0x1132f9ae
+.long 0x6da129c7,0x6da129c7
+.long 0x4b2f9e1d,0x4b2f9e1d
+.long 0xf330b2dc,0xf330b2dc
+.long 0xec52860d,0xec52860d
+.long 0xd0e3c177,0xd0e3c177
+.long 0x6c16b32b,0x6c16b32b
+.long 0x99b970a9,0x99b970a9
+.long 0xfa489411,0xfa489411
+.long 0x2264e947,0x2264e947
+.long 0xc48cfca8,0xc48cfca8
+.long 0x1a3ff0a0,0x1a3ff0a0
+.long 0xd82c7d56,0xd82c7d56
+.long 0xef903322,0xef903322
+.long 0xc74e4987,0xc74e4987
+.long 0xc1d138d9,0xc1d138d9
+.long 0xfea2ca8c,0xfea2ca8c
+.long 0x360bd498,0x360bd498
+.long 0xcf81f5a6,0xcf81f5a6
+.long 0x28de7aa5,0x28de7aa5
+.long 0x268eb7da,0x268eb7da
+.long 0xa4bfad3f,0xa4bfad3f
+.long 0xe49d3a2c,0xe49d3a2c
+.long 0x0d927850,0x0d927850
+.long 0x9bcc5f6a,0x9bcc5f6a
+.long 0x62467e54,0x62467e54
+.long 0xc2138df6,0xc2138df6
+.long 0xe8b8d890,0xe8b8d890
+.long 0x5ef7392e,0x5ef7392e
+.long 0xf5afc382,0xf5afc382
+.long 0xbe805d9f,0xbe805d9f
+.long 0x7c93d069,0x7c93d069
+.long 0xa92dd56f,0xa92dd56f
+.long 0xb31225cf,0xb31225cf
+.long 0x3b99acc8,0x3b99acc8
+.long 0xa77d1810,0xa77d1810
+.long 0x6e639ce8,0x6e639ce8
+.long 0x7bbb3bdb,0x7bbb3bdb
+.long 0x097826cd,0x097826cd
+.long 0xf418596e,0xf418596e
+.long 0x01b79aec,0x01b79aec
+.long 0xa89a4f83,0xa89a4f83
+.long 0x656e95e6,0x656e95e6
+.long 0x7ee6ffaa,0x7ee6ffaa
+.long 0x08cfbc21,0x08cfbc21
+.long 0xe6e815ef,0xe6e815ef
+.long 0xd99be7ba,0xd99be7ba
+.long 0xce366f4a,0xce366f4a
+.long 0xd4099fea,0xd4099fea
+.long 0xd67cb029,0xd67cb029
+.long 0xafb2a431,0xafb2a431
+.long 0x31233f2a,0x31233f2a
+.long 0x3094a5c6,0x3094a5c6
+.long 0xc066a235,0xc066a235
+.long 0x37bc4e74,0x37bc4e74
+.long 0xa6ca82fc,0xa6ca82fc
+.long 0xb0d090e0,0xb0d090e0
+.long 0x15d8a733,0x15d8a733
+.long 0x4a9804f1,0x4a9804f1
+.long 0xf7daec41,0xf7daec41
+.long 0x0e50cd7f,0x0e50cd7f
+.long 0x2ff69117,0x2ff69117
+.long 0x8dd64d76,0x8dd64d76
+.long 0x4db0ef43,0x4db0ef43
+.long 0x544daacc,0x544daacc
+.long 0xdf0496e4,0xdf0496e4
+.long 0xe3b5d19e,0xe3b5d19e
+.long 0x1b886a4c,0x1b886a4c
+.long 0xb81f2cc1,0xb81f2cc1
+.long 0x7f516546,0x7f516546
+.long 0x04ea5e9d,0x04ea5e9d
+.long 0x5d358c01,0x5d358c01
+.long 0x737487fa,0x737487fa
+.long 0x2e410bfb,0x2e410bfb
+.long 0x5a1d67b3,0x5a1d67b3
+.long 0x52d2db92,0x52d2db92
+.long 0x335610e9,0x335610e9
+.long 0x1347d66d,0x1347d66d
+.long 0x8c61d79a,0x8c61d79a
+.long 0x7a0ca137,0x7a0ca137
+.long 0x8e14f859,0x8e14f859
+.long 0x893c13eb,0x893c13eb
+.long 0xee27a9ce,0xee27a9ce
+.long 0x35c961b7,0x35c961b7
+.long 0xede51ce1,0xede51ce1
+.long 0x3cb1477a,0x3cb1477a
+.long 0x59dfd29c,0x59dfd29c
+.long 0x3f73f255,0x3f73f255
+.long 0x79ce1418,0x79ce1418
+.long 0xbf37c773,0xbf37c773
+.long 0xeacdf753,0xeacdf753
+.long 0x5baafd5f,0x5baafd5f
+.long 0x146f3ddf,0x146f3ddf
+.long 0x86db4478,0x86db4478
+.long 0x81f3afca,0x81f3afca
+.long 0x3ec468b9,0x3ec468b9
+.long 0x2c342438,0x2c342438
+.long 0x5f40a3c2,0x5f40a3c2
+.long 0x72c31d16,0x72c31d16
+.long 0x0c25e2bc,0x0c25e2bc
+.long 0x8b493c28,0x8b493c28
+.long 0x41950dff,0x41950dff
+.long 0x7101a839,0x7101a839
+.long 0xdeb30c08,0xdeb30c08
+.long 0x9ce4b4d8,0x9ce4b4d8
+.long 0x90c15664,0x90c15664
+.long 0x6184cb7b,0x6184cb7b
+.long 0x70b632d5,0x70b632d5
+.long 0x745c6c48,0x745c6c48
+.long 0x4257b8d0,0x4257b8d0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index a545e89..34cbb5d 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl
@@ -36,7 +36,8 @@
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$verticalspin=1; # unlike 32-bit version $verticalspin performs
# ~15% better on both AMD and Intel cores
@@ -588,6 +589,9 @@ ()
.globl AES_encrypt
.type AES_encrypt,\@function,3
.align 16
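+# asm_AES_encrypt is a second, non-exported entry point to the same
+# code; .hidden keeps it out of the dynamic symbol table, so other
+# modules can call the assembly implementation directly even if the
+# public AES_encrypt symbol is interposed.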
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
AES_encrypt:
push %rbx
push %rbp
@@ -1184,6 +1188,9 @@ ()
.globl AES_decrypt
.type AES_decrypt,\@function,3
.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
AES_decrypt:
push %rbx
push %rbp
@@ -1277,13 +1284,13 @@ ()
___
}
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function,3
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function,3
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
push %rbx
push %rbp
push %r12 # redundant, but allows to share
@@ -1304,7 +1311,7 @@ ()
add \$56,%rsp
.Lenc_key_epilogue:
ret
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align 16
@@ -1547,13 +1554,13 @@ ()
___
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function,3
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function,3
.align 16
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
push %rbx
push %rbp
push %r12
@@ -1622,7 +1629,7 @@ ()
add \$56,%rsp
.Ldec_key_epilogue:
ret
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
@@ -1648,6 +1655,9 @@ ()
.type AES_cbc_encrypt,\@function,6
.align 16
.extern OPENSSL_ia32cap_P
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
AES_cbc_encrypt:
cmp \$0,%rdx # check length
je .Lcbc_epilogue
@@ -2766,13 +2776,13 @@ ()
.rva .LSEH_end_AES_decrypt
.rva .LSEH_info_AES_decrypt
- .rva .LSEH_begin_AES_set_encrypt_key
- .rva .LSEH_end_AES_set_encrypt_key
- .rva .LSEH_info_AES_set_encrypt_key
+ .rva .LSEH_begin_private_AES_set_encrypt_key
+ .rva .LSEH_end_private_AES_set_encrypt_key
+ .rva .LSEH_info_private_AES_set_encrypt_key
- .rva .LSEH_begin_AES_set_decrypt_key
- .rva .LSEH_end_AES_set_decrypt_key
- .rva .LSEH_info_AES_set_decrypt_key
+ .rva .LSEH_begin_private_AES_set_decrypt_key
+ .rva .LSEH_end_private_AES_set_decrypt_key
+ .rva .LSEH_info_private_AES_set_decrypt_key
.rva .LSEH_begin_AES_cbc_encrypt
.rva .LSEH_end_AES_cbc_encrypt
@@ -2788,11 +2798,11 @@ ()
.byte 9,0,0,0
.rva block_se_handler
.rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
+.LSEH_info_private_AES_set_encrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
+.LSEH_info_private_AES_set_decrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.S b/crypto/aes/asm/aesni-sha1-x86_64.S
new file mode 100644
index 0000000..32fd600
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha1-x86_64.S
@@ -0,0 +1,1396 @@
+.text
+
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,@function
+.align 16
+aesni_cbc_sha1_enc:
+
+ movl OPENSSL_ia32cap_P+0(%rip),%r10d
+ movl OPENSSL_ia32cap_P+4(%rip),%r11d
+ jmp aesni_cbc_sha1_enc_ssse3
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
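+# Note: this pre-generated file carries only the SSSE3 code path, so the
+# dispatcher above reads OPENSSL_ia32cap_P but unconditionally jumps to
+# aesni_cbc_sha1_enc_ssse3; per the generator script, the caller is
+# expected to have verified the SSSE3 and AES-NI feature bits already.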
+.type aesni_cbc_sha1_enc_ssse3,@function
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ movq 8(%rsp),%r10
+
+
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -104(%rsp),%rsp
+
+
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqu (%r8),%xmm11
+ movq %r8,88(%rsp)
+ shlq $6,%r14
+ subq %r12,%r13
+ movl 240(%r15),%r8d
+ addq %r10,%r14
+
+ leaq K_XX_XX(%rip),%r11
+ movl 0(%r9),%eax
+ movl 4(%r9),%ebx
+ movl 8(%r9),%ecx
+ movl 12(%r9),%edx
+ movl %ebx,%esi
+ movl 16(%r9),%ebp
+
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm0
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ movups (%r15),%xmm13
+ movups 16(%r15),%xmm14
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ movdqa %xmm1,%xmm4
+ addl 0(%rsp),%ebp
+ movups 0(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm3,%xmm8
+.byte 102,15,58,15,224,8
+ movl %eax,%edi
+ roll $5,%eax
+ paddd %xmm3,%xmm9
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrldq $4,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ pxor %xmm0,%xmm4
+ rorl $2,%ebx
+ addl %esi,%ebp
+ pxor %xmm2,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pxor %xmm8,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm9,48(%rsp)
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %ebp,%edx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm4,%xmm8
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ roll $5,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrld $31,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ movdqa %xmm10,%xmm9
+ rorl $7,%ebp
+ addl %esi,%ecx
+ psrld $30,%xmm10
+ por %xmm8,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa 0(%r11),%xmm10
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm4
+ rorl $7,%edx
+ addl %edi,%ebx
+ movdqa %xmm2,%xmm5
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ movdqa %xmm4,%xmm9
+.byte 102,15,58,15,233,8
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm4,%xmm10
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrldq $4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ pxor %xmm1,%xmm5
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm3,%xmm9
+ addl 20(%rsp),%ebp
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pxor %xmm9,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ movdqa %xmm5,%xmm8
+ movdqa %xmm5,%xmm9
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ roll $5,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ psrld $31,%xmm9
+ xorl %ecx,%esi
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %ebp,%edx
+ movdqa %xmm8,%xmm10
+ rorl $7,%eax
+ addl %esi,%edx
+ psrld $30,%xmm8
+ por %xmm9,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ roll $5,%edx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ movdqa 16(%r11),%xmm8
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ pxor %xmm10,%xmm5
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movdqa %xmm3,%xmm6
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ movdqa %xmm5,%xmm10
+.byte 102,15,58,15,242,8
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ paddd %xmm5,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ psrldq $4,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ pxor %xmm2,%xmm6
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm4,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ pxor %xmm10,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ movdqa %xmm6,%xmm9
+ movdqa %xmm6,%xmm10
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ roll $5,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrld $31,%xmm10
+ xorl %edx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ psrld $30,%xmm9
+ por %xmm10,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %ebp,%edx
+ pxor %xmm8,%xmm6
+ rorl $7,%eax
+ addl %edi,%edx
+ movdqa %xmm4,%xmm7
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm8
+.byte 102,15,58,15,251,8
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm6,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrldq $4,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ pxor %xmm3,%xmm7
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm5,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ pxor %xmm8,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ movdqa %xmm7,%xmm10
+ movdqa %xmm7,%xmm8
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ roll $5,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrld $31,%xmm8
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ movdqa %xmm10,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ psrld $30,%xmm10
+ por %xmm8,%xmm7
+ addl 60(%rsp),%ebp
+ cmpl $11,%r8d
+ jb .Laesenclast1
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast1
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast1:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ pxor %xmm9,%xmm7
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movdqa %xmm7,%xmm9
+ addl 0(%rsp),%edx
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,206,8
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm7,%xmm10
+ xorl %ecx,%esi
+ movups 16(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,0(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ roll $5,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ pslld $2,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ psrld $30,%xmm9
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ por %xmm9,%xmm0
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ movdqa %xmm0,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 16(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,215,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm8,%xmm9
+ paddd %xmm0,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ movdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm10,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ movdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 32(%rsp),%eax
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,192,8
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ movdqa 32(%r11),%xmm10
+ paddd %xmm1,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ movdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ pslld $2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ psrld $30,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ por %xmm8,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ movdqa %xmm2,%xmm9
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 48(%rsp),%ebx
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,201,8
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm2,%xmm10
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ movdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ pslld $2,%xmm3
+ addl 56(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ psrld $30,%xmm9
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ por %xmm9,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ movdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 0(%rsp),%ecx
+ pxor %xmm0,%xmm4
+.byte 102,68,15,58,15,210,8
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ movdqa %xmm8,%xmm9
+ paddd %xmm3,%xmm8
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ pslld $2,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ psrld $30,%xmm10
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ por %xmm10,%xmm4
+ addl 12(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movdqa %xmm4,%xmm8
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 16(%rsp),%edx
+ pxor %xmm1,%xmm5
+.byte 102,68,15,58,15,195,8
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm4,%xmm9
+ rorl $7,%eax
+ addl %esi,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ movdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast2
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast2
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast2:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ pslld $2,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ psrld $30,%xmm8
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ por %xmm8,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ movdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ecx,%edi
+ movups 32(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,16(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ pxor %xmm2,%xmm6
+.byte 102,68,15,58,15,204,8
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ pxor %xmm7,%xmm6
+ andl %ecx,%esi
+ rorl $7,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm5,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ pxor %xmm9,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movdqa %xmm6,%xmm9
+ movdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ pslld $2,%xmm6
+ andl %ebx,%edi
+ rorl $7,%eax
+ psrld $30,%xmm9
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ por %xmm9,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ rorl $7,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%edi
+ pxor %xmm3,%xmm7
+.byte 102,68,15,58,15,213,8
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ pxor %xmm0,%xmm7
+ andl %edx,%esi
+ rorl $7,%ecx
+ movdqa 48(%r11),%xmm9
+ paddd %xmm6,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ pxor %xmm10,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movdqa %xmm7,%xmm10
+ movdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ pslld $2,%xmm7
+ andl %ecx,%edi
+ rorl $7,%ebx
+ psrld $30,%xmm10
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ por %xmm10,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%edi
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,198,8
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ pxor %xmm1,%xmm0
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ rorl $7,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm7,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ pxor %xmm8,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movdqa %xmm0,%xmm8
+ movdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ pslld $2,%xmm0
+ andl %edx,%edi
+ rorl $7,%ecx
+ psrld $30,%xmm8
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ por %xmm8,%xmm0
+ movl %ecx,%edi
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ rorl $7,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%edi
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,207,8
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ pxor %xmm2,%xmm1
+ andl %eax,%esi
+ rorl $7,%ebp
+ movdqa %xmm10,%xmm8
+ paddd %xmm0,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ pxor %xmm9,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movdqa %xmm1,%xmm9
+ movdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ pslld $2,%xmm1
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ rorl $7,%edx
+ psrld $30,%xmm9
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ por %xmm9,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ cmpl $11,%r8d
+ jb .Laesenclast3
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast3
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast3:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ rorl $7,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%edi
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,208,8
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ pxor %xmm3,%xmm2
+ andl %ebx,%esi
+ rorl $7,%eax
+ movdqa %xmm8,%xmm9
+ paddd %xmm1,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ pxor %xmm10,%xmm2
+ roll $5,%ebp
+ movups 48(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,32(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movdqa %xmm2,%xmm10
+ movdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ pslld $2,%xmm2
+ andl %eax,%edi
+ rorl $7,%ebp
+ psrld $30,%xmm10
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ por %xmm10,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ rorl $7,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ rorl $7,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ addl 48(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,193,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm10
+ paddd %xmm2,%xmm9
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ movdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm8
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm8,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ paddd %xmm3,%xmm10
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ movdqa %xmm10,48(%rsp)
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ cmpq %r14,%r10
+ je .Ldone_ssse3
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+.byte 102,15,56,0,206
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm9,%xmm0
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ movdqa %xmm0,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ psubd %xmm9,%xmm0
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+.byte 102,15,56,0,214
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm9,%xmm1
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movdqa %xmm1,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ psubd %xmm9,%xmm1
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+.byte 102,15,56,0,222
+ movl %ebp,%edi
+ roll $5,%ebp
+ paddd %xmm9,%xmm2
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ movdqa %xmm2,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ psubd %xmm9,%xmm2
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast4
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast4
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast4:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ leaq 64(%r12),%r12
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ addl 12(%r9),%edx
+ movl %eax,0(%r9)
+ addl 16(%r9),%ebp
+ movl %esi,4(%r9)
+ movl %esi,%ebx
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast5
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast5
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast5:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ movq 88(%rsp),%r8
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ movl %eax,0(%r9)
+ addl 12(%r9),%edx
+ movl %esi,4(%r9)
+ addl 16(%r9),%ebp
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ movups %xmm11,(%r8)
+ leaq 104(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.align 64
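+# SHA1 round constants K_00_19..K_60_79 (broadcast to all four lanes)
+# and, in the fifth row, the pshufb byte-swap mask: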
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+
+.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000..3c8f6c1
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1250 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibits *very* low instruction-level
+# parallelism, interleaving it with another algorithm allows the
+# processor's resources to be utilized better, for better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and the
+# AESNI code is woven into them. Below are performance numbers in
+# cycles per processed byte (less is better) for standalone AESNI-CBC
+# encrypt, for the sum of the latter and standalone SHA1, and for the
+# "stitched" subroutine:
+#
+# AES-128-CBC +SHA1 stitch gain
+# Westmere 3.77[+5.6] 9.37 6.65 +41%
+# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
+#
+# AES-192-CBC
+# Westmere 4.51 10.11 6.97 +45%
+# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
+#
+# AES-256-CBC
+# Westmere 5.25 10.85 7.25 +50%
+# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
+#
+# (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
+#	background information. The numbers in parentheses above are
+#	SSSE3 results collected on an AVX-capable CPU, i.e. they apply
+#	to OSes that don't support AVX.
+#
+# Needless to mention, it makes no sense to implement a "stitched"
+# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already fully
+# utilize available parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+# AES-128-CBC AES-192-CBC AES-256-CBC
+# Westmere 1.31 1.55 1.80
+# Sandy Bridge 0.93 1.06 1.22
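+#
+# As a rough illustration of the "stitch" (a sketch of the generated
+# code's shape, not literal output): each SHA1 round below is a handful
+# of scalar integer instructions, and every few rounds one aesenc step
+# of the CBC chain is dropped in between them, e.g.
+#
+#	addl 0(%rsp),%ebp	# SHA1 round work on %eax..%ebp
+#	...
+#	aesenc %xmm14,%xmm11	# one AES round on the current CBC block
+#	movups 32(%r15),%xmm15	# preload next round key
+#	...			# SHA1 round work continues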
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void aesni_cbc_sha1_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA_CTX *ctx,
+# const void *in0);
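+#
+# Hedged usage sketch (not part of this patch; note that, judging by the
+# "shl \$6,$len" below, length is measured in 64-byte blocks, and the
+# 7th argument is the buffer to be hashed, typically the same data that
+# is being encrypted):
+#
+#	aesni_cbc_sha1_enc(in, out, blocks, &aes_key, iv, &sha_ctx, in);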
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 16
+aesni_cbc_sha1_enc:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+ jmp aesni_cbc_sha1_enc_ssse3
+ ret
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
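+# Any otherwise-undefined sub call lands in the AUTOLOAD above and is
+# appended to $code as a literal instruction: a numeric final argument
+# gets a "$" immediate prefix, and the operands are emitted in reversed
+# (AT&T) order, so &rol("%eax",5) yields "rol $5,%eax".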
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_ssse3 # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ movdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movups ($key),$rndkey0 # $key[0]
+ movups 16($key),$rndkey[0] # forward reference
+ jmp .Loop_ssse3
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
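+# The $aesenc closure above drives the CBC-encrypt half of the stitch: $r
+# counts aesenc slots emitted so far, and ($n,$k)=($r/10,$r%10) name the
+# current 16-byte block and the slot within it. Slot 0 loads plaintext
+# block $n, folds it into $iv together with the whitening key (CBC
+# chaining) and stores the previous ciphertext block; slots 1..8 are plain
+# aesenc rounds; slot 9 finishes with aesenclast, detouring through extra
+# aesenc pairs for the longer 192/256-bit key schedules (the cmp \$11
+# test). @rndkey ping-pongs between %xmm14/%xmm15 so key loads and aesenc
+# steps can overlap.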
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
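+# The sub above vectorizes the SHA-1 message schedule for rounds 16..31,
+# W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1, four dwords at a
+# time. The newest lane would need W[t-3] from the very vector being
+# computed; the psrldq supplies only the three available dwords and the
+# final "X[0]"^=("X[0]">>96)<<<2 term patches the fourth lane afterwards.
+# The ~40 scalar instructions of four round bodies are threaded through
+# the vector code via eval(shift(@insns)) to keep both pipes busy.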
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
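+# Rounds 32..79 above use the equivalent recurrence
+# W[t] = (W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32]) <<< 2 (the 16..31 rule
+# applied to itself), whose nearest back-reference, W[t-6], is far enough
+# away that all four lanes come out in one shot: hence the single
+# pslld/psrld/por rotate by 2 and no per-lane fixup.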
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
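+# Each body_00_19 entry expands to one full SHA-1 round:
+#	e += X[j]+K + (a <<< 5) + Ch(b,c,d), Ch(b,c,d) = (b & (c ^ d)) ^ d,
+# with Ch computed by the xor/and/xor triple on @T[0], the saved copy of
+# $b. The register holding $b is rotated in place: ror 2 in the first
+# round, ror 7 afterwards, because by then it holds the previous round's
+# a<<<5 and (a<<<5)>>>7 = a<<<30, the value SHA-1 needs two rounds later.
+# The computed index $k splices the 12 &$aesenc() calls evenly across
+# these 20 rounds.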
+
+sub body_20_39 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_40_59 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
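+# body_40_59 handles the majority rounds: Maj(b,c,d) is split into the two
+# disjoint terms (b & (c ^ d)) and (c & d), which are added into $e
+# separately, reusing the same c^=d masking trick as the Ch rounds above.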
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups $iv,($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_avx,\@function,6
+.align 16
+aesni_cbc_sha1_enc_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_avx # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ vmovdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add \$112,$key # size optimization
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
+ vpaddd @Tx[1],@X[-3&7],@X[1]
+ vpaddd @Tx[1],@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey0 # $key[0]
+ vmovups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_avx
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ vmovups `16*$n`($in0),$in # load input
+ vxorps $rndkey0,$in,$in
+___
+ $code.=<<___ if ($n);
+ vmovups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ vxorps $in,$iv,$iv
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+0)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+1)-112`($key),$rndkey[0]
+ je .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+2)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+ vaesenclast $rndkey[0],$iv,$iv
+ vmovups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_avx");
+
+ unshift(@Tx,pop(@Tx));
+
+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups $iv,($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
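+
+# The four K rows above are the standard SHA-1 round constants,
+# floor(2^30*sqrt(n)) for n = 2, 3, 5, 10, each replicated across all four
+# dword lanes so a single paddd computes X[]+K; the last row is the
+# big-endian byte-swap mask fed to pshufb.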
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 96(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `104+10*16`(%rax),%rax # adjust stack pointer
+
+ mov 0(%rax),%r15
+ mov 8(%rax),%r14
+ mov 16(%rax),%r13
+ mov 24(%rax),%r12
+ mov 32(%rax),%rbp
+ mov 40(%rax),%rbx
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ssse3_handler,.-ssse3_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_end_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
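+# rex() and aesni() hand-assemble the AES-NI mnemonics for assemblers that
+# predate them. A worked example (encoding computed by hand from the rules
+# above): "aesenc %xmm14,%xmm11" needs REX 0x45 (both registers >= xmm8)
+# and ModR/M 0xc0|(14&7)|((11&7)<<3) = 0xde, so it becomes
+#	.byte	102,69,15,56,220,222	# aesenc %xmm14,%xmm11
+# with opcode byte 0xdd in place of 0xdc for aesenclast.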
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/crypto/aes/asm/aesni-x86.S b/crypto/aes/asm/aesni-x86.S
new file mode 100644
index 0000000..0766bb5
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86.S
@@ -0,0 +1,2143 @@
+.file "crypto/aes/asm/aesni-x86.s"
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+.L_aesni_encrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L000enc1_loop_1:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L000enc1_loop_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_encrypt,.-.L_aesni_encrypt_begin
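+# Note on this generated file: the ".byte 102,15,56,xx,yy" runs are AES-NI
+# instructions emitted in raw form for pre-AES-NI assemblers; the
+# 102,15,56,220,209 above is aesenc %xmm1,%xmm2 (221 = aesenclast,
+# 222 = aesdec, 223 = aesdeclast). Arguments arrive cdecl-style at
+# 4/8/12(%esp) as in, out, key, and 240(key) is the loop counter stored by
+# the AES-NI key setup (9/11/13 for 128/192/256-bit keys, judging by the
+# loop structure here), so the loop runs all middle rounds and aesenclast
+# applies the final round key.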
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+.L_aesni_decrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L001dec1_loop_2:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L001dec1_loop_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_decrypt,.-.L_aesni_decrypt_begin
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L002enc3_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+ movups (%edx),%xmm0
+ jnz .L002enc3_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L003dec3_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+ movups (%edx),%xmm0
+ jnz .L003dec3_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L004enc4_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%edx),%xmm0
+ jnz .L004enc4_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L005dec4_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%edx),%xmm0
+ jnz .L005dec4_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ jmp .L_aesni_encrypt6_enter
+.align 16
+.L006enc6_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.align 16
+.L_aesni_encrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%edx),%xmm0
+ jnz .L006enc6_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ jmp .L_aesni_decrypt6_enter
+.align 16
+.L007dec6_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.align 16
+.L_aesni_decrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%edx),%xmm0
+ jnz .L007dec6_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+.L_aesni_ecb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ andl $-16,%eax
+ jz .L008ecb_ret
+ movl 240(%edx),%ecx
+ testl %ebx,%ebx
+ jz .L009ecb_decrypt
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L010ecb_enc_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L011ecb_enc_loop6_enter
+.align 16
+.L012ecb_enc_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L011ecb_enc_loop6_enter:
+ call _aesni_encrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L012ecb_enc_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L010ecb_enc_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L013ecb_enc_one
+ movups 16(%esi),%xmm3
+ je .L014ecb_enc_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L015ecb_enc_three
+ movups 48(%esi),%xmm5
+ je .L016ecb_enc_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L013ecb_enc_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L017enc1_loop_3:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L017enc1_loop_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L014ecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L015ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L016ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L009ecb_decrypt:
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L018ecb_dec_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L019ecb_dec_loop6_enter
+.align 16
+.L020ecb_dec_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L019ecb_dec_loop6_enter:
+ call _aesni_decrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L020ecb_dec_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L018ecb_dec_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L021ecb_dec_one
+ movups 16(%esi),%xmm3
+ je .L022ecb_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L023ecb_dec_three
+ movups 48(%esi),%xmm5
+ je .L024ecb_dec_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L021ecb_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L025dec1_loop_4:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L025dec1_loop_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L022ecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L023ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L024ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L008ecb_ret:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ecb_encrypt,.-.L_aesni_ecb_encrypt_begin
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+.L_aesni_ccm64_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
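+# The four immediates above lay down, in little-endian memory order, the
+# byte sequence 0f 0e 0d 0c ... 03 02 01 00: a pshufb mask that reverses
+# all 16 bytes of an XMM register (CCM counters are big-endian). The 1
+# stored just below at 16(%esp) is the 64-bit increment used by paddq to
+# bump the counter block.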
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ shrl $1,%ecx
+ leal (%edx),%ebp
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+.L026ccm64_enc_outer:
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups (%esi),%xmm6
+ xorps %xmm0,%xmm2
+ movups 16(%ebp),%xmm1
+ xorps %xmm6,%xmm0
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm3
+ movups (%edx),%xmm0
+.L027ccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L027ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ decl %eax
+ leal 16(%esi),%esi
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ jnz .L026ccm64_enc_outer
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_encrypt_blocks,.-.L_aesni_ccm64_encrypt_blocks_begin
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+.L_aesni_ccm64_decrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %edx,%ebp
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L028enc1_loop_5:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L028enc1_loop_5
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+ leal 16(%esi),%esi
+ jmp .L029ccm64_dec_outer
+.align 16
+.L029ccm64_dec_outer:
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movl %ebx,%ecx
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ subl $1,%eax
+ jz .L030ccm64_dec_break
+ movups (%ebp),%xmm0
+ shrl $1,%ecx
+ movups 16(%ebp),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm2
+ xorps %xmm6,%xmm3
+ movups (%edx),%xmm0
+.L031ccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L031ccm64_dec2_loop
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leal 16(%esi),%esi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .L029ccm64_dec_outer
+.align 16
+.L030ccm64_dec_break:
+ movl %ebp,%edx
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%edx),%edx
+ xorps %xmm6,%xmm3
+.L032enc1_loop_6:
+.byte 102,15,56,220,217
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L032enc1_loop_6
+.byte 102,15,56,221,217
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_decrypt_blocks,.-.L_aesni_ccm64_decrypt_blocks_begin
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+.L_aesni_ctr32_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $88,%esp
+ andl $-16,%esp
+ movl %ebp,80(%esp)
+ cmpl $1,%eax
+ je .L033ctr32_one_shortcut
+ movdqu (%ebx),%xmm7
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $6,%ecx
+ xorl %ebp,%ebp
+ movl %ecx,16(%esp)
+ movl %ecx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %ebp,28(%esp)
+.byte 102,15,58,22,251,3
+.byte 102,15,58,34,253,3
+ movl 240(%edx),%ecx
+ bswap %ebx
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa (%esp),%xmm2
+.byte 102,15,58,34,203,0
+ leal 3(%ebx),%ebp
+.byte 102,15,58,34,197,0
+ incl %ebx
+.byte 102,15,58,34,203,1
+ incl %ebp
+.byte 102,15,58,34,197,1
+ incl %ebx
+.byte 102,15,58,34,203,2
+ incl %ebp
+.byte 102,15,58,34,197,2
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ pshufd $192,%xmm1,%xmm2
+ pshufd $128,%xmm1,%xmm3
+ cmpl $6,%eax
+ jb .L034ctr32_tail
+ movdqa %xmm7,32(%esp)
+ shrl $1,%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $6,%eax
+ jmp .L035ctr32_loop6
+.align 16
+.L035ctr32_loop6:
+ pshufd $64,%xmm1,%xmm4
+ movdqa 32(%esp),%xmm1
+ pshufd $192,%xmm0,%xmm5
+ por %xmm1,%xmm2
+ pshufd $128,%xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufd $64,%xmm0,%xmm7
+ por %xmm1,%xmm4
+ por %xmm1,%xmm5
+ por %xmm1,%xmm6
+ por %xmm1,%xmm7
+ movups (%ebp),%xmm0
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ decl %ecx
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups %xmm2,(%edi)
+ movdqa 16(%esp),%xmm0
+ xorps %xmm1,%xmm4
+ movdqa 48(%esp),%xmm1
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ paddd %xmm0,%xmm1
+ paddd 64(%esp),%xmm0
+ movdqa (%esp),%xmm2
+ movups 48(%esi),%xmm3
+ movups 64(%esi),%xmm4
+ xorps %xmm3,%xmm5
+ movups 80(%esi),%xmm3
+ leal 96(%esi),%esi
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ xorps %xmm4,%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ movups %xmm6,64(%edi)
+ pshufd $192,%xmm1,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movl %ebx,%ecx
+ pshufd $128,%xmm1,%xmm3
+ subl $6,%eax
+ jnc .L035ctr32_loop6
+ addl $6,%eax
+ jz .L036ctr32_ret
+ movl %ebp,%edx
+ leal 1(,%ecx,2),%ecx
+ movdqa 32(%esp),%xmm7
+.L034ctr32_tail:
+ por %xmm7,%xmm2
+ cmpl $2,%eax
+ jb .L037ctr32_one
+ pshufd $64,%xmm1,%xmm4
+ por %xmm7,%xmm3
+ je .L038ctr32_two
+ pshufd $192,%xmm0,%xmm5
+ por %xmm7,%xmm4
+ cmpl $4,%eax
+ jb .L039ctr32_three
+ pshufd $128,%xmm0,%xmm6
+ por %xmm7,%xmm5
+ je .L040ctr32_four
+ por %xmm7,%xmm6
+ call _aesni_encrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm4
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm5
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L033ctr32_one_shortcut:
+ movups (%ebx),%xmm2
+ movl 240(%edx),%ecx
+.L037ctr32_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L041enc1_loop_7:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L041enc1_loop_7
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ xorps %xmm2,%xmm6
+ movups %xmm6,(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L038ctr32_two:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L039ctr32_three:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ movups 32(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L040ctr32_four:
+ call _aesni_encrypt4
+ movups (%esi),%xmm6
+ movups 16(%esi),%xmm7
+ movups 32(%esi),%xmm1
+ xorps %xmm6,%xmm2
+ movups 48(%esi),%xmm0
+ xorps %xmm7,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L036ctr32_ret:
+ movl 80(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ctr32_encrypt_blocks,.-.L_aesni_ctr32_encrypt_blocks_begin
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+.L_aesni_xts_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L042enc1_loop_8:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L042enc1_loop_8
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ movl 240(%edx),%ecx
+ andl $-16,%esp
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
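+# 135 = 0x87, the low byte of the XTS reduction polynomial
+# x^128 + x^7 + x^2 + x + 1, so 96(%esp) holds the qword pair {0x87, 1}.
+# Each pshufd $19 / pcmpgtd / pand / paddq / pxor group below doubles the
+# tweak in GF(2^128): paddq shifts both 64-bit halves left by one, and the
+# masked sign bits feed the carry into the high half and the 0x87
+# reduction into the low half.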
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $96,%eax
+ jc .L043xts_enc_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L044xts_enc_loop6
+.align 16
+.L044xts_enc_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,220,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,220,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L044xts_enc_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L043xts_enc_short:
+ addl $96,%eax
+ jz .L045xts_enc_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L046xts_enc_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L047xts_enc_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L048xts_enc_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L049xts_enc_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_encrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L050xts_enc_done
+.align 16
+.L046xts_enc_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L051enc1_loop_9:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L051enc1_loop_9
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L047xts_enc_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L048xts_enc_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L049xts_enc_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_encrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L045xts_enc_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ movdqa %xmm1,%xmm5
+ movl %eax,112(%esp)
+ jmp .L053xts_enc_steal
+.align 16
+.L050xts_enc_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm5
+ paddq %xmm1,%xmm1
+ pand 96(%esp),%xmm5
+ pxor %xmm1,%xmm5
+.L053xts_enc_steal:
+ movzbl (%esi),%ecx
+ movzbl -16(%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,-16(%edi)
+ movb %dl,(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L053xts_enc_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups -16(%edi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L054enc1_loop_10:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L054enc1_loop_10
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,-16(%edi)
+.L052xts_enc_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_encrypt,.-.L_aesni_xts_encrypt_begin
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+.L_aesni_xts_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L055enc1_loop_11:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L055enc1_loop_11
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ andl $-16,%esp
+ xorl %ebx,%ebx
+ testl $15,%eax
+ setnz %bl
+ shll $4,%ebx
+ subl %ebx,%eax
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ subl $96,%eax
+ jc .L056xts_dec_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L057xts_dec_loop6
+.align 16
+.L057xts_dec_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,222,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,222,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ call .L_aesni_decrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L057xts_dec_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L056xts_dec_short:
+ addl $96,%eax
+ jz .L058xts_dec_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L059xts_dec_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L060xts_dec_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L061xts_dec_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L062xts_dec_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_decrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L063xts_dec_done
+.align 16
+.L059xts_dec_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L064dec1_loop_12:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L064dec1_loop_12
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L060xts_dec_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L061xts_dec_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L062xts_dec_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_decrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L058xts_dec_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ movl %eax,112(%esp)
+ jmp .L066xts_dec_only_one_more
+.align 16
+.L063xts_dec_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+.L066xts_dec_only_one_more:
+ pshufd $19,%xmm0,%xmm5
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm5
+ pxor %xmm1,%xmm5
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%esi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L067dec1_loop_13:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L067dec1_loop_13
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+.L068xts_dec_steal:
+ movzbl 16(%esi),%ecx
+ movzbl (%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,(%edi)
+ movb %dl,16(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L068xts_dec_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%edi),%xmm2
+ xorps %xmm6,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L069dec1_loop_14:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L069dec1_loop_14
+.byte 102,15,56,223,209
+ xorps %xmm6,%xmm2
+ movups %xmm2,(%edi)
+.L065xts_dec_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+.L_aesni_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl %esp,%ebx
+ movl 24(%esp),%edi
+ subl $24,%ebx
+ movl 28(%esp),%eax
+ andl $-16,%ebx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebp
+ testl %eax,%eax
+ jz .L070cbc_abort
+ cmpl $0,40(%esp)
+ xchgl %esp,%ebx
+ movups (%ebp),%xmm7
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ebx,16(%esp)
+ movl %ecx,%ebx
+ je .L071cbc_decrypt
+ movaps %xmm7,%xmm2
+ cmpl $16,%eax
+ jb .L072cbc_enc_tail
+ subl $16,%eax
+ jmp .L073cbc_enc_loop
+.align 16
+.L073cbc_enc_loop:
+ movups (%esi),%xmm7
+ leal 16(%esi),%esi
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm7
+ leal 32(%edx),%edx
+ xorps %xmm7,%xmm2
+.L074enc1_loop_15:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L074enc1_loop_15
+.byte 102,15,56,221,209
+ movl %ebx,%ecx
+ movl %ebp,%edx
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ subl $16,%eax
+ jnc .L073cbc_enc_loop
+ addl $16,%eax
+ jnz .L072cbc_enc_tail
+ movaps %xmm2,%xmm7
+ jmp .L075cbc_ret
+.L072cbc_enc_tail:
+ movl %eax,%ecx
+.long 2767451785
+ movl $16,%ecx
+ subl %eax,%ecx
+ xorl %eax,%eax
+.long 2868115081
+ leal -16(%edi),%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+ movl %ebp,%edx
+ jmp .L073cbc_enc_loop
+.align 16
+.L071cbc_decrypt:
+ cmpl $80,%eax
+ jbe .L076cbc_dec_tail
+ movaps %xmm7,(%esp)
+ subl $80,%eax
+ jmp .L077cbc_dec_loop6_enter
+.align 16
+.L078cbc_dec_loop6:
+ movaps %xmm0,(%esp)
+ movups %xmm7,(%edi)
+ leal 16(%edi),%edi
+.L077cbc_dec_loop6_enter:
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%esi),%xmm0
+ xorps %xmm1,%xmm7
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 96(%esi),%esi
+ movups %xmm4,32(%edi)
+ movl %ebx,%ecx
+ movups %xmm5,48(%edi)
+ movl %ebp,%edx
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ subl $96,%eax
+ ja .L078cbc_dec_loop6
+ movaps %xmm7,%xmm2
+ movaps %xmm0,%xmm7
+ addl $80,%eax
+ jle .L079cbc_dec_tail_collected
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+.L076cbc_dec_tail:
+ movups (%esi),%xmm2
+ movaps %xmm2,%xmm6
+ cmpl $16,%eax
+ jbe .L080cbc_dec_one
+ movups 16(%esi),%xmm3
+ movaps %xmm3,%xmm5
+ cmpl $32,%eax
+ jbe .L081cbc_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $48,%eax
+ jbe .L082cbc_dec_three
+ movups 48(%esi),%xmm5
+ cmpl $64,%eax
+ jbe .L083cbc_dec_four
+ movups 64(%esi),%xmm6
+ movaps %xmm7,(%esp)
+ movups (%esi),%xmm2
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm7
+ xorps %xmm0,%xmm6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movaps %xmm6,%xmm2
+ subl $80,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L080cbc_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L084dec1_loop_16:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L084dec1_loop_16
+.byte 102,15,56,223,209
+ xorps %xmm7,%xmm2
+ movaps %xmm6,%xmm7
+ subl $16,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L081cbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movaps %xmm3,%xmm2
+ leal 16(%edi),%edi
+ movaps %xmm5,%xmm7
+ subl $32,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L082cbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm5,%xmm4
+ movups %xmm2,(%edi)
+ movaps %xmm4,%xmm2
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movups 32(%esi),%xmm7
+ subl $48,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_four:
+ call _aesni_decrypt4
+ movups 16(%esi),%xmm1
+ movups 32(%esi),%xmm0
+ xorps %xmm7,%xmm2
+ movups 48(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movaps %xmm5,%xmm2
+ subl $64,%eax
+.L079cbc_dec_tail_collected:
+ andl $15,%eax
+ jnz .L085cbc_dec_tail_partial
+ movups %xmm2,(%edi)
+ jmp .L075cbc_ret
+.align 16
+.L085cbc_dec_tail_partial:
+ movaps %xmm2,(%esp)
+ movl $16,%ecx
+ movl %esp,%esi
+ subl %eax,%ecx
+.long 2767451785
+.L075cbc_ret:
+ movl 16(%esp),%esp
+ movl 36(%esp),%ebp
+ movups %xmm7,(%ebp)
+.L070cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_cbc_encrypt,.-.L_aesni_cbc_encrypt_begin
+.type _aesni_set_encrypt_key,@function
+.align 16
+_aesni_set_encrypt_key:
+ testl %eax,%eax
+ jz .L086bad_pointer
+ testl %edx,%edx
+ jz .L086bad_pointer
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ leal 16(%edx),%edx
+ cmpl $256,%ecx
+ je .L08714rounds
+ cmpl $192,%ecx
+ je .L08812rounds
+ cmpl $128,%ecx
+ jne .L089bad_keybits
+.align 16
+.L09010rounds:
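+# Each ".byte 102,15,58,223,200,N" below encodes
+# "aeskeygenassist $N,%xmm0,%xmm1"; N steps through the AES round
+# constants 1,2,4,...,128,27,54 used by the 128-bit key schedule
+# (annotation added for reference).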
+ movl $9,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,200,1
+ call .L091key_128_cold
+.byte 102,15,58,223,200,2
+ call .L092key_128
+.byte 102,15,58,223,200,4
+ call .L092key_128
+.byte 102,15,58,223,200,8
+ call .L092key_128
+.byte 102,15,58,223,200,16
+ call .L092key_128
+.byte 102,15,58,223,200,32
+ call .L092key_128
+.byte 102,15,58,223,200,64
+ call .L092key_128
+.byte 102,15,58,223,200,128
+ call .L092key_128
+.byte 102,15,58,223,200,27
+ call .L092key_128
+.byte 102,15,58,223,200,54
+ call .L092key_128
+ movups %xmm0,(%edx)
+ movl %ecx,80(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L092key_128:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.L091key_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L08812rounds:
+ movq 16(%eax),%xmm2
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L093key_192a_cold
+.byte 102,15,58,223,202,2
+ call .L094key_192b
+.byte 102,15,58,223,202,4
+ call .L095key_192a
+.byte 102,15,58,223,202,8
+ call .L094key_192b
+.byte 102,15,58,223,202,16
+ call .L095key_192a
+.byte 102,15,58,223,202,32
+ call .L094key_192b
+.byte 102,15,58,223,202,64
+ call .L095key_192a
+.byte 102,15,58,223,202,128
+ call .L094key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L095key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 16
+.L093key_192a_cold:
+ movaps %xmm2,%xmm5
+.L096key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 16
+.L094key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp .L096key_192b_warm
+.align 16
+.L08714rounds:
+ movups 16(%eax),%xmm2
+ movl $13,%ecx
+ leal 16(%edx),%edx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L097key_256a_cold
+.byte 102,15,58,223,200,1
+ call .L098key_256b
+.byte 102,15,58,223,202,2
+ call .L099key_256a
+.byte 102,15,58,223,200,2
+ call .L098key_256b
+.byte 102,15,58,223,202,4
+ call .L099key_256a
+.byte 102,15,58,223,200,4
+ call .L098key_256b
+.byte 102,15,58,223,202,8
+ call .L099key_256a
+.byte 102,15,58,223,200,8
+ call .L098key_256b
+.byte 102,15,58,223,202,16
+ call .L099key_256a
+.byte 102,15,58,223,200,16
+ call .L098key_256b
+.byte 102,15,58,223,202,32
+ call .L099key_256a
+.byte 102,15,58,223,200,32
+ call .L098key_256b
+.byte 102,15,58,223,202,64
+ call .L099key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L099key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+.L097key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L098key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.align 4
+.L086bad_pointer:
+ movl $-1,%eax
+ ret
+.align 4
+.L089bad_keybits:
+ movl $-2,%eax
+ ret
+.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+.L_aesni_set_encrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ ret
+.size aesni_set_encrypt_key,.-.L_aesni_set_encrypt_key_begin
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.L_aesni_set_decrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ movl 12(%esp),%edx
+ shll $4,%ecx
+ testl %eax,%eax
+ jnz .L100dec_key_ret
+ leal 16(%edx,%ecx,1),%eax
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+ movups %xmm0,(%eax)
+ movups %xmm1,(%edx)
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+.L101dec_key_inverse:
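+# ".byte 102,15,56,219,192" and ".byte 102,15,56,219,201" encode
+# "aesimc %xmm0,%xmm0" and "aesimc %xmm1,%xmm1": the loop swaps round
+# keys end-for-end while applying InvMixColumns, producing the
+# equivalent-inverse-cipher decryption schedule (annotation added for
+# reference).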
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+ movups %xmm0,16(%eax)
+ movups %xmm1,-16(%edx)
+ cmpl %edx,%eax
+ ja .L101dec_key_inverse
+ movups (%edx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%edx)
+ xorl %eax,%eax
+.L100dec_key_ret:
+ ret
+.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte 115,108,46,111,114,103,62,0
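+# (The .byte lines above spell the module's ID string in ASCII:
+# "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>".)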
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
new file mode 100644
index 0000000..3dc345b
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,2189 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
+# below for details].
+#
+# Performance.
+#
+# To start with, see the corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling in a table similar to the one found there, I've
+# chosen to summarize *comparison* results for raw ECB, CTR and CBC
+# benchmarks. The simplified table below shows 32-bit performance
+# relative to 64-bit performance at each point. Ratios vary across
+# encryption modes, hence the interval values.
+#
+#	16-byte	64-byte	256-byte	1-KB	8-KB
+#	53-67%	67-84%	91-94%		95-98%	97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. At the
+# largest, 8-KB, block size performance is virtually the same: 32-bit
+# code is less than 1% slower for ECB, CBC and CCM, and ~3% slower
+# otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike the x86_64 version, this
+# module interleaves at most 6 aes[enc|dec] instructions, because
+# there are not enough registers for an 8x interleave [which should be
+# optimal for Sandy Bridge]. In fact, the performance results for the
+# 6x interleave factor presented in aesni-x86_64.pl (except for CTR)
+# come from this module.
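+#
+# A quick register budget makes the 6x ceiling concrete (a sketch,
+# based on the register assignments further down this file): 32-bit
+# mode exposes only xmm0-xmm7, two registers shuttle round keys and
+# the remaining six carry data blocks:
+#
+#	xmm0,xmm1	$rndkey0,$rndkey1 (round keys)
+#	xmm2-xmm7	$inout0-$inout5   (6 blocks in flight)
+#
+# so an 8x interleave simply has no registers left to live in.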
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of an 8KB buffer with a 128-bit key; Sandy Bridge
+# spends 1.09.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-586.pl:-)
+$inline=1; # inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+if ($PREFIX eq "aesni") { $movekey=*movups; }
+else { $movekey=*movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx"; # backup copy for $rounds
+$key_="ebp"; # backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5"; $in1="xmm5";
+$inout4="xmm6"; $in0="xmm6";
+$inout5="xmm7"; $ivec="xmm7";
+
+# AES-NI extension
+sub aeskeygenassist
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
+}
+sub aescommon
+{ my($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
+}
+sub aesimc { aescommon(0xdb,@_); }
+sub aesenc { aescommon(0xdc,@_); }
+sub aesenclast { aescommon(0xdd,@_); }
+sub aesdec { aescommon(0xde,@_); }
+sub aesdeclast { aescommon(0xdf,@_); }
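+
+# A worked example of the encoding above (for reference only): aesenc
+# is the byte sequence 0x66,0x0F,0x38,0xDC followed by a ModRM byte
+# 0xC0|dst<<3|src, so
+#
+#	&aesenc("xmm2","xmm1");		# emits .byte 102,15,56,220,209
+#
+# i.e. "aesenc %xmm1,%xmm2", with 0xD1 = 0xC0|(2<<3)|1. This is
+# exactly the .byte pattern visible in the generated aesni-x86.s.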
+
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ $sn++;
+
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($ivec,$rndkey0) if (defined($ivec));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout,$ivec) if (defined($ivec));
+ &xorps ($inout,$rndkey0) if (!defined($ivec));
+ &set_label("${p}1_loop_$sn");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &dec ($rounds);
+ &$movekey ($rndkey1,&QWP(0,$key));
+ &lea ($key,&DWP(16,$key));
+ &jnz (&label("${p}1_loop_$sn"));
+ eval"&aes${p}last ($inout,$rndkey1)";
+}}
+
+sub aesni_generate1 # fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+ &function_begin_B("_aesni_${p}rypt1");
+ &movups ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(0x10,$key));
+ &xorps ($inout,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0x20,$key));
+ &lea ($key,&DWP(0x30,$key));
+ &cmp ($rounds,11);
+ &jb (&label("${p}128"));
+ &lea ($key,&DWP(0x20,$key));
+ &je (&label("${p}192"));
+ &lea ($key,&DWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x30,$key));
+ &set_label("${p}192");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x10,$key));
+ &set_label("${p}128");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x10,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x30,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x50,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x60,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x70,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ eval"&aes${p}last ($inout,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline);
+&function_begin_B("${PREFIX}_encrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if(!$inline);
+&function_begin_B("${PREFIX}_decrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, where N denotes the
+# interleave factor. Why were 3x subroutines originally used in loops?
+# Even though aes[enc|dec] latency was originally 6, it could be
+# scheduled only every *2nd* cycle. A 3x interleave therefore provided
+# optimal utilization, i.e. the subroutine's throughput was virtually
+# the same as that of the non-interleaved subroutine [for up to 3
+# input blocks]. This is why it makes no sense to implement a 2x
+# subroutine. In the next processor generation aes[enc|dec] latency
+# is 8, but the instructions can be scheduled every cycle. The optimal
+# interleave for the new processor is therefore 8x, but it's
+# infeasible to accommodate that in the XMM registers addressable in
+# 32-bit mode, so 6x is used instead...
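+#
+# Spelled out, the arithmetic behind those interleave factors: with
+# latency 6 and one issue per 2 cycles, at most 6/2 = 3 operations are
+# in flight at once, so 3 blocks saturate the unit and deeper
+# interleave buys nothing. With latency 8 and one issue per cycle the
+# ideal depth is 8/1 = 8 blocks; the 32-bit register file only holds
+# 6 (see the register budget note above), hence 6x.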
+
+sub aesni_generate3
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt3");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}3_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}3_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt3");
+}
+
+# A 4x interleave is implemented to improve small-block performance,
+# most notably [and naturally] the 4-block case, by ~30%. One could
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%, so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt4");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &shr ($rounds,1);
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}4_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}4_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt6");
+ &static_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0); # pxor does better here
+ eval"&aes${p} ($inout0,$rndkey1)";
+ &pxor ($inout2,$rndkey0);
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &pxor ($inout3,$rndkey0);
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &pxor ($inout4,$rndkey0);
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &pxor ($inout5,$rndkey0);
+ eval"&aes${p} ($inout4,$rndkey1)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &jmp (&label("_aesni_${p}rypt6_enter"));
+
+ &set_label("${p}6_loop",16);
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_enter",16);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ eval"&aes${p} ($inout4,$rndkey0)";
+ eval"&aes${p} ($inout5,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}6_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ eval"&aes${p}last ($inout4,$rndkey0)";
+ eval"&aes${p}last ($inout5,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+&function_begin("aesni_ecb_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &and ($len,-16);
+ &jz (&label("ecb_ret"));
+ &mov ($rounds,&DWP(240,$key));
+ &test ($rounds_,$rounds_);
+ &jz (&label("ecb_decrypt"));
+
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_enc_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+ &call ("_aesni_encrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_enc_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_enc_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_enc_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_enc_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_enc_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+ &call ("_aesni_encrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &jmp (&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_dec_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+ &call ("_aesni_decrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_dec_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_dec_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_dec_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_dec_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_dec_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on a 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details)
+#
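+# Per iteration the interleaved pair of key schedules below computes,
+# in effect (a sketch of the CCM recurrence, cf. the "cmac^=inp" and
+# "inp^=E(ivec)" annotations in the body; decryption MACs the
+# plaintext it recovers):
+#
+#	cmac   = E_K(cmac ^ inp[i])	# CBC-MAC half
+#	out[i] = inp[i] ^ E_K(ivec)	# CTR half
+#	ivec   = ivec + 1		# 64-bit counter
+#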
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &shr ($rounds,1);
+ &lea ($key_,&DWP(0,$key));
+ &movdqa ($inout3,&QWP(0,"esp"));
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds_,$rounds);
+ &pshufb ($ivec,$inout3);
+
+&set_label("ccm64_enc_outer");
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &mov ($rounds,$rounds_);
+ &movups ($in0,&QWP(0,$inp));
+
+ &xorps ($inout0,$rndkey0);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($rndkey0,$in0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($cmac,$rndkey0); # cmac^=inp
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_enc2_loop"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+
+ &dec ($len);
+ &lea ($inp,&DWP(16,$inp));
+ &xorps ($in0,$inout0); # inp^=E(ivec)
+ &movdqa ($inout0,$ivec);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+ &jnz (&label("ccm64_enc_outer"));
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
+ &movdqa ($inout0,$ivec);
+
+ &mov ($key_,$key);
+ &mov ($rounds_,$rounds);
+
+ &pshufb ($ivec,$inout3);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &lea ($inp,&QWP(16,$inp));
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+ &xorps ($in0,$inout0); # inp ^= E(ivec)
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds,$rounds_);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+
+ &sub ($len,1);
+ &jz (&label("ccm64_dec_break"));
+
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($in0,$rndkey0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($inout0,$rndkey0);
+ &xorps ($cmac,$in0); # cmac^=out
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_dec2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_dec2_loop"));
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &lea ($inp,&QWP(16,$inp));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+ &mov ($key,$key_);
+ if ($inline)
+ { &aesni_inline_generate1("enc",$cmac,$in0); }
+ else
+ { &call ("_aesni_encrypt1",$cmac); }
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# stack layout:
+# 0 pshufb mask
+# 16 vector addend: 0,6,6,6
+# 32 counter-less ivec
+# 48 1st triplet of counter vector
+# 64 2nd triplet of counter vector
+# 80 saved %esp
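+#
+# Illustrative counter trace (assuming the IV's big-endian 32-bit
+# counter starts at n): six counters live as two xmm triplets,
+#
+#	1st triplet: n, n+1, n+2	2nd triplet: n+3, n+4, n+5
+#
+# and each loop iteration paddd's the 6,6,6 addend into both, giving
+# n+6..n+11 the next time around.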
+
+&function_begin("aesni_ctr32_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($key_,"esp");
+ &sub ("esp",88);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(80,"esp"),$key_);
+
+ &cmp ($len,1);
+ &je (&label("ctr32_one_shortcut"));
+
+ &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds,6);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$rounds);
+ &mov (&DWP(24,"esp"),$rounds);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
+ &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
+
+ &mov ($rounds,&DWP(240,$key)); # key->rounds
+
+ # compose 2 vectors of 3x32-bit counters
+ &bswap ($rounds_);
+ &pxor ($rndkey1,$rndkey1);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
+ &pinsrd ($rndkey1,$rounds_,0);
+ &lea ($key_,&DWP(3,$rounds_));
+ &pinsrd ($rndkey0,$key_,0);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,1);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,1);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,2);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,2);
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+
+	&pshufd	($inout0,$rndkey1,3<<6);	# place counter in upper dword
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &cmp ($len,6);
+ &jb (&label("ctr32_tail"));
+ &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
+ &shr ($rounds,1);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,6);
+ &jmp (&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout0,$rndkey1); # merge counter-less ivec
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout1,$rndkey1);
+ &pshufd ($inout5,$rndkey0,1<<6);
+ &por ($inout2,$rndkey1);
+ &por ($inout3,$rndkey1);
+ &por ($inout4,$rndkey1);
+ &por ($inout5,$rndkey1);
+
+ # inlining _aesni_encrypt6's prologue gives ~4% improvement...
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &dec ($rounds);
+ &pxor ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,$rndkey0);
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,$rndkey0);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,$rndkey0);
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
+ &xorps ($inout2,$rndkey1);
+ &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+
+ &paddd ($rndkey1,$rndkey0); # 1st triplet increment
+ &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
+
+ &movups ($inout1,&QWP(0x30,$inp));
+ &movups ($inout2,&QWP(0x40,$inp));
+ &xorps ($inout3,$inout1);
+ &movups ($inout1,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &xorps ($inout4,$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &xorps ($inout5,$inout1);
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+ &movups (&QWP(0x40,$out),$inout4);
+ &pshufd ($inout0,$rndkey1,3<<6);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+
+ &mov ($rounds,$rounds_);
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &sub ($len,6);
+ &jnc (&label("ctr32_loop6"));
+
+ &add ($len,6);
+ &jz (&label("ctr32_ret"));
+ &mov ($key,$key_);
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
+
+&set_label("ctr32_tail");
+ &por ($inout0,$inout5);
+ &cmp ($len,2);
+ &jb (&label("ctr32_one"));
+
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &por ($inout1,$inout5);
+ &je (&label("ctr32_two"));
+
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout2,$inout5);
+ &cmp ($len,4);
+ &jb (&label("ctr32_three"));
+
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout3,$inout5);
+ &je (&label("ctr32_four"));
+
+ &por ($inout4,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout2,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout4,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+ &movups ($inout0,&QWP(0,$rounds_)); # load ivec
+ &mov ($rounds,&DWP(240,$key));
+
+&set_label("ctr32_one");
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp));
+ &xorps ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &movups ($inout5,&QWP(0x20,$inp));
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+ &call ("_aesni_encrypt4");
+ &movups ($inout4,&QWP(0,$inp));
+ &movups ($inout5,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout0,$inout4);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout1,$inout5);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+ &mov ("esp",&DWP(80,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
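+# The recurring tweak update in both functions is multiplication by x
+# in GF(2^128) modulo x^128+x^7+x^2+x+1; the pcmpgtd/pshufd/pand/pxor
+# sequences below are a branch-free SSE rendering of
+#
+#	T' = (T << 1) ^ (msb(T) ? 0x87 : 0)
+#
+# where paddq doubles each 64-bit half and the 0x87/1 "magic constant"
+# re-injects the carry out of the low half and folds the 0x87
+# reduction back in when the high half overflows.
+#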
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &and ("esp",-16); # align stack
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,16*6);
+ &jc (&label("xts_enc_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_enc_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+ &add ($len,16*6);
+ &jz (&label("xts_enc_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_enc_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_enc_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_enc_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_enc_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_encrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout2);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+ &movdqa ($inout3,$tweak);
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($inout3,$twtmp,0x13);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+ &movz ($rounds,&BP(0,$inp));
+ &movz ($key,&BP(-16,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(-16,$out),&LB($rounds));
+ &mov (&BP(0,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_enc_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(-16,$out)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(-16,$out),$inout0); # write output
+
+&set_label("xts_enc_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &and ("esp",-16); # align stack
+
+ &xor ($rounds_,$rounds_); # if(len%16) len-=16;
+ &test ($len,15);
+ &setnz (&LB($rounds_));
+ &shl ($rounds_,4);
+ &sub ($len,$rounds_);
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &sub ($len,16*6);
+ &jc (&label("xts_dec_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+	# inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesdec ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesdec ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesdec ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesdec ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesdec ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesdec ($inout5,$rndkey1);
+ &call (&label("_aesni_decrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_dec_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+ &add ($len,16*6);
+ &jz (&label("xts_dec_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_dec_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_dec_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_dec_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_dec_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($inout0,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_decrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_decrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(16*6,"esp"));
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+ &pshufd ($inout3,$twtmp,0x13);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,$twmask); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_steal");
+ &movz ($rounds,&BP(16,$inp));
+ &movz ($key,&BP(0,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(0,$out),&LB($rounds));
+ &mov (&BP(16,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_dec_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$out)); # load input
+ &xorps ($inout0,$inout4); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout4); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_decrypt");
+}
+}
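# [Editor's note] Decrypt-side stealing (xts_dec_done through xts_dec_steal
# above) applies the last two tweaks in swapped order: the final full
# ciphertext block is decrypted with the later tweak, the partial ciphertext
# is grafted over its head, and the grafted block is decrypted with the
# earlier tweak. A C sketch under the same assumptions as the encrypt-side
# one (hypothetical aesni_decrypt_block helper; tweak_m is tweak_m1 doubled
# in GF(2^128)):

    #include <string.h>
    #include <wmmintrin.h>

    /* Hypothetical one-block AES-NI decrypt helper. */
    __m128i aesni_decrypt_block(__m128i in, const __m128i *rk, int nr);

    /* in: last full ciphertext block followed by j tail bytes; out receives
     * one full plaintext block followed by j tail bytes. */
    void xts_dec_tail(unsigned char *out, const unsigned char *in, size_t j,
                      __m128i tweak_m1, __m128i tweak_m,
                      const __m128i *rk, int nr)
    {
        unsigned char pp[16];
        __m128i b = _mm_loadu_si128((const __m128i *)in);
        b = _mm_xor_si128(aesni_decrypt_block(_mm_xor_si128(b, tweak_m), rk, nr),
                          tweak_m);                /* later tweak first        */
        _mm_storeu_si128((__m128i *)pp, b);
        memcpy(out + 16, pp, j);                   /* P[m] = head of PP        */
        memcpy(pp, in + 16, j);                    /* graft C[m] over the head */
        b = _mm_loadu_si128((const __m128i *)pp);
        b = _mm_xor_si128(aesni_decrypt_block(_mm_xor_si128(b, tweak_m1), rk, nr),
                          tweak_m1);               /* earlier tweak last       */
        _mm_storeu_si128((__m128i *)out, b);       /* P[m-1]                   */
    }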
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($rounds_,"esp");
+ &mov ($out,&wparam(1));
+ &sub ($rounds_,24);
+ &mov ($len,&wparam(2));
+ &and ($rounds_,-16);
+ &mov ($key,&wparam(3));
+ &mov ($key_,&wparam(4));
+ &test ($len,$len);
+ &jz (&label("cbc_abort"));
+
+ &cmp (&wparam(5),0);
+ &xchg ($rounds_,"esp"); # alloca
+ &movups ($ivec,&QWP(0,$key_)); # load IV
+ &mov ($rounds,&DWP(240,$key));
+ &mov ($key_,$key); # backup $key
+ &mov (&DWP(16,"esp"),$rounds_); # save original %esp
+ &mov ($rounds_,$rounds); # backup $rounds
+ &je (&label("cbc_decrypt"));
+
+ &movaps ($inout0,$ivec);
+ &cmp ($len,16);
+ &jb (&label("cbc_enc_tail"));
+ &sub ($len,16);
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movups ($ivec,&QWP(0,$inp)); # input actually
+ &lea ($inp,&DWP(16,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc",$inout0,$ivec); }
+ else
+ { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0,$out),$inout0); # store output
+ &lea ($out,&DWP(16,$out));
+ &sub ($len,16);
+ &jnc (&label("cbc_enc_loop"));
+ &add ($len,16);
+ &jnz (&label("cbc_enc_tail"));
+ &movaps ($ivec,$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+ &mov ("ecx",$len); # zaps $rounds
+ &data_word(0xA4F3F689); # rep movsb
+ &mov ("ecx",16); # zero tail
+ &sub ("ecx",$len);
+ &xor ("eax","eax"); # zaps $len
+ &data_word(0xAAF3F689); # rep stosb
+ &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($inp,$out); # $inp and $out are the same
+ &mov ($key,$key_); # restore $key
+ &jmp (&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+ &cmp ($len,0x50);
+ &jbe (&label("cbc_dec_tail"));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+ &movaps (&QWP(0,"esp"),$rndkey0); # save IV
+ &movups (&QWP(0,$out),$inout5);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+
+ &call ("_aesni_decrypt6");
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^=IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout4,$rndkey0);
+ &movups ($rndkey0,&QWP(0x50,$inp)); # IV
+ &xorps ($inout5,$rndkey1);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($inp,&DWP(0x60,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &mov ($rounds,$rounds_) # restore $rounds
+ &movups (&QWP(0x30,$out),$inout3);
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0x40,$out),$inout4);
+ &lea ($out,&DWP(0x50,$out));
+ &sub ($len,0x60);
+ &ja (&label("cbc_dec_loop6"));
+
+ &movaps ($inout0,$inout5);
+ &movaps ($ivec,$rndkey0);
+ &add ($len,0x50);
+ &jle (&label("cbc_dec_tail_collected"));
+ &movups (&QWP(0,$out),$inout0);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &movaps ($in0,$inout0);
+ &cmp ($len,0x10);
+ &jbe (&label("cbc_dec_one"));
+
+ &movups ($inout1,&QWP(0x10,$inp));
+ &movaps ($in1,$inout1);
+ &cmp ($len,0x20);
+ &jbe (&label("cbc_dec_two"));
+
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x30);
+ &jbe (&label("cbc_dec_three"));
+
+ &movups ($inout3,&QWP(0x30,$inp));
+ &cmp ($len,0x40);
+ &jbe (&label("cbc_dec_four"));
+
+ &movups ($inout4,&QWP(0x40,$inp));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &movups ($inout0,&QWP(0,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^= IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($ivec,&QWP(0x40,$inp)); # IV
+ &xorps ($inout4,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &lea ($out,&DWP(0x40,$out));
+ &movaps ($inout0,$inout4);
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$ivec);
+ &movaps ($ivec,$in0);
+ &sub ($len,0x10);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout1);
+ &lea ($out,&DWP(0x10,$out));
+ &movaps ($ivec,$in1);
+ &sub ($len,0x20);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &xorps ($inout2,$in1);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout2);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($out,&DWP(0x20,$out));
+ &movups ($ivec,&QWP(0x20,$inp));
+ &sub ($len,0x30);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups ($rndkey1,&QWP(0x10,$inp));
+ &movups ($rndkey0,&QWP(0x20,$inp));
+ &xorps ($inout0,$ivec);
+ &movups ($ivec,&QWP(0x30,$inp));
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &lea ($out,&DWP(0x30,$out));
+ &movaps ($inout0,$inout3);
+ &sub ($len,0x40);
+
+&set_label("cbc_dec_tail_collected");
+ &and ($len,15);
+ &jnz (&label("cbc_dec_tail_partial"));
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+ &movaps (&QWP(0,"esp"),$inout0);
+ &mov ("ecx",16);
+ &mov ($inp,"esp");
+ &sub ("ecx",$len);
+ &data_word(0xA4F3F689); # rep movsb
+
+&set_label("cbc_ret");
+ &mov ("esp",&DWP(16,"esp")); # pull original %esp
+ &mov ($key_,&wparam(4));
+ &movups (&QWP(0,$key_),$ivec); # output IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is private interface,
+# input:
+# "eax" const unsigned char *userKey
+# $rounds int bits
+# $key AES_KEY *key
+# output:
+# "eax" return code
+#	$rounds	rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+ &test ("eax","eax");
+ &jz (&label("bad_pointer"));
+ &test ($key,$key);
+ &jz (&label("bad_pointer"));
+
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &lea ($key,&DWP(16,$key));
+ &cmp ($rounds,256);
+ &je (&label("14rounds"));
+ &cmp ($rounds,192);
+ &je (&label("12rounds"));
+ &cmp ($rounds,128);
+ &jne (&label("bad_keybits"));
+
+&set_label("10rounds",16);
+ &mov ($rounds,9);
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 1
+ &call (&label("key_128_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x2); # round 2
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 3
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 4
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 5
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 6
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x40); # round 7
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x80); # round 8
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x36); # round 10
+ &call (&label("key_128"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(80,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_128",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_128_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("12rounds",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &mov ($rounds,11);
+ &$movekey (&QWP(-16,$key),"xmm0") # round 0
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
+ &call (&label("key_192a_cold"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
+ &call (&label("key_192b"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(48,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_192a",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+ &movaps ("xmm5","xmm2");
+&set_label("key_192b_warm");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &movdqa ("xmm3","xmm2");
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &pslldq ("xmm3",4);
+ &xorps ("xmm0","xmm4");
+ &pshufd ("xmm1","xmm1",0b01010101); # critical path
+ &pxor ("xmm2","xmm3");
+ &pxor ("xmm0","xmm1");
+ &pshufd ("xmm3","xmm0",0b11111111);
+ &pxor ("xmm2","xmm3");
+ &ret();
+
+&set_label("key_192b",16);
+ &movaps ("xmm3","xmm0");
+ &shufps ("xmm5","xmm0",0b01000100);
+ &$movekey (&QWP(0,$key),"xmm5");
+ &shufps ("xmm3","xmm2",0b01001110);
+ &$movekey (&QWP(16,$key),"xmm3");
+ &lea ($key,&DWP(32,$key));
+ &jmp (&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
+ &mov ($rounds,13);
+ &lea ($key,&DWP(16,$key));
+ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
+ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
+ &call (&label("key_256a_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
+ &call (&label("key_256a"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(16,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_256a",16);
+ &$movekey (&QWP(0,$key),"xmm2");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("key_256b",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+
+ &shufps ("xmm4","xmm2",0b00010000);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm4","xmm2",0b10001100);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm1","xmm1",0b10101010); # critical path
+ &xorps ("xmm2","xmm1");
+ &ret();
+
+&set_label("bad_pointer",4);
+ &mov ("eax",-1);
+ &ret ();
+&set_label("bad_keybits",4);
+ &mov ("eax",-2);
+ &ret ();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &ret ();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &mov ($key,&wparam(2));
+ &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key
+ &test ("eax","eax");
+ &jnz (&label("dec_key_ret"));
+ &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
+
+ &$movekey ("xmm0",&QWP(0,$key)); # just swap
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &$movekey (&QWP(0,"eax"),"xmm0");
+ &$movekey (&QWP(0,$key),"xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+ &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &aesimc ("xmm0","xmm0");
+ &aesimc ("xmm1","xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+ &$movekey (&QWP(16,"eax"),"xmm0");
+ &$movekey (&QWP(-16,$key),"xmm1");
+ &cmp ("eax",$key);
+ &ja (&label("dec_key_inverse"));
+
+ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
+ &aesimc ("xmm0","xmm0");
+ &$movekey (&QWP(0,$key),"xmm0");
+
+ &xor ("eax","eax"); # return success
+&set_label("dec_key_ret");
+ &ret ();
+&function_end_B("${PREFIX}_set_decrypt_key");
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by ");
+
+&asm_finish();
diff --git a/crypto/aes/asm/aesni-x86_64.S b/crypto/aes/asm/aesni-x86_64.S
new file mode 100644
index 0000000..917c832
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86_64.S
@@ -0,0 +1,2535 @@
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_enc1_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_encrypt,.-aesni_encrypt
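# [Editor's note] The .Loop_enc1_1 body above is one aesenc per round key with
# aesenclast at the end; note the count read from 240(%rdx) is what this
# code's key setup stores, which is one less than the textbook round count
# (9/11/13). A sketch with intrinsics, written against the textbook count nr:

    #include <wmmintrin.h>

    /* rk[0..nr] is the expanded schedule; nr is 10, 12 or 14. */
    __m128i aesni_encrypt_block(__m128i in, const __m128i *rk, int nr)
    {
        __m128i b = _mm_xor_si128(in, rk[0]);      /* round-0 whitening */
        for (int i = 1; i < nr; i++)
            b = _mm_aesenc_si128(b, rk[i]);        /* middle rounds     */
        return _mm_aesenclast_si128(b, rk[nr]);    /* final round       */
    }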
+
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_dec1_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_decrypt, .-aesni_decrypt
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Lenc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ .byte 0xf3,0xc3
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Ldec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ .byte 0xf3,0xc3
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Lenc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ .byte 0xf3,0xc3
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Ldec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ .byte 0xf3,0xc3
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,220,249
+ jmp .Lenc_loop6_enter
+.align 16
+.Lenc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lenc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ .byte 0xf3,0xc3
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,222,249
+ jmp .Ldec_loop6_enter
+.align 16
+.Ldec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Ldec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ .byte 0xf3,0xc3
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.type _aesni_encrypt8,@function
+.align 16
+_aesni_encrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,220,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+ jmp .Lenc_loop8_enter
+.align 16
+.Lenc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+.Lenc_loop8_enter:
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ .byte 0xf3,0xc3
+.size _aesni_encrypt8,.-_aesni_encrypt8
+.type _aesni_decrypt8,@function
+.align 16
+_aesni_decrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+ jmp .Ldec_loop8_enter
+.align 16
+.Ldec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+.Ldec_loop8_enter:
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ .byte 0xf3,0xc3
+.size _aesni_decrypt8,.-_aesni_decrypt8
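# [Editor's note] The 3/4/6/8-block helpers exist because aesenc has
# multi-cycle latency but pipelines: issuing the same round over several
# independent blocks hides that latency, which is why the ECB, CTR and XTS
# code below batches six or eight blocks per iteration. The two-block shape
# of the idea, as a sketch:

    #include <wmmintrin.h>

    static void aesni_encrypt2(__m128i *b0, __m128i *b1,
                               const __m128i *rk, int nr)
    {
        __m128i x = _mm_xor_si128(*b0, rk[0]);
        __m128i y = _mm_xor_si128(*b1, rk[0]);
        for (int i = 1; i < nr; i++) {
            x = _mm_aesenc_si128(x, rk[i]);   /* the two aesenc chains are */
            y = _mm_aesenc_si128(y, rk[i]);   /* independent and overlap   */
        }
        *b0 = _mm_aesenclast_si128(x, rk[nr]);
        *b1 = _mm_aesenclast_si128(y, rk[nr]);
    }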
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+ andq $-16,%rdx
+ jz .Lecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz .Lecb_decrypt
+
+ cmpq $128,%rdx
+ jb .Lecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $128,%rdx
+ jnc .Lecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_enc_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_enc_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_enc_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp .Lecb_ret
+
+.align 16
+.Lecb_decrypt:
+ cmpq $128,%rdx
+ jb .Lecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $128,%rdx
+ jnc .Lecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_dec_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_dec_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_dec_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+
+.Lecb_ret:
+ .byte 0xf3,0xc3
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+ movl 240(%rcx),%eax
+ movdqu (%r8),%xmm9
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ shrl $1,%eax
+ leaq 0(%rcx),%r11
+ movdqu (%r9),%xmm3
+ movdqa %xmm9,%xmm2
+ movl %eax,%r10d
+.byte 102,68,15,56,0,207
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ movups (%r11),%xmm0
+ movl %r10d,%eax
+ movups (%rdi),%xmm8
+
+ xorps %xmm0,%xmm2
+ movups 16(%r11),%xmm1
+ xorps %xmm8,%xmm0
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq %xmm6,%xmm9
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+
+ decq %rdx
+ leaq 16(%rdi),%rdi
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+ jnz .Lccm64_enc_outer
+
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
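# [Editor's note] Per block, .Lccm64_enc2_loop drives two encryptions through
# the schedule together: the CTR block that becomes keystream and the CBC-MAC
# state freshened with the plaintext. One step in C, as a sketch
# (aesni_encrypt_block as sketched above; the counter byteswap/increment is
# elided):

    #include <wmmintrin.h>

    __m128i aesni_encrypt_block(__m128i in, const __m128i *rk, int nr);

    void ccm64_enc_step(__m128i pt, __m128i ctr_block, __m128i *cmac,
                        __m128i *ct, const __m128i *rk, int nr)
    {
        __m128i ks = aesni_encrypt_block(ctr_block, rk, nr);  /* keystream */
        *cmac = aesni_encrypt_block(_mm_xor_si128(*cmac, pt), /* CBC-MAC   */
                                    rk, nr);
        *ct = _mm_xor_si128(pt, ks);                          /* encrypt   */
    }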
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+ movl 240(%rcx),%eax
+ movups (%r8),%xmm9
+ movdqu (%r9),%xmm3
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ movaps %xmm9,%xmm2
+ movl %eax,%r10d
+ movq %rcx,%r11
+.byte 102,68,15,56,0,207
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_5:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_5
+.byte 102,15,56,221,209
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+ leaq 16(%rdi),%rdi
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movl %r10d,%eax
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+
+ subq $1,%rdx
+ jz .Lccm64_dec_break
+
+ movups (%r11),%xmm0
+ shrl $1,%eax
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm8,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_dec2_loop
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 16(%rdi),%rdi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+
+ movups (%r11),%xmm0
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%r11
+ xorps %xmm8,%xmm3
+.Loop_enc1_6:
+.byte 102,15,56,220,217
+ decl %eax
+ movups (%r11),%xmm1
+ leaq 16(%r11),%r11
+ jnz .Loop_enc1_6
+.byte 102,15,56,221,217
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ je .Lctr32_one_shortcut
+
+ movdqu (%r8),%xmm14
+ movdqa .Lbswap_mask(%rip),%xmm15
+ xorl %eax,%eax
+.byte 102,69,15,58,22,242,3
+.byte 102,68,15,58,34,240,3
+
+ movl 240(%rcx),%eax
+ bswapl %r10d
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+.byte 102,69,15,58,34,226,0
+ leaq 3(%r10),%r11
+.byte 102,69,15,58,34,235,0
+ incl %r10d
+.byte 102,69,15,58,34,226,1
+ incq %r11
+.byte 102,69,15,58,34,235,1
+ incl %r10d
+.byte 102,69,15,58,34,226,2
+ incq %r11
+.byte 102,69,15,58,34,235,2
+ movdqa %xmm12,-40(%rsp)
+.byte 102,69,15,56,0,231
+ movdqa %xmm13,-24(%rsp)
+.byte 102,69,15,56,0,239
+
+ pshufd $192,%xmm12,%xmm2
+ pshufd $128,%xmm12,%xmm3
+ pshufd $64,%xmm12,%xmm4
+ cmpq $6,%rdx
+ jb .Lctr32_tail
+ shrl $1,%eax
+ movq %rcx,%r11
+ movl %eax,%r10d
+ subq $6,%rdx
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm2
+ movups (%r11),%xmm0
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm3
+ movups 16(%r11),%xmm1
+ pshufd $64,%xmm13,%xmm7
+ por %xmm14,%xmm4
+ por %xmm14,%xmm5
+ xorps %xmm0,%xmm2
+ por %xmm14,%xmm6
+ por %xmm14,%xmm7
+
+
+
+
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ movdqa .Lincrement32(%rip),%xmm13
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ movdqa -40(%rsp),%xmm12
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lctr32_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lctr32_enc_loop6
+
+.byte 102,15,56,220,209
+ paddd %xmm13,%xmm12
+.byte 102,15,56,220,217
+ paddd -24(%rsp),%xmm13
+.byte 102,15,56,220,225
+ movdqa %xmm12,-40(%rsp)
+.byte 102,15,56,220,233
+ movdqa %xmm13,-24(%rsp)
+.byte 102,15,56,220,241
+.byte 102,69,15,56,0,231
+.byte 102,15,56,220,249
+.byte 102,69,15,56,0,239
+
+.byte 102,15,56,221,208
+ movups (%rdi),%xmm8
+.byte 102,15,56,221,216
+ movups 16(%rdi),%xmm9
+.byte 102,15,56,221,224
+ movups 32(%rdi),%xmm10
+.byte 102,15,56,221,232
+ movups 48(%rdi),%xmm11
+.byte 102,15,56,221,240
+ movups 64(%rdi),%xmm1
+.byte 102,15,56,221,248
+ movups 80(%rdi),%xmm0
+ leaq 96(%rdi),%rdi
+
+ xorps %xmm2,%xmm8
+ pshufd $192,%xmm12,%xmm2
+ xorps %xmm3,%xmm9
+ pshufd $128,%xmm12,%xmm3
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ pshufd $64,%xmm12,%xmm4
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ xorps %xmm7,%xmm0
+ movups %xmm1,64(%rsi)
+ movups %xmm0,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movl %r10d,%eax
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+ movq %r11,%rcx
+ leal 1(%rax,%rax,1),%eax
+
+.Lctr32_tail:
+ por %xmm14,%xmm2
+ movups (%rdi),%xmm8
+ cmpq $2,%rdx
+ jb .Lctr32_one
+
+ por %xmm14,%xmm3
+ movups 16(%rdi),%xmm9
+ je .Lctr32_two
+
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm4
+ movups 32(%rdi),%xmm10
+ cmpq $4,%rdx
+ jb .Lctr32_three
+
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm5
+ movups 48(%rdi),%xmm11
+ je .Lctr32_four
+
+ por %xmm14,%xmm6
+ xorps %xmm7,%xmm7
+
+ call _aesni_encrypt6
+
+ movups 64(%rdi),%xmm1
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ movups %xmm1,64(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm8
+ movl 240(%rcx),%eax
+.Lctr32_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ xorps %xmm2,%xmm8
+ movups %xmm8,(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ movups %xmm9,16(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ movups %xmm10,32(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ movups %xmm11,48(%rsi)
+
+.Lctr32_done:
+ .byte 0xf3,0xc3
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
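# [Editor's note] As the function name says, only the low 32 bits of the
# counter block participate: the pshufb/paddd pair above increments a
# big-endian 32-bit word while the upper 96 bits ride along unchanged.
# Byte-wise, the update is simply:

    /* Increment the big-endian 32-bit word in bytes 12..15 of the counter
     * block; bytes 0..11 are never touched, which is the ctr32 contract. */
    static void ctr32_inc(unsigned char counter[16])
    {
        for (int i = 15; i >= 12; i--)
            if (++counter[i] != 0)
                break;                 /* carry stops inside the low word */
    }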
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_8:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_8
+.byte 102,68,15,56,221,249
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_enc_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,220,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,220,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lxts_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_enc_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,221,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,221,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_enc_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_enc_short:
+ addq $96,%rdx
+ jz .Lxts_enc_done
+
+ cmpq $32,%rdx
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmpq $64,%rdx
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_encrypt6
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_9:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_9
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_encrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ andq $15,%r9
+ jz .Lxts_enc_ret
+ movq %r9,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%rdi),%eax
+ movzbl -16(%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,-16(%rsi)
+ movb %cl,0(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_enc_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups -16(%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_10:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_10
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,-16(%rsi)
+
+.Lxts_enc_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_11:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_11
+.byte 102,68,15,56,221,249
+ xorl %eax,%eax
+ testq $15,%rdx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%rdx
+
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
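+# Generate the first tweaks: each block below multiplies the tweak by x
+# in GF(2^128). paddq doubles both 64-bit halves, while the
+# pcmpgtd/pshufd/pand sequence recovers the carries (including the one
+# out of bit 127) so that pxor can fold them back in, reducing modulo
+# x^128 + x^7 + x^2 + x + 1 (the 0x87 in .Lxts_magic).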
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_dec_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,222,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,222,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,222,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Lxts_dec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_dec_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,223,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,223,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_dec_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_dec_short:
+ addq $96,%rdx
+ jz .Lxts_dec_done
+
+ cmpq $32,%rdx
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmpq $64,%rdx
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_decrypt6
+
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm14
+ movdqu %xmm5,48(%rsi)
+ pcmpgtd %xmm15,%xmm14
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ pshufd $19,%xmm14,%xmm11
+ andq $15,%r9
+ jz .Lxts_dec_ret
+
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm11
+ pxor %xmm15,%xmm11
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_12:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_12
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ movdqa %xmm12,%xmm11
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm13,%xmm11
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movups (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movups 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_decrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ andq $15,%r9
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ movq %r9,%rdx
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rdi),%xmm2
+ xorps %xmm11,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_13:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_13
+.byte 102,15,56,223,209
+ xorps %xmm11,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_steal:
+ movzbl 16(%rdi),%eax
+ movzbl (%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,(%rsi)
+ movb %cl,16(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_dec_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_14:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_14
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_dec_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+ testq %rdx,%rdx
+ jz .Lcbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz .Lcbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb .Lcbc_enc_tail
+ subq $16,%rdx
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+.Loop_enc1_15:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_15
+.byte 102,15,56,221,209
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc .Lcbc_enc_loop
+ addq $16,%rdx
+ jnz .Lcbc_enc_tail
+ movups %xmm2,(%r8)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
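+# Partial final block: the two .long constants below hand-encode
+# "rep movsb" (0x9066A4F3, copy the remaining input bytes) and
+# "rep stosb" (0x9066AAF3, zero-pad to a full 16 bytes), each padded
+# with a two-byte nop, before one more pass through the encryption
+# loop.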
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long 0x9066A4F3
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long 0x9066AAF3
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+ movups (%r8),%xmm9
+ movl %r10d,%eax
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_tail
+ shrl $1,%r10d
+ subq $112,%rdx
+ movl %r10d,%eax
+ movaps %xmm9,-24(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps %xmm0,-24(%rsp)
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+ movups (%rcx),%xmm0
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 16(%rcx),%xmm1
+
+ leaq 32(%rcx),%rcx
+ movdqu 32(%rdi),%xmm4
+ xorps %xmm0,%xmm2
+ movdqu 48(%rdi),%xmm5
+ xorps %xmm0,%xmm3
+ movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+ movdqu 80(%rdi),%xmm7
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqu 96(%rdi),%xmm8
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqu 112(%rdi),%xmm9
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+
+ call .Ldec_loop8_enter
+
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm1
+ xorps %xmm0,%xmm8
+ movups 112(%rdi),%xmm0
+ xorps %xmm1,%xmm9
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movq %r11,%rcx
+ movups %xmm7,80(%rsi)
+ leaq 128(%rdi),%rdi
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+ subq $128,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ movaps %xmm0,%xmm9
+ addq $112,%rdx
+ jle .Lcbc_dec_tail_collected
+ movups %xmm2,(%rsi)
+ leal 1(%r10,%r10,1),%eax
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+ movaps %xmm2,%xmm8
+ cmpq $16,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm3,%xmm7
+ cmpq $32,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm4,%xmm6
+ cmpq $48,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ cmpq $64,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_five
+
+ movups 80(%rdi),%xmm7
+ cmpq $96,%rdx
+ jbe .Lcbc_dec_six
+
+ movups 96(%rdi),%xmm8
+ movaps %xmm9,-24(%rsp)
+ call _aesni_decrypt8
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm9
+ xorps %xmm0,%xmm8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movaps %xmm8,%xmm2
+ subq $112,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ xorps %xmm9,%xmm2
+ movaps %xmm8,%xmm9
+ subq $16,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ movaps %xmm7,%xmm9
+ movaps %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+ subq $32,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ movaps %xmm6,%xmm9
+ movaps %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+ subq $48,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps %xmm9,%xmm2
+ movups 48(%rdi),%xmm9
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ xorps %xmm6,%xmm5
+ movups %xmm4,32(%rsi)
+ movaps %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+ subq $64,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm9
+ xorps %xmm1,%xmm6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movaps %xmm6,%xmm2
+ subq $80,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm0
+ xorps %xmm1,%xmm6
+ movups 80(%rdi),%xmm9
+ xorps %xmm0,%xmm7
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movaps %xmm7,%xmm2
+ subq $96,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ andq $15,%rdx
+ movups %xmm9,(%r8)
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps %xmm2,-24(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq -24(%rsp),%rsi
+.long 0x9066A4F3
+
+.Lcbc_dec_ret:
+.Lcbc_ret:
+ .byte 0xf3,0xc3
+.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.byte 0x48,0x83,0xEC,0x08
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz .Ldec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
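+# Build the decryption key schedule in place: walk the encryption
+# schedule from both ends, swapping round keys while applying
+# InvMixColumns (.byte 102,15,56,219,... encodes aesimc) to every round
+# key except the first and last.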
+.Ldec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja .Ldec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%rdi)
+.Ldec_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_decrypt_key:
+.size aesni_set_decrypt_key,.-aesni_set_decrypt_key
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
+.byte 0x48,0x83,0xEC,0x08
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz .Lenc_key_ret
+ testq %rdx,%rdx
+ jz .Lenc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je .L14rounds
+ cmpl $192,%esi
+ je .L12rounds
+ cmpl $128,%esi
+ jne .Lbad_keybits
+
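+# In the expansion below, each .byte 102,15,58,223,200,<rcon> encodes
+# aeskeygenassist $<rcon>,%xmm0,%xmm1 with the successive round
+# constants 1,2,4,...,0x1b,0x36; the .Lkey_expansion_128 helper folds
+# each result into the previous round key.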
+.L10rounds:
+ movl $9,%esi
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,64
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,128
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,27
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,54
+ call .Lkey_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,128
+ call .Lkey_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ movq $-2,%rax
+.Lenc_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ .byte 0xf3,0xc3
+.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long 6,6,6,0
+.Lincrement64:
+.long 1,0,0,0
+.Lxts_magic:
+.long 0x87,0,1,0
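+# Constants: .Lbswap_mask byte-reverses a counter for big-endian
+# arithmetic, .Lincrement32 advances the 32-bit CTR counters by the 6x
+# interleave factor, .Lincrement64 steps the 64-bit CCM counter, and
+# .Lxts_magic is the 0x87 reduction constant for the XTS tweak
+# schedule above.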
+
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
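+# (The .byte string above decodes to "AES for Intel AES-NI, CRYPTOGAMS
+# by <appro@openssl.org>".)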
+.align 64
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
new file mode 100644
index 0000000..0dbb194
--- /dev/null
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -0,0 +1,3069 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
+# [see below for
+# details].
+#
+# Performance.
+#
+# Given aes(enc|dec) instructions' latency, asymptotic performance for
+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+# processed with a 128-bit key. And given their throughput, asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# an asymptotic limit, it's not something you commonly achieve in
+# reality, but how close does one get? Below are results collected for
+# different modes and block sizes. Pairs of numbers are for en-/
+# decryption.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
+# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
+# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
+# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
+# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that 'openssl speed -evp aes-128-??? -engine aesni [-decrypt]', which
+# does go through EVP, will exhibit 10-15% worse results for smaller
+# blocks.
+# The results were collected with a specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All above results are consistently better. This
+# module also provides better performance for block sizes smaller than
+# 128 bytes, at points *not* represented in the above table.
+#
+# Looking at the results for 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because the implementation
+# uses the "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+# single-block aesni_encrypt, which is not the most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to the AES unit the way it's done in CBC mode. There
+# is nothing one can do and the result appears optimal. The CCM result
+# is identical to CBC, because CBC-MAC is essentially CBC encrypt
+# without saving output. The CCM CTR part "stays invisible," because
+# it's neatly interleaved with CBC-MAC. This provides ~30% improvement
+# over a "straightforward" CCM implementation with CTR and CBC-MAC
+# performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90%, of the "8-KB"
+# one. The CTR curve doesn't follow this pattern and is the one that
+# changes slowest, with the "256-byte" result at 87% of the "8-KB" one.
+# This is because the overhead in CTR mode is the most computationally
+# intensive. Small-block CCM decrypt is slower than encrypt, because
+# the first CTR and last CBC-MAC
+# iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with the number of
+# rounds for larger block sizes, i.e. the 192-bit result being 10/12
+# times lower and the 256-bit one 10/14. Well, in the CBC encrypt case
+# the differences are a tad smaller, because the above-mentioned
+# penalty biases all results by the same constant value. In a similar
+# way, function call
+# overhead affects small-block performance, as well as OFB and CFB
+# results. Differences are not large, most common coefficients are
+# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one can
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While Westmere processor features 6 cycles latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with
+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
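+#
+# (A quick sanity check of that arithmetic: 6 cycles x 10 rounds / 16
+# bytes = 3.75 cycles per byte on Westmere and 8 x 10 / 16 = 5.00 on
+# Sandy Bridge for serial CBC encrypt; with aes[enc|dec] issued every
+# other cycle on Westmere, fully parallel modes bottom out at
+# 2 x 10 / 16 = 1.25 cycles per byte.)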
+#
+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In which case asymptotic limit for such modes
+# can be obtained by dividing above mentioned numbers by AES
+# instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that optimal interleave factor is 3,
+# and that's where the "magic" number of 1.25 comes from. "Optimal
+# interleave factor" means that increase of interleave factor does
+# not improve performance. The formula has proven to reflect reality
+# pretty well on Westmere... Sandy Bridge on the other hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor 3x 6x 8x
+# theoretical asymptotic limit 1.67 0.83 0.625
+# measured performance for 8KB block 1.05 0.86 0.84
+#
+# "as if" interleave factor 4.7x 5.8x 6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt 1.16 0.93 0.93
+# CTR 1.14 0.91 n/a
+#
+# Well, given the 3x column, it's probably inappropriate to call the
+# limit asymptotic if it can be surpassed, isn't it? What happens there?
+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+# magic is responsible for this. Processor overlaps not only the
+# additional instructions with AES ones, but even AES instructions
+# processing adjacent triplets of independent blocks. In the 6x case
+# additional instructions still claim disproportionally small amount
+# of additional cycles, but in 8x case number of instructions must be
+# a tad too high for out-of-order logic to cope with, and AES unit
+# remains underutilized... As you can see, 8x interleave is hardly
+# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
+# utilizes 6x interleave because of limited register bank capacity.
+#
+# Higher interleave factors do have negative impact on Westmere
+# performance. While for ECB mode it's negligible, ~1.5%, other
+# parallelizable modes perform ~5% worse, which is outweighed by the
+# ~25% improvement on Sandy Bridge. To balance the regression on
+# Westmere, CTR mode was implemented with a 6x aesenc interleave
+# factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with a 128-bit key; Sandy Bridge - 0.97. Just
+# like in CTR mode, the AES instruction interleave factor was chosen
+# to be 6x.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+$code=".text\n";
+
+$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8"; # cbc, ctr, ...
+
+$rnds_="%r10d"; # backup copy for $rounds
+$key_="%r11"; # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0"; $rndkey1="%xmm1";
+$inout0="%xmm2"; $inout1="%xmm3";
+$inout2="%xmm4"; $inout3="%xmm5";
+$inout4="%xmm6"; $inout5="%xmm7";
+$inout6="%xmm8"; $inout7="%xmm9";
+
+$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+$in0="%xmm8"; $iv="%xmm9";
+
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why a folded loop? Because aes[enc|dec] is slow enough to accommodate
+# cycles which take care of loop variables...
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+___
+}}
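+# (With $p set to "enc" this is what emits the .Loop_enc1_<n> loops
+# visible throughout the pre-generated assembly earlier in this patch.)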
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why were 3x subroutines originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when the subroutine's throughput is virtually the
+# same as that of the non-interleaved subroutine [for up to 3 input
+# blocks]. This is why it makes no sense to implement a 2x subroutine.
+# aes[enc|dec] latency in the next processor generation is 8, but the
+# instructions can be scheduled every cycle. The optimal interleave for
+# the new processor is therefore 8x...
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] the 4-block case, by ~30%. One can
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout5
+ jmp .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+.L${dir}_loop6_enter: # happens to be 16-byte aligned
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+ jmp .L${dir}_loop8_enter
+.align 16
+.L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+.L${dir}_loop8_enter: # happens to be 16-byte aligned
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+ and \$-16,$len
+ jz .Lecb_ret
+
+ mov 240($key),$rounds # key->rounds
+ $movkey ($key),$rndkey0
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ test %r8d,%r8d # 5th argument
+ jz .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+ cmp \$0x80,$len
+ jb .Lecb_enc_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ sub \$0x80,$len
+ jnc .Lecb_enc_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_enc_one
+ movups 0x10($inp),$inout1
+ je .Lecb_enc_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_enc_three
+ movups 0x30($inp),$inout3
+ je .Lecb_enc_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_enc_five
+ movups 0x50($inp),$inout5
+ je .Lecb_enc_six
+ movdqu 0x60($inp),$inout6
+ call _aesni_encrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps $inout5,$inout5
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ jmp .Lecb_ret
+#--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+ cmp \$0x80,$len
+ jb .Lecb_dec_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ $movkey ($key_),$rndkey0
+ sub \$0x80,$len
+ jnc .Lecb_dec_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_dec_one
+ movups 0x10($inp),$inout1
+ je .Lecb_dec_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_dec_three
+ movups 0x30($inp),$inout3
+ je .Lecb_dec_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_dec_five
+ movups 0x50($inp),$inout5
+ je .Lecb_dec_six
+ movups 0x60($inp),$inout6
+ $movkey ($key),$rndkey0
+ call _aesni_decrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+
+.Lecb_ret:
+ ret
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{
+my $cmac="%r9"; # 6th argument
+
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_enc_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shr \$1,$rounds
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ mov $rounds,$rnds_
+ pshufb $bswap_mask,$iv
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov $rnds_,$rounds
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey ($key),$rndkey0
+
+.Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ dec $len
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+ jnz .Lccm64_enc_outer
+
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_dec_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ mov $rnds_,$rounds
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len
+ jz .Lccm64_dec_break
+
+ $movkey ($key_),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey ($key),$rndkey0
+
+.Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 16($inp),$inp
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out
+___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+{
+my $reserved = $win64?0:-0x28;
+my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
+my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
+my $bswap_mask="%xmm15";
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0xc8(%rsp),%rsp
+ movaps %xmm6,0x20(%rsp)
+ movaps %xmm7,0x30(%rsp)
+ movaps %xmm8,0x40(%rsp)
+ movaps %xmm9,0x50(%rsp)
+ movaps %xmm10,0x60(%rsp)
+ movaps %xmm11,0x70(%rsp)
+ movaps %xmm12,0x80(%rsp)
+ movaps %xmm13,0x90(%rsp)
+ movaps %xmm14,0xa0(%rsp)
+ movaps %xmm15,0xb0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+ cmp \$1,$len
+ je .Lctr32_one_shortcut
+
+ movdqu ($ivp),$ivec
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+ xor $rounds,$rounds
+ pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
+ pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
+
+ mov 240($key),$rounds # key->rounds
+ bswap $rnds_
+ pxor $iv0,$iv0 # vector of 3 32-bit counters
+ pxor $iv1,$iv1 # vector of 3 32-bit counters
+ pinsrd \$0,$rnds_,$iv0
+ lea 3($rnds_),$key_
+ pinsrd \$0,$key_,$iv1
+ inc $rnds_
+ pinsrd \$1,$rnds_,$iv0
+ inc $key_
+ pinsrd \$1,$key_,$iv1
+ inc $rnds_
+ pinsrd \$2,$rnds_,$iv0
+ inc $key_
+ pinsrd \$2,$key_,$iv1
+ movdqa $iv0,$reserved(%rsp)
+ pshufb $bswap_mask,$iv0
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ pshufb $bswap_mask,$iv1
+
+ pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
+ pshufd \$`2<<6`,$iv0,$inout1
+ pshufd \$`1<<6`,$iv0,$inout2
+ cmp \$6,$len
+ jb .Lctr32_tail
+ shr \$1,$rounds
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ sub \$6,$len
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout0 # merge counter-less ivec
+ $movkey ($key_),$rndkey0
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout1
+ $movkey 16($key_),$rndkey1
+ pshufd \$`1<<6`,$iv1,$inout5
+ por $ivec,$inout2
+ por $ivec,$inout3
+ xorps $rndkey0,$inout0
+ por $ivec,$inout4
+ por $ivec,$inout5
+
+ # inline _aesni_encrypt6 and interleave last rounds
+ # with own code...
+
+ pxor $rndkey0,$inout1
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ aesenc $rndkey1,$inout1
+ movdqa .Lincrement32(%rip),$iv1
+ pxor $rndkey0,$inout3
+ aesenc $rndkey1,$inout2
+ movdqa $reserved(%rsp),$iv0
+ pxor $rndkey0,$inout4
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lctr32_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lctr32_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ paddd $iv1,$iv0 # increment counter vector
+ aesenc $rndkey1,$inout1
+ paddd `$reserved+0x10`(%rsp),$iv1
+ aesenc $rndkey1,$inout2
+ movdqa $iv0,$reserved(%rsp) # save counter vector
+ aesenc $rndkey1,$inout3
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ aesenc $rndkey1,$inout4
+ pshufb $bswap_mask,$iv0 # byte swap
+ aesenc $rndkey1,$inout5
+ pshufb $bswap_mask,$iv1
+
+ aesenclast $rndkey0,$inout0
+ movups ($inp),$in0 # load input
+ aesenclast $rndkey0,$inout1
+ movups 0x10($inp),$in1
+ aesenclast $rndkey0,$inout2
+ movups 0x20($inp),$in2
+ aesenclast $rndkey0,$inout3
+ movups 0x30($inp),$in3
+ aesenclast $rndkey0,$inout4
+ movups 0x40($inp),$rndkey1
+ aesenclast $rndkey0,$inout5
+ movups 0x50($inp),$rndkey0
+ lea 0x60($inp),$inp
+
+ xorps $inout0,$in0 # xor
+ pshufd \$`3<<6`,$iv0,$inout0
+ xorps $inout1,$in1
+ pshufd \$`2<<6`,$iv0,$inout1
+ movups $in0,($out) # store output
+ xorps $inout2,$in2
+ pshufd \$`1<<6`,$iv0,$inout2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ xorps $inout5,$rndkey0
+ movups $rndkey1,0x40($out)
+ movups $rndkey0,0x50($out)
+ lea 0x60($out),$out
+ mov $rnds_,$rounds
+ sub \$6,$len
+ jnc .Lctr32_loop6
+
+ add \$6,$len
+ jz .Lctr32_done
+ mov $key_,$key # restore $key
+ lea 1($rounds,$rounds),$rounds # restore original value
+
+.Lctr32_tail:
+ por $ivec,$inout0
+ movups ($inp),$in0
+ cmp \$2,$len
+ jb .Lctr32_one
+
+ por $ivec,$inout1
+ movups 0x10($inp),$in1
+ je .Lctr32_two
+
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout2
+ movups 0x20($inp),$in2
+ cmp \$4,$len
+ jb .Lctr32_three
+
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout3
+ movups 0x30($inp),$in3
+ je .Lctr32_four
+
+ por $ivec,$inout4
+ xorps $inout5,$inout5
+
+ call _aesni_encrypt6
+
+ movups 0x40($inp),$rndkey1
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ movups $rndkey1,0x40($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups ($ivp),$inout0
+ movups ($inp),$in0
+ mov 240($key),$rounds # key->rounds
+.Lctr32_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps $inout0,$in0
+ movups $in0,($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ movups $in2,0x20($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ movups $in3,0x30($out)
+
+.Lctr32_done:
+___
+$code.=<<___ if ($win64);
+ movaps 0x20(%rsp),%xmm6
+ movaps 0x30(%rsp),%xmm7
+ movaps 0x40(%rsp),%xmm8
+ movaps 0x50(%rsp),%xmm9
+ movaps 0x60(%rsp),%xmm10
+ movaps 0x70(%rsp),%xmm11
+ movaps 0x80(%rsp),%xmm12
+ movaps 0x90(%rsp),%xmm13
+ movaps 0xa0(%rsp),%xmm14
+ movaps 0xb0(%rsp),%xmm15
+ lea 0xc8(%rsp),%rsp
+.Lctr32_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
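+# The tweak for block i is T_i = E_{key2}(iv) * x^i in GF(2^128): the
+# code generates T_0 by encrypting the IV under key2, then doubles it
+# once per block with the pshufd/pand/paddq/pxor ladder keyed by
+# .Lxts_magic.  A scalar sketch of the doubling, assuming two
+# little-endian 64-bit limbs and the reduction constant
+# 0x87 = x^7+x^2+x+1 (hypothetical helper, not part of this source):
+#
+#	void xts_double(uint64_t t[2]) {
+#		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
+#		t[1] = (t[1] << 1) | (t[0] >> 63);
+#		t[0] = (t[0] << 1) ^ carry;
+#	}
+#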
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x68 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240(%r8),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_enc_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+ # inline _aesni_encrypt6 and interleave first and last rounds
+ # with own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesenc $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lxts_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_enc_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenclast $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenclast $rndkey0,$inout3
+ aesenclast $rndkey0,$inout4
+ aesenclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_enc_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_enc_short:
+ add \$16*6,$len
+ jz .Lxts_enc_done
+
+ cmp \$0x20,$len
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmp \$0x40,$len
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_encrypt6
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ lea 16*1($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_encrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ and \$15,$len_
+ jz .Lxts_enc_ret
+ mov $len_,$len
+
+.Lxts_enc_steal:
+ movzb ($inp),%eax # borrow $rounds ...
+ movzb -16($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,-16($out)
+ mov %cl,0($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_enc_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups -16($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,-16($out)
+
+.Lxts_enc_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_enc_epilogue:
+ ret
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240($key2),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ xor %eax,%eax # if ($len%16) len-=16;
+ test \$15,$len
+ setnz %al
+ shl \$4,%rax
+ sub %rax,$len
+
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_dec_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+ # inline _aesni_decrypt6 and interleave first and last rounds
+ # with own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesdec $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesdec $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ dec $rounds
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+.Lxts_dec_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ lea 32($key),$key
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_dec_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdeclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdeclast $rndkey0,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdeclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdeclast $rndkey0,$inout3
+ aesdeclast $rndkey0,$inout4
+ aesdeclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_dec_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_dec_short:
+ add \$16*6,$len
+ jz .Lxts_dec_done
+
+ cmp \$0x20,$len
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmp \$0x40,$len
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_decrypt6
+
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ pxor $twtmp,$twtmp
+ movdqu $inout3,16*3($out)
+ pcmpgtd @tweak[5],$twtmp
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ pshufd \$0x13,$twtmp,@tweak[1] # $twres
+ and \$15,$len_
+ jz .Lxts_dec_ret
+
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,@tweak[1] # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ movdqa @tweak[2],@tweak[1]
+ lea 16*1($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[3],@tweak[1]
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movups ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movups 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_decrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ and \$15,$len_
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ mov $len_,$len
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($inp),$inout0
+ xorps @tweak[1],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[1],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp),%eax # borrow $rounds ...
+ movzb ($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,($out)
+ mov %cl,16($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_dec_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_dec_epilogue:
+ ret
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+} }}
+
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
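+#
+# For encryption each plaintext block is xored with the previous
+# ciphertext block (the IV for block 0) before being encrypted,
+# i.e. C_i = E_k(P_i ^ C_{i-1}); the xor is folded into the first AES
+# round inside aesni_generate1, which is why the xorps below is
+# commented out.  A scalar sketch with hypothetical helper names:
+#
+#	memcpy(iv, ivp, 16);
+#	for (i = 0; i < length/16; i++) {
+#		xor_block(tmp, inp + 16*i, iv);     /* tmp = P_i ^ C_{i-1} */
+#		aes_encrypt(key, tmp, out + 16*i);  /* C_i = E_k(tmp)      */
+#		memcpy(iv, out + 16*i, 16);
+#	}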
+{
+my $reserved = $win64?0x40:-0x18; # used in decrypt
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ test $len,$len # check length
+ jz .Lcbc_ret
+
+ mov 240($key),$rnds_ # key->rounds
+ mov $key,$key_ # backup $key
+ test %r9d,%r9d # 6th argument
+ jz .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+ movups ($ivp),$inout0 # load iv as initial state
+ mov $rnds_,$rounds
+ cmp \$16,$len
+ jb .Lcbc_enc_tail
+ sub \$16,$len
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups ($inp),$inout1 # load input
+ lea 16($inp),$inp
+ #xorps $inout1,$inout0
+___
+ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+ mov $rnds_,$rounds # restore $rounds
+ mov $key_,$key # restore $key
+ movups $inout0,0($out) # store output
+ lea 16($out),$out
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ add \$16,$len
+ jnz .Lcbc_enc_tail
+ movups $inout0,($ivp)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ mov $len,%rcx # zaps $key
+ xchg $inp,$out # $inp is %rsi and $out is %rdi now
+ .long 0x9066A4F3 # rep movsb
+ mov \$16,%ecx # zero tail
+ sub $len,%rcx
+ xor %eax,%eax
+ .long 0x9066AAF3 # rep stosb
+ lea -16(%rdi),%rdi # rewind $out by 1 block
+ mov $rnds_,$rounds # restore $rounds
+ mov %rdi,%rsi # $inp and $out are the same
+ mov $key_,$key # restore $key
+ xor $len,$len # len=16
+ jmp .Lcbc_enc_loop # one more spin
+#--------------------------- CBC DECRYPT ------------------------------#
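+ # (decryption, unlike encryption, is parallel: P_i = D_k(C_i) ^ C_{i-1}
+ # and every C_{i-1} is already in hand, so the main loop below runs
+ # eight blocks at a time and applies the xors after the fact)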
+.align 16
+.Lcbc_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lcbc_decrypt_body:
+___
+$code.=<<___;
+ movups ($ivp),$iv
+ mov $rnds_,$rounds
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_tail
+ shr \$1,$rnds_
+ sub \$0x70,$len
+ mov $rnds_,$rounds
+ movaps $iv,$reserved(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps $rndkey0,$reserved(%rsp) # save IV
+ movups $inout7,($out)
+ lea 0x10($out),$out
+.Lcbc_dec_loop8_enter:
+ $movkey ($key),$rndkey0
+ movups ($inp),$inout0 # load input
+ movups 0x10($inp),$inout1
+ $movkey 16($key),$rndkey1
+
+ lea 32($key),$key
+ movdqu 0x20($inp),$inout2
+ xorps $rndkey0,$inout0
+ movdqu 0x30($inp),$inout3
+ xorps $rndkey0,$inout1
+ movdqu 0x40($inp),$inout4
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ movdqu 0x50($inp),$inout5
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqu 0x60($inp),$inout6
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqu 0x70($inp),$inout7
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+
+ call .Ldec_loop8_enter
+
+ movups ($inp),$rndkey1 # re-load input
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$rndkey1
+ xorps $rndkey0,$inout6
+ movups 0x70($inp),$rndkey0 # IV
+ xorps $rndkey1,$inout7
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,0x40($out)
+ mov $key_,$key # restore $key
+ movups $inout5,0x50($out)
+ lea 0x80($inp),$inp
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
+ sub \$0x80,$len
+ ja .Lcbc_dec_loop8
+
+ movaps $inout7,$inout0
+ movaps $rndkey0,$iv
+ add \$0x70,$len
+ jle .Lcbc_dec_tail_collected
+ movups $inout0,($out)
+ lea 1($rnds_,$rnds_),$rounds
+ lea 0x10($out),$out
+.Lcbc_dec_tail:
+ movups ($inp),$inout0
+ movaps $inout0,$in0
+ cmp \$0x10,$len
+ jbe .Lcbc_dec_one
+
+ movups 0x10($inp),$inout1
+ movaps $inout1,$in1
+ cmp \$0x20,$len
+ jbe .Lcbc_dec_two
+
+ movups 0x20($inp),$inout2
+ movaps $inout2,$in2
+ cmp \$0x30,$len
+ jbe .Lcbc_dec_three
+
+ movups 0x30($inp),$inout3
+ cmp \$0x40,$len
+ jbe .Lcbc_dec_four
+
+ movups 0x40($inp),$inout4
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_five
+
+ movups 0x50($inp),$inout5
+ cmp \$0x60,$len
+ jbe .Lcbc_dec_six
+
+ movups 0x60($inp),$inout6
+ movaps $iv,$reserved(%rsp) # save IV
+ call _aesni_decrypt8
+ movups ($inp),$rndkey1
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$iv # IV
+ xorps $rndkey0,$inout6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ lea 0x60($out),$out
+ movaps $inout6,$inout0
+ sub \$0x70,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps $iv,$inout0
+ movaps $in0,$iv
+ sub \$0x10,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ movaps $in1,$iv
+ movaps $inout1,$inout0
+ lea 0x10($out),$out
+ sub \$0x20,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ movaps $in2,$iv
+ movaps $inout2,$inout0
+ lea 0x20($out),$out
+ sub \$0x30,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps $iv,$inout0
+ movups 0x30($inp),$iv
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ xorps $in2,$inout3
+ movups $inout2,0x20($out)
+ movaps $inout3,$inout0
+ lea 0x30($out),$out
+ sub \$0x40,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$iv
+ xorps $rndkey1,$inout4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ lea 0x40($out),$out
+ movaps $inout4,$inout0
+ sub \$0x50,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$rndkey0
+ xorps $rndkey1,$inout4
+ movups 0x50($inp),$iv
+ xorps $rndkey0,$inout5
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ lea 0x50($out),$out
+ movaps $inout5,$inout0
+ sub \$0x60,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ and \$15,$len
+ movups $iv,($ivp)
+ jnz .Lcbc_dec_tail_partial
+ movups $inout0,($out)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps $inout0,$reserved(%rsp)
+ mov \$16,%rcx
+ mov $out,%rdi
+ sub $len,%rcx
+ lea $reserved(%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+
+.Lcbc_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+___
+$code.=<<___;
+.Lcbc_ret:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+# int bits, AES_KEY *key)
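+#
+# ${PREFIX}_set_decrypt_key first builds the encrypt schedule, then
+# converts it in place: the round keys are reversed end-for-end and the
+# inner ones passed through aesimc (InvMixColumns), per the AES
+# equivalent-inverse-cipher construction.  Equivalent sketch
+# (hypothetical names, r = rounds):
+#
+#	dk[0] = ek[r]; dk[r] = ek[0];
+#	for (i = 1; i < r; i++)
+#		dk[i] = AESIMC(ek[r-i]);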
+{ my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ call __aesni_set_encrypt_key
+ shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+.Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ $movkey %xmm0,($inp)
+.Ldec_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_decrypt_key:
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+
+# This is based on a submission by
+#
+# Huang Ying
+# Vinodh Gopal
+# Kahraman Akdemir
+#
+# Aggressively optimized with respect to aeskeygenassist's critical path,
+# and confined to %xmm0-5 to meet the Win64 ABI requirement.
+#
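+# Each aeskeygenassist + .Lkey_expansion_* pair below implements one
+# step of the FIPS-197 recurrence, shown here for the 128-bit case
+# (w[] = 32-bit words of the schedule):
+#
+#	w[i] = w[i-4] ^ (i % 4 ? w[i-1]
+#	                       : SubWord(RotWord(w[i-1])) ^ rcon);
+#
+# the shufps/xorps ladder accumulates the running xor of the previous
+# words entirely inside the XMM registers; the 192- and 256-bit paths
+# follow the analogous Nk=6/8 recurrences.
+#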
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ lea 16($key),%rax
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+.L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax, %rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16($inp),%xmm2 # remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ mov \$-2,%rax
+.Lenc_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+ .long 6,6,6,0
+.Lincrement64:
+ .long 1,0,0,0
+.Lxts_magic:
+ .long 0x87,0,1,0
+
+.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
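+
+# Notes on the constants above: .Lbswap_mask feeds pshufb to convert
+# the big-endian CTR counter to host order and back; .Lincrement32
+# advances the 32-bit counter lanes of a CTR vector by 6, the number of
+# blocks each iteration of the main loop consumes; .Lincrement64 bumps
+# a 64-bit counter lane by one (used by the CCM code); and .Lxts_magic
+# holds the XTS reduction constant 0x87 (x^7+x^2+x+1) together with the
+# bit used to carry between the two 64-bit halves of the tweak.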
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
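+#
+# Each handler below checks whether context->Rip lies between the
+# function's prologue and epilogue labels; if so, it copies the
+# %xmm6-%xmm15 spill area from the stack back into the CONTEXT record
+# and recomputes Rsp, then falls through to .Lcommon_seh_tail, which
+# restores %rsi/%rdi and hands off to RtlVirtualUnwind to continue the
+# unwind.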
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type ecb_se_handler,\@abi-omnipotent
+.align 16
+ecb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+
+ jmp .Lcommon_seh_tail
+.size ecb_se_handler,.-ecb_se_handler
+
+.type ccm64_se_handler,\@abi-omnipotent
+.align 16
+ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ccm64_se_handler,.-ccm64_se_handler
+
+.type ctr32_se_handler,\@abi-omnipotent
+.align 16
+ctr32_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lctr32_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lctr32_ret(%rip),%r10
+ cmp %r10,%rbx
+ jae .Lcommon_seh_tail
+
+ lea 0x20(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xc8(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ctr32_se_handler,.-ctr32_se_handler
+
+.type xts_se_handler,\@abi-omnipotent
+.align 16
+xts_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0x60(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x68+160(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size xts_se_handler,.-xts_se_handler
+___
+$code.=<<___;
+.type cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_decrypt(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ lea .Lcbc_decrypt_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<cbc decrypt body
+ jb .Lrestore_cbc_rax
+
+ lea .Lcbc_ret(%rip),%r10
+ cmp %r10,%rbx # context->Rip>="epilogue" label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # top of stack
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+ mov 120($context),%rax
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size cbc_se_handler,.-cbc_se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ecb_encrypt
+ .rva .LSEH_end_aesni_ecb_encrypt
+ .rva .LSEH_info_ecb
+
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+
+ .rva .LSEH_begin_aesni_xts_encrypt
+ .rva .LSEH_end_aesni_xts_encrypt
+ .rva .LSEH_info_xts_enc
+
+ .rva .LSEH_begin_aesni_xts_decrypt
+ .rva .LSEH_end_aesni_xts_decrypt
+ .rva .LSEH_info_xts_dec
+___
+$code.=<<___;
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_cbc
+
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+ .byte 9,0,0,0
+ .rva ecb_se_handler
+.LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+.LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+.LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr32_se_handler
+.LSEH_info_xts_enc:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.LSEH_info_xts_dec:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.LSEH_info_cbc:
+ .byte 9,0,0,0
+ .rva cbc_se_handler
+.LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
+___
+}
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
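+
+# The s///gem pass below feeds every AES-NI mnemonic through aesni(),
+# which re-encodes matching instructions as raw bytes for assemblers
+# that predate these mnemonics, e.g. the hypothetical call
+#
+#	aesni("aesenc\t%xmm1,%xmm0")
+#
+# returns ".byte\t102,15,56,220,193", i.e. 66 0f 38 dc c1.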
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/aes/asm/bsaes-x86_64.S b/crypto/aes/asm/bsaes-x86_64.S
new file mode 100644
index 0000000..6ceb3da
--- /dev/null
+++ b/crypto/aes/asm/bsaes-x86_64.S
@@ -0,0 +1,2561 @@
+.text
+
+
+
+
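+# _bsaes_encrypt8 processes eight blocks (in %xmm15,%xmm0-%xmm6) in
+# bit-sliced form: the psrlq/pand/pxor/psllq ladders transpose the
+# eight 128-bit blocks so that each register holds one bit plane of all
+# eight, after which the AES S-box and linear layers are evaluated with
+# plain boolean SSE ops, one bit-sliced round key loaded from (%rax)
+# per round and the round count in %r10d.  The ".byte 102,..,15,56,0,.."
+# sequences are pshufb instructions emitted numerically for old
+# assemblers.
+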
+.type _bsaes_encrypt8,@function
+.align 64
+_bsaes_encrypt8:
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa 80(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Lenc_sbox:
+ pxor %xmm5,%xmm4
+ pxor %xmm0,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm1,%xmm5
+ pxor %xmm15,%xmm4
+
+ pxor %xmm2,%xmm5
+ pxor %xmm6,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm3,%xmm2
+ pxor %xmm4,%xmm3
+ pxor %xmm0,%xmm2
+
+ pxor %xmm6,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm10
+ movdqa %xmm0,%xmm9
+ movdqa %xmm4,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm11
+
+ pxor %xmm3,%xmm10
+ pxor %xmm1,%xmm9
+ pxor %xmm2,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm3,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm15,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm2,%xmm11
+ pxor %xmm15,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm6,%xmm12
+ movdqa %xmm4,%xmm11
+ pxor %xmm0,%xmm12
+ pxor %xmm5,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm1,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm3,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm0,%xmm13
+ pand %xmm2,%xmm11
+ movdqa %xmm6,%xmm14
+ pand %xmm15,%xmm12
+ pand %xmm4,%xmm13
+ por %xmm5,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm5,%xmm11
+ movdqa %xmm4,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm5,%xmm9
+ pxor %xmm4,%xmm5
+ pand %xmm14,%xmm4
+ pand %xmm13,%xmm5
+ pxor %xmm4,%xmm5
+ pxor %xmm9,%xmm4
+ pxor %xmm15,%xmm11
+ pxor %xmm2,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm2,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm2
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm2,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm11,%xmm5
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm2
+
+ movdqa %xmm6,%xmm11
+ movdqa %xmm0,%xmm7
+ pxor %xmm3,%xmm11
+ pxor %xmm1,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm3,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm1,%xmm3
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm1
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm3
+ pxor %xmm11,%xmm7
+ pxor %xmm1,%xmm3
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm1
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm6,%xmm10
+ pxor %xmm0,%xmm6
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm6
+ pxor %xmm0,%xmm6
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm6
+ pxor %xmm11,%xmm3
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm1
+ pxor %xmm15,%xmm6
+ pxor %xmm5,%xmm0
+ pxor %xmm6,%xmm3
+ pxor %xmm15,%xmm5
+ pxor %xmm0,%xmm15
+
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ pxor %xmm2,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm4,%xmm3
+
+ pxor %xmm2,%xmm5
+ decl %r10d
+ jl .Lenc_done
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm3,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm5,%xmm10
+ pxor %xmm9,%xmm3
+ pshufd $147,%xmm2,%xmm11
+ pxor %xmm10,%xmm5
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm2
+ pshufd $147,%xmm1,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm1
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm2,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm7
+ pxor %xmm1,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm3,%xmm10
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm1,%xmm5
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm12,%xmm8
+
+ pxor %xmm10,%xmm2
+ pxor %xmm14,%xmm6
+ pxor %xmm13,%xmm5
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm1
+ movdqa %xmm8,%xmm4
+ movdqa 48(%r11),%xmm7
+ jnz .Lenc_loop
+ movdqa 64(%r11),%xmm7
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm2,%xmm10
+ psrlq $1,%xmm2
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm2
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm2
+ pxor %xmm1,%xmm4
+ psllq $1,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $1,%xmm2
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm2
+ movdqa %xmm3,%xmm9
+ psrlq $1,%xmm3
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm5,%xmm3
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm15
+ pxor %xmm3,%xmm5
+ psllq $1,%xmm3
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm2,%xmm10
+ psrlq $2,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm1,%xmm2
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm2
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm2,%xmm1
+ psllq $2,%xmm2
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm2
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm5,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm5
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm5,%xmm9
+ psrlq $4,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $4,%xmm3
+ pxor %xmm4,%xmm5
+ pxor %xmm1,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm4
+ psllq $4,%xmm5
+ pxor %xmm3,%xmm1
+ psllq $4,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm2,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm2
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,@function
+.align 64
+_bsaes_decrypt8:
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa -48(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Ldec_sbox:
+ pxor %xmm3,%xmm2
+
+ pxor %xmm6,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm3,%xmm5
+ pxor %xmm5,%xmm6
+ pxor %xmm6,%xmm0
+
+ pxor %xmm0,%xmm15
+ pxor %xmm4,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm15,%xmm4
+ pxor %xmm2,%xmm0
+ movdqa %xmm2,%xmm10
+ movdqa %xmm6,%xmm9
+ movdqa %xmm0,%xmm8
+ movdqa %xmm3,%xmm12
+ movdqa %xmm4,%xmm11
+
+ pxor %xmm15,%xmm10
+ pxor %xmm3,%xmm9
+ pxor %xmm5,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm15,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm1,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm5,%xmm11
+ pxor %xmm1,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm2,%xmm12
+ movdqa %xmm0,%xmm11
+ pxor %xmm6,%xmm12
+ pxor %xmm4,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm3,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm15,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm6,%xmm13
+ pand %xmm5,%xmm11
+ movdqa %xmm2,%xmm14
+ pand %xmm1,%xmm12
+ pand %xmm0,%xmm13
+ por %xmm4,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm4,%xmm11
+ movdqa %xmm0,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm4,%xmm9
+ pxor %xmm0,%xmm4
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm4
+ pxor %xmm0,%xmm4
+ pxor %xmm9,%xmm0
+ pxor %xmm1,%xmm11
+ pxor %xmm5,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm1,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm5,%xmm1
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm5
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm1
+ pxor %xmm11,%xmm7
+ pxor %xmm5,%xmm1
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm5
+ pxor %xmm11,%xmm4
+ pxor %xmm11,%xmm1
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm5
+
+ movdqa %xmm2,%xmm11
+ movdqa %xmm6,%xmm7
+ pxor %xmm15,%xmm11
+ pxor %xmm3,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm3,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm3
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm3,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm3
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm2,%xmm10
+ pxor %xmm6,%xmm2
+ pand %xmm14,%xmm6
+ pand %xmm13,%xmm2
+ pxor %xmm6,%xmm2
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm3
+ pxor %xmm6,%xmm0
+ pxor %xmm4,%xmm5
+
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm6,%xmm4
+ pxor %xmm1,%xmm3
+ pxor %xmm15,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm5,%xmm2
+ pxor %xmm0,%xmm5
+ pxor %xmm3,%xmm2
+
+ pxor %xmm15,%xmm3
+ pxor %xmm2,%xmm6
+ decl %r10d
+ jl .Ldec_done
+
+ pshufd $147,%xmm4,%xmm14
+ movdqa %xmm5,%xmm9
+ pxor %xmm6,%xmm4
+ pxor %xmm6,%xmm5
+ pshufd $147,%xmm15,%xmm7
+ movdqa %xmm6,%xmm12
+ pxor %xmm15,%xmm6
+ pxor %xmm0,%xmm15
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm5,%xmm0
+ pxor %xmm2,%xmm15
+ pxor %xmm3,%xmm0
+ pshufd $147,%xmm3,%xmm10
+ pxor %xmm15,%xmm5
+ pxor %xmm4,%xmm3
+ pxor %xmm2,%xmm4
+ pshufd $147,%xmm2,%xmm13
+ movdqa %xmm1,%xmm11
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm3
+ pxor %xmm12,%xmm2
+ pxor %xmm9,%xmm3
+ pxor %xmm11,%xmm3
+ pshufd $147,%xmm12,%xmm12
+
+ pxor %xmm4,%xmm6
+ pxor %xmm7,%xmm4
+ pxor %xmm8,%xmm6
+ pshufd $147,%xmm9,%xmm9
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm6
+ pxor %xmm14,%xmm4
+ pshufd $147,%xmm11,%xmm11
+ pxor %xmm13,%xmm14
+ pxor %xmm4,%xmm6
+
+ pxor %xmm7,%xmm5
+ pshufd $147,%xmm7,%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm15
+ pshufd $147,%xmm8,%xmm8
+ pxor %xmm9,%xmm5
+ pxor %xmm9,%xmm3
+ pxor %xmm14,%xmm15
+ pshufd $147,%xmm9,%xmm9
+ pxor %xmm10,%xmm5
+ pxor %xmm10,%xmm1
+ pxor %xmm10,%xmm0
+ pshufd $147,%xmm10,%xmm10
+ pxor %xmm11,%xmm2
+ pxor %xmm11,%xmm3
+ pxor %xmm14,%xmm2
+ pxor %xmm12,%xmm5
+ pxor %xmm11,%xmm0
+ pxor %xmm12,%xmm14
+
+ pxor %xmm14,%xmm3
+ pshufd $147,%xmm11,%xmm11
+ pxor %xmm14,%xmm1
+ pxor %xmm14,%xmm0
+
+ pxor %xmm12,%xmm14
+ pshufd $147,%xmm12,%xmm12
+ pxor %xmm13,%xmm14
+
+
+ pxor %xmm2,%xmm0
+ pxor %xmm11,%xmm2
+ pshufd $147,%xmm13,%xmm13
+ pxor %xmm7,%xmm15
+ pxor %xmm12,%xmm2
+ pxor %xmm9,%xmm15
+ pshufd $147,%xmm14,%xmm14
+
+ pxor %xmm6,%xmm5
+ pxor %xmm8,%xmm6
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm5
+ pxor %xmm12,%xmm6
+ pxor %xmm12,%xmm4
+ pxor %xmm14,%xmm6
+ pshufd $147,%xmm7,%xmm7
+ pxor %xmm13,%xmm4
+ pxor %xmm6,%xmm5
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm8,%xmm8
+
+ pxor %xmm14,%xmm2
+ pxor %xmm9,%xmm0
+ pxor %xmm9,%xmm3
+ pshufd $147,%xmm9,%xmm9
+ pxor %xmm13,%xmm15
+ pxor %xmm10,%xmm13
+ pxor %xmm2,%xmm0
+ pxor %xmm13,%xmm5
+
+ pxor %xmm13,%xmm1
+ pxor %xmm12,%xmm3
+ pxor %xmm11,%xmm1
+ pshufd $147,%xmm11,%xmm11
+ pxor %xmm13,%xmm3
+ pxor %xmm14,%xmm1
+ pxor %xmm10,%xmm13
+
+ pshufd $147,%xmm12,%xmm12
+ pshufd $147,%xmm13,%xmm13
+ pshufd $147,%xmm14,%xmm14
+ pshufd $147,%xmm10,%xmm10
+
+
+ pxor %xmm6,%xmm0
+ pxor %xmm6,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm8
+ pxor %xmm7,%xmm5
+ pxor %xmm4,%xmm7
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm2
+ pxor %xmm0,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm5,%xmm10
+ pxor %xmm9,%xmm3
+ pxor %xmm15,%xmm9
+ pxor %xmm14,%xmm10
+ pxor %xmm3,%xmm12
+ pxor %xmm13,%xmm9
+ pxor %xmm13,%xmm12
+ pxor %xmm1,%xmm13
+ pxor %xmm2,%xmm14
+
+ movdqa %xmm7,%xmm15
+ movdqa %xmm8,%xmm0
+ movdqa %xmm9,%xmm1
+ movdqa %xmm10,%xmm2
+ movdqa %xmm11,%xmm3
+ movdqa %xmm12,%xmm4
+ movdqa %xmm13,%xmm5
+ movdqa %xmm14,%xmm6
+ movdqa -16(%r11),%xmm7
+ jnz .Ldec_loop
+ movdqa -32(%r11),%xmm7
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm2,%xmm9
+ psrlq $1,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $1,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm6,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm4
+ psllq $1,%xmm2
+ pxor %xmm1,%xmm6
+ psllq $1,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm3,%xmm5
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm15
+ pxor %xmm5,%xmm3
+ psllq $1,%xmm5
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm1,%xmm10
+ psrlq $2,%xmm1
+ pxor %xmm4,%xmm6
+ pxor %xmm2,%xmm1
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm1
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm1,%xmm2
+ psllq $2,%xmm1
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm3
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm5
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm3,%xmm9
+ psrlq $4,%xmm3
+ movdqa %xmm5,%xmm10
+ psrlq $4,%xmm5
+ pxor %xmm4,%xmm3
+ pxor %xmm2,%xmm5
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $4,%xmm3
+ pxor %xmm5,%xmm2
+ psllq $4,%xmm5
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm5
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+.type _bsaes_key_convert,@function
+.align 16
+_bsaes_key_convert:
+ leaq .Lmasks(%rip),%r11
+ movdqu (%rcx),%xmm7
+ leaq 16(%rcx),%rcx
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ movdqa 48(%r11),%xmm3
+ movdqa 64(%r11),%xmm4
+ pcmpeqd %xmm5,%xmm5
+
+ movdqu (%rcx),%xmm6
+ movdqa %xmm7,(%rax)
+ leaq 16(%rax),%rax
+ decl %r10d
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+.byte 102,15,56,0,244
+
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+
+ pand %xmm6,%xmm8
+ pand %xmm6,%xmm9
+ movdqa %xmm2,%xmm10
+ pcmpeqb %xmm0,%xmm8
+ psllq $4,%xmm0
+ movdqa %xmm3,%xmm11
+ pcmpeqb %xmm1,%xmm9
+ psllq $4,%xmm1
+
+ pand %xmm6,%xmm10
+ pand %xmm6,%xmm11
+ movdqa %xmm0,%xmm12
+ pcmpeqb %xmm2,%xmm10
+ psllq $4,%xmm2
+ movdqa %xmm1,%xmm13
+ pcmpeqb %xmm3,%xmm11
+ psllq $4,%xmm3
+
+ movdqa %xmm2,%xmm14
+ movdqa %xmm3,%xmm15
+ pxor %xmm5,%xmm8
+ pxor %xmm5,%xmm9
+
+ pand %xmm6,%xmm12
+ pand %xmm6,%xmm13
+ movdqa %xmm8,0(%rax)
+ pcmpeqb %xmm0,%xmm12
+ psrlq $4,%xmm0
+ movdqa %xmm9,16(%rax)
+ pcmpeqb %xmm1,%xmm13
+ psrlq $4,%xmm1
+ leaq 16(%rcx),%rcx
+
+ pand %xmm6,%xmm14
+ pand %xmm6,%xmm15
+ movdqa %xmm10,32(%rax)
+ pcmpeqb %xmm2,%xmm14
+ psrlq $4,%xmm2
+ movdqa %xmm11,48(%rax)
+ pcmpeqb %xmm3,%xmm15
+ psrlq $4,%xmm3
+ movdqu (%rcx),%xmm6
+
+ pxor %xmm5,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm12,64(%rax)
+ movdqa %xmm13,80(%rax)
+ movdqa %xmm14,96(%rax)
+ movdqa %xmm15,112(%rax)
+ leaq 128(%rax),%rax
+ decl %r10d
+ jnz .Lkey_loop
+
+ movdqa 80(%r11),%xmm7
+
+ .byte 0xf3,0xc3
+.size _bsaes_key_convert,.-_bsaes_key_convert
+
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,@function
+.align 16
+bsaes_cbc_encrypt:
+ cmpl $0,%r9d
+ jne asm_AES_cbc_encrypt
+ cmpq $128,%rdx
+ jb asm_AES_cbc_encrypt
+
+ movq %rsp,%rax
+.Lcbc_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movq %r8,%rbx
+ shrq $4,%r14
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx),%xmm14
+ subq $8,%r14
+.Lcbc_dec_loop:
+ movdqu 0(%r12),%xmm15
+ movdqu 16(%r12),%xmm0
+ movdqu 32(%r12),%xmm1
+ movdqu 48(%r12),%xmm2
+ movdqu 64(%r12),%xmm3
+ movdqu 80(%r12),%xmm4
+ movq %rsp,%rax
+ movdqu 96(%r12),%xmm5
+ movl %edx,%r10d
+ movdqu 112(%r12),%xmm6
+ movdqa %xmm14,32(%rbp)
+
+ call _bsaes_decrypt8
+
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm2
+ movdqu 112(%r12),%xmm14
+ pxor %xmm13,%xmm4
+ movdqu %xmm15,0(%r13)
+ leaq 128(%r12),%r12
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ subq $8,%r14
+ jnc .Lcbc_dec_loop
+
+ addq $8,%r14
+ jz .Lcbc_dec_done
+
+ movdqu 0(%r12),%xmm15
+ movq %rsp,%rax
+ movl %edx,%r10d
+ cmpq $2,%r14
+ jb .Lcbc_dec_one
+ movdqu 16(%r12),%xmm0
+ je .Lcbc_dec_two
+ movdqu 32(%r12),%xmm1
+ cmpq $4,%r14
+ jb .Lcbc_dec_three
+ movdqu 48(%r12),%xmm2
+ je .Lcbc_dec_four
+ movdqu 64(%r12),%xmm3
+ cmpq $6,%r14
+ jb .Lcbc_dec_five
+ movdqu 80(%r12),%xmm4
+ je .Lcbc_dec_six
+ movdqu 96(%r12),%xmm5
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm14
+ pxor %xmm12,%xmm2
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm14
+ pxor %xmm11,%xmm6
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm14
+ pxor %xmm10,%xmm1
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm14
+ pxor %xmm9,%xmm3
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm14
+ pxor %xmm8,%xmm5
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm14
+ pxor %xmm7,%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ leaq (%r12),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm14
+ movdqu %xmm14,(%r13)
+ movdqa %xmm15,%xmm14
+
+.Lcbc_dec_done:
+ movdqu %xmm14,(%rbx)
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lcbc_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lcbc_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lcbc_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,@function
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ movq %rsp,%rax
+.Lctr_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movdqu (%r8),%xmm0
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqa %xmm0,32(%rbp)
+ cmpq $8,%rdx
+ jb .Lctr_enc_short
+
+ movl %eax,%ebx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %ebx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ movdqa (%rsp),%xmm8
+ leaq .LADD1(%rip),%r11
+ movdqa 32(%rbp),%xmm15
+ movdqa -32(%r11),%xmm7
+.byte 102,68,15,56,0,199
+.byte 102,68,15,56,0,255
+ movdqa %xmm8,(%rsp)
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa %xmm15,32(%rbp)
+ movdqa %xmm15,%xmm0
+ movdqa %xmm15,%xmm1
+ paddd 0(%r11),%xmm0
+ movdqa %xmm15,%xmm2
+ paddd 16(%r11),%xmm1
+ movdqa %xmm15,%xmm3
+ paddd 32(%r11),%xmm2
+ movdqa %xmm15,%xmm4
+ paddd 48(%r11),%xmm3
+ movdqa %xmm15,%xmm5
+ paddd 64(%r11),%xmm4
+ movdqa %xmm15,%xmm6
+ paddd 80(%r11),%xmm5
+ paddd 96(%r11),%xmm6
+
+
+
+ movdqa (%rsp),%xmm8
+ leaq 16(%rsp),%rax
+ movdqa -16(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+ leaq .LBS0(%rip),%r11
+.byte 102,15,56,0,247
+ movl %ebx,%r10d
+
+ call _bsaes_encrypt8_bitslice
+
+ subq $8,%r14
+ jc .Lctr_enc_loop_done
+
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ movdqu 32(%r12),%xmm9
+ movdqu 48(%r12),%xmm10
+ movdqu 64(%r12),%xmm11
+ movdqu 80(%r12),%xmm12
+ movdqu 96(%r12),%xmm13
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ pxor %xmm15,%xmm7
+ movdqa 32(%rbp),%xmm15
+ pxor %xmm8,%xmm0
+ movdqu %xmm7,0(%r13)
+ pxor %xmm9,%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor %xmm10,%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor %xmm11,%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor %xmm12,%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor %xmm13,%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor %xmm14,%xmm4
+ movdqu %xmm1,96(%r13)
+ leaq .LADD1(%rip),%r11
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ paddd 112(%r11),%xmm15
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ addq $8,%r14
+ movdqu 0(%r12),%xmm7
+ pxor %xmm7,%xmm15
+ movdqu %xmm15,0(%r13)
+ cmpq $2,%r14
+ jb .Lctr_enc_done
+ movdqu 16(%r12),%xmm8
+ pxor %xmm8,%xmm0
+ movdqu %xmm0,16(%r13)
+ je .Lctr_enc_done
+ movdqu 32(%r12),%xmm9
+ pxor %xmm9,%xmm3
+ movdqu %xmm3,32(%r13)
+ cmpq $4,%r14
+ jb .Lctr_enc_done
+ movdqu 48(%r12),%xmm10
+ pxor %xmm10,%xmm5
+ movdqu %xmm5,48(%r13)
+ je .Lctr_enc_done
+ movdqu 64(%r12),%xmm11
+ pxor %xmm11,%xmm2
+ movdqu %xmm2,64(%r13)
+ cmpq $6,%r14
+ jb .Lctr_enc_done
+ movdqu 80(%r12),%xmm12
+ pxor %xmm12,%xmm6
+ movdqu %xmm6,80(%r13)
+ je .Lctr_enc_done
+ movdqu 96(%r12),%xmm13
+ pxor %xmm13,%xmm1
+ movdqu %xmm1,96(%r13)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ leaq 32(%rbp),%rdi
+ leaq 48(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ movdqu (%r12),%xmm0
+ leaq 16(%r12),%r12
+ movl 44(%rbp),%eax
+ bswapl %eax
+ pxor 48(%rbp),%xmm0
+ incl %eax
+ movdqu %xmm0,(%r13)
+ bswapl %eax
+ leaq 16(%r13),%r13
+ movl %eax,44(%rsp)
+ decq %r14
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lctr_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lctr_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lctr_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,@function
+.align 16
+bsaes_xts_encrypt:
+ movq %rsp,%rax
+.Lxts_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ andq $-16,%r14
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm1,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ addq $128,%r14
+ jz .Lxts_enc_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_enc_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_enc_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_enc_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_enc_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_enc_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_enc_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm1,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ movdqu %xmm2,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ movdqu %xmm5,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm3,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_enc_done:
+ andl $15,%ebx
+ jz .Lxts_enc_ret
+ movq %r13,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%r12),%eax
+ movzbl -16(%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,-16(%rdx)
+ movb %cl,0(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16(%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm6
+ movdqu %xmm6,-16(%r13)
+
+.Lxts_enc_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,@function
+.align 16
+bsaes_xts_decrypt:
+ movq %rsp,%rax
+.Lxts_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ xorl %eax,%eax
+ andq $-16,%r14
+ testl $15,%ebx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%r14
+
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ addq $128,%r14
+ jz .Lxts_dec_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_dec_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_dec_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_dec_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_dec_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_dec_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_dec_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_dec_done:
+ andl $15,%ebx
+ jz .Lxts_dec_ret
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ movdqa %xmm6,%xmm5
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ movdqu (%r12),%xmm15
+ pxor %xmm13,%xmm6
+
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm6
+ movq %r13,%rdx
+ movdqu %xmm6,(%r13)
+
+.Lxts_dec_steal:
+ movzbl 16(%r12),%eax
+ movzbl (%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,(%rdx)
+ movb %cl,16(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu (%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm5,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm5
+ movdqu %xmm5,(%r13)
+
+.Lxts_dec_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+.type _bsaes_const,@object
+.align 64
+_bsaes_const:
+.LM0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:
+.quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+.quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+.quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:
+.quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+.quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+.quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+.quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+.quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+.quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+.quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+.quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+.long 0x87,0,1,0
+.Lmasks:
+.quad 0x0101010101010101, 0x0101010101010101
+.quad 0x0202020202020202, 0x0202020202020202
+.quad 0x0404040404040404, 0x0404040404040404
+.quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+.quad 0x6363636363636363, 0x6363636363636363
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.align 64
+.size _bsaes_const,.-_bsaes_const
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 0000000..ceb02b5
--- /dev/null
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3045 @@
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode] ###
+### bitsliced implementation for Intel Core 2 processors ###
+### requires support of SSE extensions up to SSSE3 ###
+### Author: Emilia Käsper and Peter Schwabe ###
+### Date: 2009-03-19 ###
+### Public domain ###
+### ###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
+### further information. ###
+###################################################################
+#
+# September 2011.
+#
+# The original code started as a transliteration to "perlasm" and has
+# since undergone the following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop, resulting in a >5x size reduction
+# from 12.5KB to 2.2KB;
+# - the above was possible thanks to a mixcolumns() modification that
+# allows its output to be fed back to aesenc[last]; this was achieved
+# at the cost of two additional inter-register moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement a key setup subroutine; instead it
+# relies on conversion of the "conventional" key schedule as returned
+# by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which made it
+# possible to skip one shiftrows(), reduce the bit-sliced key schedule
+# and speed up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# The resulting performance, in CPU cycles spent to encrypt one byte
+# of a 4096-byte buffer with a 128-bit key, is:
+#
+# Emilia's this(*) difference
+#
+# Core 2 9.30 8.69 +7%
+# Nehalem(**) 7.63 6.98 +9%
+# Atom 17.1 17.4 -2%(***)
+#
+# (*)	The comparison is not completely fair, because "this" is ECB,
+#	i.e. no extra processing such as counter value calculation and
+#	xor-ing of the input, as in Emilia's CTR implementation, is
+#	performed. However, the CTR calculations account for no more
+#	than 1% of total time, so the comparison is *rather* fair.
+#
+# (**) Results were collected on Westmere, which is considered to
+# be equivalent to Nehalem for this code.
+#
+# (***) The slowdown on Atom is rather strange per se, because the
+#	original implementation has a number of 9+-byte instructions,
+#	which are bad for the Atom front-end, and which I eliminated
+#	completely. In an attempt to address the deterioration, sbox()
+#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
+#	xorps instead of pxor, etc.). While this resulted in a nominal
+#	4% improvement on Atom, it hurt Westmere by more than a 2x factor.
+#
+# As for the key schedule conversion subroutine: the OpenSSL interface
+# relies on per-invocation on-the-fly conversion. This naturally has
+# an impact on performance, especially for short inputs. Conversion
+# time in CPU cycles, and its ratio to the CPU cycles spent in the 8x
+# block function, is:
+#
+# conversion conversion/8x block
+# Core 2 240 0.22
+# Nehalem 180 0.20
+# Atom 430 0.19
+#
+# The ratio values mean that 128-byte inputs will be processed 16-18%
+# slower, 256-byte inputs 9-10% slower, 384-byte inputs 6-7% slower,
+# etc. Also keep in mind that input sizes not divisible by 128 are
+# *effectively* slower still, especially the shortest ones: e.g.
+# consecutive 144-byte blocks are processed 44% slower than one would
+# expect, 272-byte ones 29%, 400-byte ones 22%, etc. Yet, despite all
+# these "shortcomings", it's still faster than the ["hyper-threading-safe"
+# code path in] aes-x86_64.pl on all lengths above 64 bytes...
+#
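+# A hedged sketch of the arithmetic behind the figures above, assuming
+# the simple cost model total = conversion + k*(8x block), where k is
+# the input length in 128-byte units and r the conversion/8x ratio
+# (illustrative helper only, not part of this module):
+#
+#	sub conversion_overhead {
+#		my ($bytes,$r)=@_;
+#		my $k=$bytes/128;	# number of 8x-block invocations
+#		return $r/($k+$r);	# e.g. 0.22/(1+0.22), ~18% on Core 2
+#	}
+#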
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte of a 4096-byte buffer with a 128-bit key is:
+#
+# Core 2 11.0
+# Nehalem 9.16
+# Atom 20.9
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Performance on blocks of less than 80
+# bytes is suboptimal, but XTS is meant to be used with larger
+# blocks...
+#
+#
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
+my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
+my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[5]
+ pxor @b[1], @b[2]
+ pxor @b[0], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[0], @b[5]
+
+ pxor @b[3], @b[6]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[1], @b[3]
+
+ pxor @b[7], @b[2]
+ pxor @b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[1], @b[6]
+
+ pxor @b[5], @b[1]
+ pxor @b[3], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+
+ pxor @b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ pxor @b[7], @b[4]
+
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+ pxor @b[7], @b[3]
+ pxor @b[3], @b[5]
+ pxor @b[5], @b[1]
+
+ pxor @b[1], @b[6]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ pxor @b[5], @b[1]
+ pxor @b[7], @b[2]
+
+ pxor @b[1], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[0], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[1], @b[2]
+ pxor @b[3], @b[6]
+
+ pxor @b[0], @b[3]
+ pxor @b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
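+# Net effect of the sequence below, traced from the and/xor chain
+# (consistent with a GF(2^2) product in a normal basis):
+#	x0' = x0&y1 ^ x1&y1 ^ x1&y0
+#	x1' = x0&y0 ^ x0&y1 ^ x1&y0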
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x1, $x0
+ pxor $t0, $x1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x0, $x1
+ pxor $t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ movdqa $y2, $t1
+ pxor $y1, $t0
+ pxor $y3, $t1
+ pand $x0, $t0
+ pand $x2, $t1
+ pxor $x1, $x0
+ pxor $x3, $x2
+ pand $y0, $x1
+ pand $y2, $x3
+ pand $y1, $x0
+ pand $y3, $x2
+ pxor $x0, $x1
+ pxor $x3, $x2
+ pxor $t0, $x0
+ pxor $t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ movdqa @x[0], @t[0]
+ movdqa @x[1], @t[1]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+ pxor @x[2], @t[0]
+ pxor @x[3], @t[1]
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @t[0], @x[0]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[1]
+ pxor @t[1], @x[3]
+
+ movdqa @x[4], @t[0]
+ movdqa @x[5], @t[1]
+ pxor @x[6], @t[0]
+ pxor @x[7], @t[1]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+ pxor @t[0], @x[4]
+ pxor @t[0], @x[6]
+ pxor @t[1], @x[5]
+ pxor @t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
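+# The inversion follows the tower-field decomposition of GF(2^8) down
+# to GF(2^2) used in the original Käsper-Schwabe code: Mul_GF4 and
+# Mul_GF16_2 above are the subfield multiplications, and the "new
+# smaller inversion" block below stands in for Inv_GF16.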
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ movdqa @x[4], @t[3]
+ movdqa @x[5], @t[2]
+ movdqa @x[1], @t[1]
+ movdqa @x[7], @s[1]
+ movdqa @x[0], @s[0]
+
+ pxor @x[6], @t[3]
+ pxor @x[7], @t[2]
+ pxor @x[3], @t[1]
+ movdqa @t[3], @s[2]
+ pxor @x[6], @s[1]
+ movdqa @t[2], @t[0]
+ pxor @x[2], @s[0]
+ movdqa @t[3], @s[3]
+
+ por @t[1], @t[2]
+ por @s[0], @t[3]
+ pxor @t[0], @s[3]
+ pand @s[0], @s[2]
+ pxor @t[1], @s[0]
+ pand @t[1], @t[0]
+ pand @s[0], @s[3]
+ movdqa @x[3], @s[0]
+ pxor @x[2], @s[0]
+ pand @s[0], @s[1]
+ pxor @s[1], @t[3]
+ pxor @s[1], @t[2]
+ movdqa @x[4], @s[1]
+ movdqa @x[1], @s[0]
+ pxor @x[5], @s[1]
+ pxor @x[0], @s[0]
+ movdqa @s[1], @t[1]
+ pand @s[0], @s[1]
+ por @s[0], @t[1]
+ pxor @s[1], @t[0]
+ pxor @s[3], @t[3]
+ pxor @s[2], @t[2]
+ pxor @s[3], @t[1]
+ movdqa @x[7], @s[0]
+ pxor @s[2], @t[0]
+ movdqa @x[6], @s[1]
+ pxor @s[2], @t[1]
+ movdqa @x[5], @s[2]
+ pand @x[3], @s[0]
+ movdqa @x[4], @s[3]
+ pand @x[2], @s[1]
+ pand @x[1], @s[2]
+ por @x[0], @s[3]
+ pxor @s[0], @t[3]
+ pxor @s[1], @t[2]
+ pxor @s[2], @t[1]
+ pxor @s[3], @t[0]
+
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ # new smaller inversion
+
+ movdqa @t[3], @s[0]
+ pand @t[1], @t[3]
+ pxor @t[2], @s[0]
+
+ movdqa @t[0], @s[2]
+ movdqa @s[0], @s[3]
+ pxor @t[3], @s[2]
+ pand @s[2], @s[3]
+
+ movdqa @t[1], @s[1]
+ pxor @t[2], @s[3]
+ pxor @t[0], @s[1]
+
+ pxor @t[2], @t[3]
+
+ pand @t[3], @s[1]
+
+ movdqa @s[2], @t[2]
+ pxor @t[0], @s[1]
+
+ pxor @s[1], @t[2]
+ pxor @s[1], @t[1]
+
+ pand @t[0], @t[2]
+
+ pxor @t[2], @s[2]
+ pxor @t[2], @t[1]
+
+ pand @s[3], @s[2]
+
+ pxor @s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+ pxor 0x00($key),@x[0]
+ pxor 0x10($key),@x[1]
+ pshufb $mask,@x[0]
+ pxor 0x20($key),@x[2]
+ pshufb $mask,@x[1]
+ pxor 0x30($key),@x[3]
+ pshufb $mask,@x[2]
+ pxor 0x40($key),@x[4]
+ pshufb $mask,@x[3]
+ pxor 0x50($key),@x[5]
+ pshufb $mask,@x[4]
+ pxor 0x60($key),@x[6]
+ pshufb $mask,@x[5]
+ pxor 0x70($key),@x[7]
+ pshufb $mask,@x[6]
+ lea 0x80($key),$key
+ pshufb $mask,@x[7]
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
+ pshufd \$0x93, @x[2], @t[2]
+ pxor @t[1], @x[1]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @t[2], @x[2]
+ pshufd \$0x93, @x[4], @t[4]
+ pxor @t[3], @x[3]
+ pshufd \$0x93, @x[5], @t[5]
+ pxor @t[4], @x[4]
+ pshufd \$0x93, @x[6], @t[6]
+ pxor @t[5], @x[5]
+ pshufd \$0x93, @x[7], @t[7]
+ pxor @t[6], @x[6]
+ pxor @t[7], @x[7]
+
+ pxor @x[0], @t[1]
+ pxor @x[7], @t[0]
+ pxor @x[7], @t[1]
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
+ pxor @x[1], @t[2]
+ pshufd \$0x4E, @x[1], @x[1]
+ pxor @x[4], @t[5]
+ pxor @t[0], @x[0]
+ pxor @x[5], @t[6]
+ pxor @t[1], @x[1]
+ pxor @x[3], @t[4]
+ pshufd \$0x4E, @x[4], @t[0]
+ pxor @x[6], @t[7]
+ pshufd \$0x4E, @x[5], @t[1]
+ pxor @x[2], @t[3]
+ pshufd \$0x4E, @x[3], @x[4]
+ pxor @x[7], @t[3]
+ pshufd \$0x4E, @x[7], @x[5]
+ pxor @x[7], @t[4]
+ pshufd \$0x4E, @x[6], @x[3]
+ pxor @t[4], @t[0]
+ pshufd \$0x4E, @x[2], @x[6]
+ pxor @t[5], @t[1]
+
+ pxor @t[3], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[6], @x[3]
+ movdqa @t[0], @x[2]
+ pxor @t[2], @x[6]
+ movdqa @t[1], @x[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ # multiplication by 0x0e
+ pshufd \$0x93, @x[7], @t[7]
+ movdqa @x[2], @t[2]
+ pxor @x[5], @x[7] # 7 5
+ pxor @x[5], @x[2] # 2 5
+ pshufd \$0x93, @x[0], @t[0]
+ movdqa @x[5], @t[5]
+ pxor @x[0], @x[5] # 5 0 [1]
+ pxor @x[1], @x[0] # 0 1
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @x[2], @x[1] # 1 25
+ pxor @x[6], @x[0] # 01 6 [2]
+ pxor @x[3], @x[1] # 125 3 [4]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @x[0], @x[2] # 25 016 [3]
+ pxor @x[7], @x[3] # 3 75
+ pxor @x[6], @x[7] # 75 6 [0]
+ pshufd \$0x93, @x[6], @t[6]
+ movdqa @x[4], @t[4]
+ pxor @x[4], @x[6] # 6 4
+ pxor @x[3], @x[4] # 4 375 [6]
+ pxor @x[7], @x[3] # 375 756=36
+ pxor @t[5], @x[6] # 64 5 [7]
+ pxor @t[2], @x[3] # 36 2
+ pxor @t[4], @x[3] # 362 4 [5]
+ pshufd \$0x93, @t[5], @t[5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ # multiplication by 0x0b
+ pxor @y[0], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[1], @y[1]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[5], @y[0]
+ pxor @t[6], @y[1]
+ pxor @t[7], @y[0]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @t[7] # clobber t[7]
+ pxor @y[0], @y[1]
+
+ pxor @t[0], @y[3]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[1], @y[2]
+ pxor @t[1], @y[4]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[1], @t[1]
+ pxor @t[2], @y[3]
+ pxor @t[2], @y[5]
+ pxor @t[7], @y[2]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[3], @y[3]
+ pxor @t[3], @y[6]
+ pxor @t[3], @y[4]
+ pshufd \$0x93, @t[3], @t[3]
+ pxor @t[4], @y[7]
+ pxor @t[4], @y[5]
+ pxor @t[7], @y[7]
+ pxor @t[5], @y[3]
+ pxor @t[4], @y[4]
+ pxor @t[5], @t[7] # clobber t[7] even more
+
+ pxor @t[7], @y[5]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[7], @y[6]
+ pxor @t[7], @y[4]
+
+ pxor @t[5], @t[7]
+ pshufd \$0x93, @t[5], @t[5]
+ pxor @t[6], @t[7] # restore t[7]
+
+ # multiplication by 0x0d
+ pxor @y[7], @y[4]
+ pxor @t[4], @y[7]
+ pshufd \$0x93, @t[6], @t[6]
+ pxor @t[0], @y[2]
+ pxor @t[5], @y[7]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[7], @t[7]
+
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[0], @y[3]
+ pxor @t[5], @y[1]
+ pxor @t[5], @y[0]
+ pxor @t[7], @y[1]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[6], @y[0]
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[4]
+ pshufd \$0x93, @t[1], @t[1]
+
+ pxor @t[7], @y[7]
+ pxor @t[2], @y[4]
+ pxor @t[2], @y[5]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[6], @y[2]
+ pxor @t[3], @t[6] # clobber t[6]
+ pxor @y[7], @y[4]
+ pxor @t[6], @y[3]
+
+ pxor @t[6], @y[6]
+ pxor @t[5], @y[5]
+ pxor @t[4], @y[6]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @y[5]
+ pxor @t[7], @y[6]
+ pxor @t[3], @t[6] # restore t[6]
+
+ pshufd \$0x93, @t[5], @t[5]
+ pshufd \$0x93, @t[6], @t[6]
+ pshufd \$0x93, @t[7], @t[7]
+ pshufd \$0x93, @t[3], @t[3]
+
+ # multiplication by 0x09
+ pxor @y[1], @y[4]
+ pxor @y[1], @t[1] # t[1]=y[1]
+ pxor @t[5], @t[0] # clobber t[0]
+ pxor @t[5], @t[1]
+ pxor @t[0], @y[3]
+ pxor @y[0], @t[0] # t[0]=y[0]
+ pxor @t[6], @t[1]
+ pxor @t[7], @t[6] # clobber t[6]
+ pxor @t[1], @y[4]
+ pxor @t[4], @y[7]
+ pxor @y[4], @t[4] # t[4]=y[4]
+ pxor @t[3], @y[6]
+ pxor @y[3], @t[3] # t[3]=y[3]
+ pxor @t[2], @y[5]
+ pxor @y[2], @t[2] # t[2]=y[2]
+ pxor @t[7], @t[3]
+ pxor @y[5], @t[5] # t[5]=y[5]
+ pxor @t[6], @t[2]
+ pxor @t[6], @t[5]
+ pxor @y[6], @t[6] # t[6]=y[6]
+ pxor @y[7], @t[7] # t[7]=y[7]
+
+ movdqa @t[0],@XMM[0]
+ movdqa @t[1],@XMM[1]
+ movdqa @t[2],@XMM[2]
+ movdqa @t[3],@XMM[3]
+ movdqa @t[4],@XMM[4]
+ movdqa @t[5],@XMM[5]
+ movdqa @t[6],@XMM[6]
+ movdqa @t[7],@XMM[7]
+___
+}
+
+sub aesenc { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x30($const),@t[0] # .LSR
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+ &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x40($const),@t[0] # .LSRM0
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+$code.=<<___
+ pxor 0x00($key),@b[0]
+ pxor 0x10($key),@b[1]
+ pxor 0x20($key),@b[4]
+ pxor 0x30($key),@b[6]
+ pxor 0x40($key),@b[3]
+ pxor 0x50($key),@b[7]
+ pxor 0x60($key),@b[2]
+ pxor 0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ movdqa $b,$t
+ psrlq \$$n,$b
+ pxor $a,$b
+ pand $mask,$b
+ pxor $b,$a
+ psllq \$$n,$b
+ pxor $t,$b
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ movdqa $b0,$t0
+ psrlq \$$n,$b0
+ movdqa $b1,$t1
+ psrlq \$$n,$b1
+ pxor $a0,$b0
+ pxor $a1,$b1
+ pand $mask,$b0
+ pand $mask,$b1
+ pxor $b0,$a0
+ psllq \$$n,$b0
+ pxor $b1,$a1
+ psllq \$$n,$b1
+ pxor $t0,$b0
+ pxor $t1,$b1
+___
+}
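+
+# Illustrative scalar model of swapmove, not used by the generator
+# (assumes a 64-bit perl): exchanges the bits of $a selected by $mask
+# with the bits of $b lying $n positions to their left, via a single
+# "delta swap" -- the same network the movdqa/psrlq/pxor/pand/psllq
+# sequence above applies to each 64-bit lane.
+sub swapmove_scalar {
+my ($a,$b,$n,$mask)=@_;
+my $t=(($b>>$n)^$a)&$mask;	# bits that differ, aligned down to $a
+	($a^$t, $b^($t<<$n));	# patch both words with the delta
+}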
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ movdqa 0x00($const),$t0 # .LBS0
+ movdqa 0x10($const),$t1 # .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ movdqa 0x20($const),$t0 # .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
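+
+# Illustrative scalar model of the bitslice network above, not used by
+# the generator: transposes the 8x8 bit matrix held in eight 64-bit
+# words by pairing words at strides 1, 2 and 4 with the .LBS0/.LBS1/
+# .LBS2 masks (the real routine additionally reverses the register
+# order on entry).
+sub bitslice_scalar {
+my @x=@_;			# eight 64-bit words, 64-bit perl assumed
+	@x[@$_]=swapmove_scalar(@x[@$_],1,0x5555555555555555)
+		for ([0,1],[2,3],[4,5],[6,7]);
+	@x[@$_]=swapmove_scalar(@x[@$_],2,0x3333333333333333)
+		for ([0,2],[1,3],[4,6],[5,7]);
+	@x[@$_]=swapmove_scalar(@x[@$_],4,0x0f0f0f0f0f0f0f0f)
+		for ([0,4],[1,5],[2,6],[3,7]);
+	@x;
+}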
+
+$code.=<<___;
+.text
+
+.extern asm_AES_encrypt
+.extern asm_AES_decrypt
+
+.type _bsaes_encrypt8,\@abi-omnipotent
+.align 64
+_bsaes_encrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa 0x50($const), @XMM[8] # .LM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ movdqa 0x30($const), @XMM[8] # .LSR
+ jnz .Lenc_loop
+ movdqa 0x40($const), @XMM[8] # .LSRM0
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,\@abi-omnipotent
+.align 64
+_bsaes_decrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ movdqa -0x10($const), @XMM[8] # .LISR
+ jnz .Ldec_loop
+ movdqa -0x20($const), @XMM[8] # .LISRM0
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
+ movdqa @x[0], @x[2]
+ movdqa @x[1], @x[3]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ movdqa @x[0], @x[4]
+ movdqa @x[2], @x[6]
+ movdqa @x[1], @x[5]
+ movdqa @x[3], @x[7]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
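+# Note: bitslicing a round key is cheaper than bitslicing a state block,
+# because all eight "blocks" are the same 16 bytes -- the commented-out
+# swapmove calls above collapse into plain movdqa register copies.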
+
+$code.=<<___;
+.type _bsaes_key_convert,\@abi-omnipotent
+.align 16
+_bsaes_key_convert:
+ lea .Lmasks(%rip), $const
+ movdqu ($inp), %xmm7 # load round 0 key
+ lea 0x10($inp), $inp
+ movdqa 0x00($const), %xmm0 # 0x01...
+ movdqa 0x10($const), %xmm1 # 0x02...
+ movdqa 0x20($const), %xmm2 # 0x04...
+ movdqa 0x30($const), %xmm3 # 0x08...
+ movdqa 0x40($const), %xmm4 # .LM0
+ pcmpeqd %xmm5, %xmm5 # .LNOT
+
+ movdqu ($inp), %xmm6 # load round 1 key
+ movdqa %xmm7, ($out) # save round 0 key
+ lea 0x10($out), $out
+ dec $rounds
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+ pshufb %xmm4, %xmm6 # .LM0
+
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm9
+
+ pand %xmm6, %xmm8
+ pand %xmm6, %xmm9
+ movdqa %xmm2, %xmm10
+ pcmpeqb %xmm0, %xmm8
+ psllq \$4, %xmm0 # 0x10...
+ movdqa %xmm3, %xmm11
+ pcmpeqb %xmm1, %xmm9
+ psllq \$4, %xmm1 # 0x20...
+
+ pand %xmm6, %xmm10
+ pand %xmm6, %xmm11
+ movdqa %xmm0, %xmm12
+ pcmpeqb %xmm2, %xmm10
+ psllq \$4, %xmm2 # 0x40...
+ movdqa %xmm1, %xmm13
+ pcmpeqb %xmm3, %xmm11
+ psllq \$4, %xmm3 # 0x80...
+
+ movdqa %xmm2, %xmm14
+ movdqa %xmm3, %xmm15
+ pxor %xmm5, %xmm8 # "pnot"
+ pxor %xmm5, %xmm9
+
+ pand %xmm6, %xmm12
+ pand %xmm6, %xmm13
+ movdqa %xmm8, 0x00($out) # write bit-sliced round key
+ pcmpeqb %xmm0, %xmm12
+ psrlq \$4, %xmm0 # 0x01...
+ movdqa %xmm9, 0x10($out)
+ pcmpeqb %xmm1, %xmm13
+ psrlq \$4, %xmm1 # 0x02...
+ lea 0x10($inp), $inp
+
+ pand %xmm6, %xmm14
+ pand %xmm6, %xmm15
+ movdqa %xmm10, 0x20($out)
+ pcmpeqb %xmm2, %xmm14
+ psrlq \$4, %xmm2 # 0x04...
+ movdqa %xmm11, 0x30($out)
+ pcmpeqb %xmm3, %xmm15
+ psrlq \$4, %xmm3 # 0x08...
+ movdqu ($inp), %xmm6 # load next round key
+
+ pxor %xmm5, %xmm13 # "pnot"
+ pxor %xmm5, %xmm14
+ movdqa %xmm12, 0x40($out)
+ movdqa %xmm13, 0x50($out)
+ movdqa %xmm14, 0x60($out)
+ movdqa %xmm15, 0x70($out)
+ lea 0x80($out),$out
+ dec $rounds
+ jnz .Lkey_loop
+
+ movdqa 0x50($const), %xmm7 # .L63
+ #movdqa %xmm6, ($out) # don't save last round key
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
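+# _bsaes_key_convert expands each 16-byte round key (after a .LM0 shuffle
+# to match the state layout) into eight 16-byte masks: bit b of every key
+# byte becomes a full 0x00/0xff byte in slice b (pand against the
+# 0x01..0x08 patterns from .Lmasks, then pcmpeqb), so AddRoundKey on
+# bit-sliced state is just eight pxor's.  Slices 0, 1, 5 and 6 are
+# complemented on top (the "pnot" pxor's), which is the same as XORing
+# 0x63 -- the S-box affine constant -- into the key and lets the
+# bit-sliced Sbox omit those NOTs.  The last round key is left
+# unconverted in %xmm6, with the .L63 constant in %xmm7, for the caller
+# to fix up.  A C sketch of the expansion (naming ours):
+#
+#	for (int b = 0; b < 8; b++)          /* one slice per key bit */
+#		for (int i = 0; i < 16; i++)
+#			slice[b][i] = (key[i] >> b) & 1 ? 0xff : 0x00;
+#
+# The converted schedule is 16 + 128*(rounds-1) + 16 = 128*rounds - 96
+# bytes, which is the "shl 7 / sub (128-32)" arithmetic every caller
+# below performs when carving it off the stack.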
+
+if (0 && !$win64) { # the following four functions are an unsupported interface
+ # used only for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,\@function,2
+.align 16
+bsaes_enc_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+ ret
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,\@function,4
+.align 16
+bsaes_encrypt_128:
+.Lenc128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Lenc128_loop
+ ret
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,\@function,2
+.align 16
+bsaes_dec_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor ($out),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,($out)
+ ret
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,\@function,4
+.align 16
+bsaes_decrypt_128:
+.Ldec128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Ldec128_loop
+ ret
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+
+if ($ecb) {
+$code.=<<___;
+.globl bsaes_ecb_encrypt_blocks
+.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_encrypt_blocks:
+ mov %rsp, %rax
+.Lecb_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_enc_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ sub \$8,$len
+.Lecb_enc_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_enc_loop
+
+ add \$8,$len
+ jz .Lecb_enc_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_enc_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_enc_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_enc_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_enc_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_enc_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_enc_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_six:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_five:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_four:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_three:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_two:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_one:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_enc_short
+
+.Lecb_enc_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_enc_epilogue:
+ ret
+.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
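+ # A 1..7-block tail is handled by the cmp/jb/je ladder above: only the
+ # valid blocks are loaded, _bsaes_encrypt8 is still called eight-wide
+ # (the unused lanes hold don't-care data), and only the valid outputs
+ # are stored, in the permuted order 0,1,4,6,3,7,2,5.  C sketch (helper
+ # names illustrative):
+ #
+ #	n = len & 7;                       /* leftover blocks       */
+ #	for (i = 0; i < n; i++) load(b[i], inp + 16*i);
+ #	encrypt8(b);                       /* lanes n..7 ignored    */
+ #	for (i = 0; i < n; i++) store(out + 16*i, b[perm[i]]);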
+
+.globl bsaes_ecb_decrypt_blocks
+.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_decrypt_blocks:
+ mov %rsp, %rax
+.Lecb_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_dec_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ sub \$8,$len
+.Lecb_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_dec_loop
+
+ add \$8,$len
+ jz .Lecb_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_six:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_five:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_four:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_three:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_two:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_one:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_dec_short
+
+.Lecb_dec_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_dec_epilogue:
+ ret
+.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern asm_AES_cbc_encrypt
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,\@abi-omnipotent
+.align 16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+ mov 48(%rsp),$arg6 # pull direction flag
+___
+$code.=<<___;
+ cmp \$0,$arg6
+ jne asm_AES_cbc_encrypt
+ cmp \$128,$arg3
+ jb asm_AES_cbc_encrypt
+
+ mov %rsp, %rax
+.Lcbc_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ mov $arg5, %rbx
+ shr \$4, $len # bytes to blocks
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx), @XMM[15] # load IV
+ sub \$8,$len
+.Lcbc_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %edx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+
+ call _bsaes_decrypt8
+
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[13], @XMM[3]
+ movdqu 0x70($inp), @XMM[15] # IV
+ pxor @XMM[14], @XMM[5]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x80($inp), $inp
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lcbc_dec_loop
+
+ add \$8,$len
+ jz .Lcbc_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+ cmp \$2,$len
+ jb .Lcbc_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lcbc_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lcbc_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lcbc_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lcbc_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lcbc_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[15] # IV
+ pxor @XMM[13], @XMM[3]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[15] # IV
+ pxor @XMM[12], @XMM[7]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[15] # IV
+ pxor @XMM[11], @XMM[2]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[15] # IV
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[15] # IV
+ pxor @XMM[9], @XMM[6]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[15] # IV
+ pxor @XMM[8], @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ lea ($inp), $arg1
+ lea 0x20(%rbp), $arg2 # buffer output
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[15] # ^= IV
+ movdqu @XMM[15], ($out) # write output
+ movdqa @XMM[0], @XMM[15] # IV
+
+.Lcbc_dec_done:
+ movdqu @XMM[15], (%rbx) # return IV
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lcbc_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lcbc_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lcbc_dec_epilogue:
+ ret
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
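+ # Only CBC *de*cryption takes the bit-sliced path -- the encrypt
+ # direction and sub-128-byte inputs tail-call asm_AES_cbc_encrypt at the
+ # top -- because CBC decryption parallelizes: each plaintext block
+ # depends on just two ciphertext blocks.  Per 8-block iteration, as a C
+ # sketch (helper name ours):
+ #
+ #	for (i = 0; i < 8; i++) p[i] = aes_decrypt_block(c[i]);
+ #	p[0] ^= iv;
+ #	for (i = 1; i < 8; i++) p[i] ^= c[i-1];
+ #	iv = c[7];                         /* carried in xmm15 */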
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ mov %rsp, %rax
+.Lctr_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ movdqu ($arg5), %xmm0 # load counter
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ movdqa %xmm0, 0x20(%rbp) # copy counter
+ cmp \$8, $arg3
+ jb .Lctr_enc_short
+
+ mov %eax, %ebx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %ebx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ movdqa (%rsp), @XMM[9] # load round0 key
+ lea .LADD1(%rip), %r11
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
+ pshufb @XMM[8], @XMM[0]
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa @XMM[0], 0x20(%rbp) # save counter
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
+ movdqa @XMM[0], @XMM[2]
+ paddd 0x00(%r11), @XMM[1] # .LADD1
+ movdqa @XMM[0], @XMM[3]
+ paddd 0x10(%r11), @XMM[2] # .LADD2
+ movdqa @XMM[0], @XMM[4]
+ paddd 0x20(%r11), @XMM[3] # .LADD3
+ movdqa @XMM[0], @XMM[5]
+ paddd 0x30(%r11), @XMM[4] # .LADD4
+ movdqa @XMM[0], @XMM[6]
+ paddd 0x40(%r11), @XMM[5] # .LADD5
+ movdqa @XMM[0], @XMM[7]
+ paddd 0x50(%r11), @XMM[6] # .LADD6
+ paddd 0x60(%r11), @XMM[7] # .LADD7
+
+ # Borrow the prologue from _bsaes_encrypt8 to take the opportunity
+ # to flip the byte order in the 32-bit counter
+ movdqa (%rsp), @XMM[9] # round 0 key
+ lea 0x10(%rsp), %rax # pass key schedule
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ lea .LBS0(%rip), %r11 # constants table
+ pshufb @XMM[8], @XMM[7]
+ mov %ebx,%r10d # pass rounds
+
+ call _bsaes_encrypt8_bitslice
+
+ sub \$8,$len
+ jc .Lctr_enc_loop_done
+
+ movdqu 0x00($inp), @XMM[8] # load input
+ movdqu 0x10($inp), @XMM[9]
+ movdqu 0x20($inp), @XMM[10]
+ movdqu 0x30($inp), @XMM[11]
+ movdqu 0x40($inp), @XMM[12]
+ movdqu 0x50($inp), @XMM[13]
+ movdqu 0x60($inp), @XMM[14]
+ movdqu 0x70($inp), @XMM[15]
+ lea 0x80($inp),$inp
+ pxor @XMM[0], @XMM[8]
+ movdqa 0x20(%rbp), @XMM[0] # load counter
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[8], 0x00($out) # write output
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor @XMM[15], @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ lea .LADD1(%rip), %r11
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ paddd 0x70(%r11), @XMM[0] # .LADD8
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ add \$8, $len
+ movdqu 0x00($inp), @XMM[8] # load input
+ pxor @XMM[8], @XMM[0]
+ movdqu @XMM[0], 0x00($out) # write output
+ cmp \$2,$len
+ jb .Lctr_enc_done
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[1], 0x10($out)
+ je .Lctr_enc_done
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[4], 0x20($out)
+ cmp \$4,$len
+ jb .Lctr_enc_done
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[6], 0x30($out)
+ je .Lctr_enc_done
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[3], 0x40($out)
+ cmp \$6,$len
+ jb .Lctr_enc_done
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[7], 0x50($out)
+ je .Lctr_enc_done
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ lea 0x20(%rbp), $arg1
+ lea 0x30(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ movdqu ($inp), @XMM[1]
+ lea 16($inp), $inp
+ mov 0x2c(%rbp), %eax # load 32-bit counter
+ bswap %eax
+ pxor 0x30(%rbp), @XMM[1]
+ inc %eax # increment
+ movdqu @XMM[1], ($out)
+ bswap %eax
+ lea 16($out), $out
+ mov %eax, 0x2c(%rsp) # save 32-bit counter (%rsp==%rbp on this path)
+ dec $len
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lctr_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lctr_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lctr_enc_epilogue:
+ ret
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
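+ # The "ctr32" in the name is literal: only the low 32 bits of the
+ # counter are incremented, big-endian.  .LSWPUP byte-swaps the counter
+ # dword so plain paddd with .LADD1...LADD8 can do the arithmetic, and
+ # .LSWPUPM0SR swaps it back while applying the .LM0SR load permute.
+ # One-block C sketch of the counter arithmetic:
+ #
+ #	c = ((uint32_t)iv[12] << 24) | ((uint32_t)iv[13] << 16) |
+ #	    ((uint32_t)iv[14] << 8)  |  (uint32_t)iv[15];
+ #	c += n;                   /* wraps mod 2^32; iv[0..11] untouched */
+ #	iv[12] = c >> 24; iv[13] = c >> 16; iv[14] = c >> 8; iv[15] = c;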
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
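+# The tweak for block i+1 is tweak i multiplied by x in GF(2^128) with
+# reduction polynomial x^128 + x^7 + x^2 + x + 1; that is what the
+# pshufd/pand/paddq/pxor sequences against .Lxts_magic compute below
+# (paddq doubles each 64-bit half; the pshufd 0x13 mask both carries
+# bit 63 into the high half and folds bit 127 back in as 0x87).
+# Scalar C sketch of one doubling:
+#
+#	static void xts_tweak_double(uint64_t t[2])    /* t[0] = low half */
+#	{
+#		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
+#		t[1] = (t[1] << 1) | (t[0] >> 63);
+#		t[0] = (t[0] << 1) ^ carry;            /* .Lxts_magic */
+#	}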
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_encrypt:
+ mov %rsp, %rax
+.Lxts_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6, %xmm7 # fix up last round key
+ movdqa %xmm7, (%rax) # save last round key
+
+ and \$-16, $len
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ add \$0x80, $len
+ jz .Lxts_enc_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_enc_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_encrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_enc_done:
+ and \$15, %ebx
+ jz .Lxts_enc_ret
+ mov $out, %rdx
+
+.Lxts_enc_steal:
+ movzb ($inp), %eax
+ movzb -16(%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, -16(%rdx)
+ mov %cl, 0(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ movdqu @XMM[7], -16($out)
+
+.Lxts_enc_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_enc_epilogue:
+ ret
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
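+ # The .Lxts_enc_steal loop above is XTS ciphertext stealing: the
+ # plaintext tail displaces the leading bytes of the last full ciphertext
+ # block, the displaced bytes become the output tail, and the merged
+ # block is re-encrypted with the next tweak.  C sketch (names ours;
+ # last = last full output block, tail = input length mod 16):
+ #
+ #	for (i = 0; i < tail; i++) {
+ #		out_tail[i] = last[i];     /* stolen ciphertext   */
+ #		last[i]     = in_tail[i];  /* plaintext remainder */
+ #	}
+ #	xts_encrypt_block(last, next_tweak);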
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_decrypt:
+ mov %rsp, %rax
+.Lxts_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp), %xmm7 # fix up round 0 key
+ movdqa %xmm6, (%rax) # save last round key
+ movdqa %xmm7, (%rsp)
+
+ xor %eax, %eax # if ($len%16) len-=16;
+ and \$-16, $len
+ test \$15, %ebx
+ setnz %al
+ shl \$4, %rax
+ sub %rax, $len
+
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ add \$0x80, $len
+ jz .Lxts_dec_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_dec_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_decrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_dec_done:
+ and \$15, %ebx
+ jz .Lxts_dec_ret
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ movdqa @XMM[7], @XMM[6]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ movdqu ($inp), @XMM[0]
+ pxor $twres, @XMM[7]
+
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ mov $out, %rdx
+ movdqu @XMM[7], ($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp), %eax
+ movzb (%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, (%rdx)
+ mov %cl, 16(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu ($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[6], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[6]
+ movdqu @XMM[6], ($out)
+
+.Lxts_dec_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_dec_epilogue:
+ ret
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type _bsaes_const,\@object
+.align 64
+_bsaes_const:
+.LM0ISR: # InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0: # bit-slice constants
+ .quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+ .quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR: # shiftrows constants
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP: # byte-swap upper dword
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1: # counter increment constants
+ .quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+ .quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+ .quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+ .quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+ .quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+ .quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+ .quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+ .quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+ .long 0x87,0,1,0
+.Lmasks:
+ .quad 0x0101010101010101, 0x0101010101010101
+ .quad 0x0202020202020202, 0x0202020202020202
+ .quad 0x0404040404040404, 0x0404040404040404
+ .quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+ .quad 0x6363636363636363, 0x6363636363636363
+.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align 64
+.size _bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
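+# se_handler lets Windows unwind through these functions: if the fault
+# lies between the recorded prologue and epilogue labels, it recovers the
+# frame anchor the function stashed in %rbp, copies the ten saved xmm
+# registers back into the CONTEXT record, reloads the non-volatile GPRs
+# and points Rsp past the frame.  The magic offsets (120 = Rax,
+# 152 = Rsp, 160 = Rbp, 248 = Rip, 512 = Xmm6) are the Win64 CONTEXT
+# layout.  Rough C shape of the in-body case (a sketch, not SDK types):
+#
+#	if (prologue <= ctx->Rip && ctx->Rip < epilogue) {
+#		char *frame = (char *)ctx->Rbp;
+#		memcpy(&ctx->Xmm6, frame + 0x40, 10 * 16);
+#		/* reload rbx/rbp/r12..r15 from frame + 0xa0 + 0x48.. */
+#		ctx->Rsp = (ULONG64)(frame + 0xa0 + 0x78);
+#	}
+#	return ExceptionContinueSearch;    /* after RtlVirtualUnwind */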
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov 160($context),%rax # pull context->Rbp
+
+ lea 0x40(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0(%rax),%rax # adjust stack pointer
+
+ mov 0x70(%rax),%rbp
+ mov 0x68(%rax),%rbx
+ mov 0x60(%rax),%r12
+ mov 0x58(%rax),%r13
+ mov 0x50(%rax),%r14
+ mov 0x48(%rax),%r15
+ lea 0x78(%rax),%rax # adjust stack pointer
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($ecb);
+ .rva .Lecb_enc_prologue
+ .rva .Lecb_enc_epilogue
+ .rva .Lecb_enc_info
+
+ .rva .Lecb_dec_prologue
+ .rva .Lecb_dec_epilogue
+ .rva .Lecb_dec_info
+___
+$code.=<<___;
+ .rva .Lcbc_dec_prologue
+ .rva .Lcbc_dec_epilogue
+ .rva .Lcbc_dec_info
+
+ .rva .Lctr_enc_prologue
+ .rva .Lctr_enc_epilogue
+ .rva .Lctr_enc_info
+
+ .rva .Lxts_enc_prologue
+ .rva .Lxts_enc_epilogue
+ .rva .Lxts_enc_info
+
+ .rva .Lxts_dec_prologue
+ .rva .Lxts_dec_epilogue
+ .rva .Lxts_dec_info
+
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+.Lecb_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+.Lctr_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+.Lxts_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.Lxts_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/aes/asm/vpaes-x86.S b/crypto/aes/asm/vpaes-x86.S
new file mode 100644
index 0000000..c53a507
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86.S
@@ -0,0 +1,661 @@
+.file "vpaes-x86.s"
+.text
+.align 64
+.L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 64
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ addl (%esp),%ebp
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%edx),%xmm5
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa 16(%ebp),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal 192(%ebp),%ebx
+ jmp .L000enc_entry
+.align 16
+.L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebp),%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%ebx,%ecx,1),%xmm1
+ movdqa 80(%ebp),%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addl $16,%edx
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm3,%xmm0
+ subl $1,%eax
+.L000enc_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+ movdqu (%edx),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .L001enc_loop
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movl 240(%edx),%eax
+ leal 608(%ebp),%ebx
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal -352(%ebx,%ecx,1),%ecx
+ jmp .L002dec_entry
+.align 16
+.L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addl $16,%edx
+.byte 102,15,56,0,197
+ movdqa (%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subl $1,%eax
+.byte 102,15,56,0,197
+ movdqa 32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,197
+ movdqa 64(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,58,15,237,12
+.L002dec_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%edx),%xmm0
+ jnz .L003dec_loop
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+ addl (%esp),%ebp
+ movdqu (%esi),%xmm0
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx
+ movdqa %xmm2,4(%esp)
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz .L004schedule_am_decrypting
+ movdqu %xmm0,(%edx)
+ jmp .L005schedule_go
+.L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%edx)
+ xorl $48,%ecx
+.L005schedule_go:
+ cmpl $192,%eax
+ ja .L006schedule_256
+ je .L007schedule_192
+.L008schedule_128:
+ movl $10,%eax
+.L009loop_schedule_128:
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+ movdqu 8(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%eax
+.L011loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+ movdqu 16(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%eax
+.L012loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,20(%esp)
+ movdqa %xmm6,%xmm7
+ call .L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+ leal 384(%ebp),%ebx
+ testl %edi,%edi
+ jnz .L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193
+ leal 352(%ebp),%ebx
+ addl $32,%edx
+.L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15
+.byte 102,15,58,15,210,15
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+ movdqa %xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz .L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+ jmp .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+.L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ addl $-16,%ecx
+ andl $48,%ecx
+ movdqu %xmm3,(%edx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ movl $48,%ecx
+ movl $0,%edi
+ leal .L_vpaes_consts+0x30-.L016pic_point,%ebp
+ call _vpaes_schedule_core
+.L016pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal .L_vpaes_consts+0x30-.L017pic_point,%ebp
+ call _vpaes_schedule_core
+.L017pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L018pic_point,%ebp
+ call _vpaes_preheat
+.L018pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L019pic_point,%ebp
+ call _vpaes_preheat
+.L019pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L020cbc_abort
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp
+ andl $-16,%ebx
+ movl 40(%esp),%ecx
+ xchgl %esp,%ebx
+ movdqu (%ebp),%xmm1
+ subl %esi,%edi
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
+ call _vpaes_preheat
+.L021pic_point:
+ cmpl $0,%ecx
+ je .L022cbc_dec_loop
+ jmp .L023cbc_enc_loop
+.align 16
+.L023cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0
+ call _vpaes_encrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L023cbc_enc_loop
+ jmp .L024cbc_done
+.align 16
+.L022cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm0,32(%esp)
+ call _vpaes_decrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L022cbc_dec_loop
+.L024cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp
+ movdqu %xmm1,(%ebx)
+.L020cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 0000000..1533e2c
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,903 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Port of vpaes-x86_64.pl as a 32-bit "almost" drop-in replacement for
+# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (it doesn't have to if called from
+# EVP only). "Drop-in" implies that this module neither shares the key
+# schedule structure with the original nor makes assumptions
+# about its alignment...
+#
+# Performance summary. The aes-586.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with a 128-bit key; the vpaes-x86.pl column lists
+# [also large-block CBC] encrypt/decrypt results.
+#
+# aes-586.pl vpaes-x86.pl
+#
+# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
+# Nehalem 27.9/40.4/18.1 10.3/12.0
+# Atom 102./119./60.1 64.5/85.3(***)
+#
+# (*)	"Hyper-threading" in this context refers to cache shared among
+#	multiple cores rather than specifically to Intel HTT. As the
+#	vast majority of contemporary cores share cache, the slower code
+#	path is commonplace. In other words, "with-hyper-threading-off"
+#	results are presented mostly for reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	The less impressive improvement on Core 2 and Atom is due to
+#	their slow pshufb; it is nevertheless a respectable +32%/65%
+#	improvement on Core 2 and +58%/40% on Atom (as implied, over the
+#	"hyper-threading-safe" code path).
+#
+#
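+# For scale, a back-of-envelope conversion of the table above (a
+# sketch, assuming a 3GHz Nehalem clock and the 10.3 cycles/byte
+# encrypt figure; not part of the original benchmarks):
+#
+#	3e9 cycles/s / 10.3 cycles/byte ~= 291 MB/s per core
+#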
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$PREFIX="vpaes";
+
+my ($round, $base, $magic, $key, $const, $inp, $out)=
+ ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
+
+&static_label("_vpaes_consts");
+&static_label("_vpaes_schedule_low_round");
+
+&set_label("_vpaes_consts",64);
+$k_inv=-0x30; # inv, inva
+ &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
+ &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
+
+$k_s0F=-0x10; # s0F
+ &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
+
+$k_ipt=0x00; # input transform (lo, hi)
+ &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
+ &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
+
+$k_sb1=0x20; # sb1u, sb1t
+ &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
+ &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
+$k_sb2=0x40; # sb2u, sb2t
+ &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
+ &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
+$k_sbo=0x60; # sbou, sbot
+ &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
+ &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
+
+$k_mc_forward=0x80; # mc_forward
+ &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
+ &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
+ &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
+ &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
+
+$k_mc_backward=0xc0; # mc_backward
+ &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
+ &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
+ &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
+ &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
+
+$k_sr=0x100; # sr
+ &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
+ &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
+ &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
+ &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
+
+$k_rcon=0x140; # rcon
+ &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
+
+$k_s63=0x150; # s63: all equal to 0x63 transformed
+ &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
+
+$k_opt=0x160; # output transform
+ &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
+ &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
+
+$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
+ &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
+ &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
+##
+## Decryption stuff
+## Key schedule constants
+##
+$k_dksd=0x1a0; # decryption key schedule: invskew x*D
+ &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
+ &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
+$k_dksb=0x1c0; # decryption key schedule: invskew x*B
+ &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
+ &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
+$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
+ &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
+ &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
+$k_dks9=0x200; # decryption key schedule: invskew x*9
+ &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
+ &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
+
+##
+## Decryption stuff
+## Round function constants
+##
+$k_dipt=0x220; # decryption input transform
+ &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
+ &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
+
+$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
+ &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
+ &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
+$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
+ &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
+ &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
+$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
+ &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
+ &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
+$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
+ &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
+ &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
+$k_dsbo=0x2c0; # decryption sbox final output
+ &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
+ &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
+&align (64);
+
+&function_begin_B("_vpaes_preheat");
+ &add ($const,&DWP(0,"esp"));
+ &movdqa ("xmm7",&QWP($k_inv,$const));
+ &movdqa ("xmm6",&QWP($k_s0F,$const));
+ &ret ();
+&function_end_B("_vpaes_preheat");
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm6-%xmm7 as in _vpaes_preheat
+## (%edx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
+##
+##
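+## Schematically, the code below computes (a sketch; ipt/sbox/mix/sbo
+## name the pshufb table stages rather than real subroutines, and the
+## round keys are the pre-mangled ones produced by the schedule):
+##
+##	state = ipt(input) ^ key[0]
+##	state = mix(sbox(state) ^ key[i])	for i = 1 .. rounds
+##	output = shiftrows(sbo(state) ^ key[rounds+1])
+##
+## where "rounds" is the value at 240(key): 9, 11 or 13.
+##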
+&function_begin_B("_vpaes_encrypt_core");
+ &mov ($magic,16);
+ &mov ($round,&DWP(240,$key));
+	&movdqa	("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_ipt,$const));
+ &pandn ("xmm1","xmm0");
+ &movdqu ("xmm5",&QWP(0,$key));
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_ipt+16,$const));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm2","xmm5");
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($base,&DWP($k_mc_backward,$const));
+ &jmp (&label("enc_entry"));
+
+
+&set_label("enc_loop",16);
+ # middle of middle round
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
+ &pshufb ("xmm5","xmm2"); # 4 = sb2u
+ &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+ &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
+ &pxor ("xmm2","xmm5"); # 2 = 2A
+ &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
+ &movdqa ("xmm3","xmm0"); # 3 = A
+ &pshufb ("xmm0","xmm1"); # 0 = B
+ &add ($key,16); # next key
+ &pxor ("xmm0","xmm2"); # 0 = 2A+B
+ &pshufb ("xmm3","xmm4"); # 3 = D
+ &add ($magic,16); # next mc
+ &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
+ &pshufb ("xmm0","xmm1"); # 0 = 2B+C
+ &and ($magic,0x30); # ... mod 4
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
+ &sub ($round,1); # nr--
+
+&set_label("enc_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm5","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &movdqu ("xmm5",&QWP(0,$key));
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &jnz (&label("enc_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
+ &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm1");
+ &ret ();
+&function_end_B("_vpaes_encrypt_core");
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+&function_begin_B("_vpaes_decrypt_core");
+ &mov ($round,&DWP(240,$key));
+ &lea ($base,&DWP($k_dsbd,$const));
+ &movdqa ("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
+ &pandn ("xmm1","xmm0");
+ &mov ($magic,$round);
+	&psrld	("xmm1",4);
+ &movdqu ("xmm5",&QWP(0,$key));
+ &shl ($magic,4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
+ &xor ($magic,0x30);
+ &pshufb ("xmm0","xmm1");
+ &and ($magic,0x30);
+ &pxor ("xmm2","xmm5");
+ &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
+ &jmp (&label("dec_entry"));
+
+&set_label("dec_loop",16);
+##
+## Inverse mix columns
+##
+ &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
+ &pshufb ("xmm4","xmm2"); # 4 = sb9u
+ &pxor ("xmm4","xmm0");
+ &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
+ &pshufb ("xmm0","xmm3"); # 0 = sb9t
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &add ($key,16); # next round key
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
+ &pshufb ("xmm0","xmm3"); # 0 = sbdt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &sub ($round,1); # nr--
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
+ &pshufb ("xmm4","xmm2"); # 4 = sbbu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
+ &pshufb ("xmm0","xmm3"); # 0 = sbbt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
+ &pshufb ("xmm4","xmm2"); # 4 = sbeu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
+ &pshufb ("xmm0","xmm3"); # 0 = sbet
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &palignr("xmm5","xmm5",12);
+
+&set_label("dec_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqu ("xmm0",&QWP(0,$key));
+ &jnz (&label("dec_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm0"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
+ &movdqa ("xmm2",&QWP(0,$magic));
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_decrypt_core");
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+&function_begin_B("_vpaes_schedule_core");
+ &add ($const,&DWP(0,"esp"));
+ &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
+ &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
+
+ # input transform
+ &movdqa ("xmm3","xmm0");
+ &lea ($base,&DWP($k_ipt,$const));
+ &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
+ &call ("_vpaes_schedule_transform");
+ &movdqa ("xmm7","xmm0");
+
+ &test ($out,$out);
+ &jnz (&label("schedule_am_decrypting"));
+
+ # encrypting, output zeroth round key after transform
+ &movdqu (&QWP(0,$key),"xmm0");
+ &jmp (&label("schedule_go"));
+
+&set_label("schedule_am_decrypting");
+ # decrypting, output zeroth round key after shiftrows
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &movdqu (&QWP(0,$key),"xmm3");
+ &xor ($magic,0x30);
+
+&set_label("schedule_go");
+ &cmp ($round,192);
+ &ja (&label("schedule_256"));
+ &je (&label("schedule_192"));
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+&set_label("schedule_128");
+ &mov ($round,10);
+
+&set_label("loop_schedule_128");
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # write output
+ &jmp (&label("loop_schedule_128"));
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
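+## A quick key count as a sanity check (a sketch): AES-192 uses 13
+## round keys. One is stored before .schedule_go, each of the first
+## three trips through the loop stores three, the fourth stores two
+## before branching out, and .schedule_mangle_last stores the final
+## one:
+##
+##	1 + 3*3 + 2 + 1 == 13
+##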
+&set_label("schedule_192",16);
+ &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &movdqa ("xmm6","xmm0"); # save short part
+ &pxor ("xmm4","xmm4"); # clear 4
+ &movhlps("xmm6","xmm4"); # clobber low side with zeros
+ &mov ($round,4);
+
+&set_label("loop_schedule_192");
+ &call ("_vpaes_schedule_round");
+ &palignr("xmm0","xmm6",8);
+ &call ("_vpaes_schedule_mangle"); # save key n
+ &call ("_vpaes_schedule_192_smear");
+ &call ("_vpaes_schedule_mangle"); # save key n+1
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # save key n+2
+ &call ("_vpaes_schedule_192_smear");
+ &jmp (&label("loop_schedule_192"));
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
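+## Key count check (a sketch): AES-256 uses 15 round keys: one stored
+## before .schedule_go, two per trip for the first six trips through
+## the loop, one on the seventh trip before branching out, and the
+## final one in .schedule_mangle_last:
+##
+##	1 + 6*2 + 1 + 1 == 15
+##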
+&set_label("schedule_256",16);
+ &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &mov ($round,7);
+
+&set_label("loop_schedule_256");
+ &call ("_vpaes_schedule_mangle"); # output low result
+ &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
+
+ # high round
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle");
+
+ # low round. swap xmm7 and xmm6
+ &pshufd ("xmm0","xmm0",0xFF);
+ &movdqa (&QWP(20,"esp"),"xmm7");
+ &movdqa ("xmm7","xmm6");
+ &call ("_vpaes_schedule_low_round");
+ &movdqa ("xmm7",&QWP(20,"esp"));
+
+ &jmp (&label("loop_schedule_256"));
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+&set_label("schedule_mangle_last",16);
+ # schedule last round key from xmm0
+ &lea ($base,&DWP($k_deskew,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_last_dec"));
+
+ # encrypting
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm0","xmm1"); # output permute
+ &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
+ &add ($key,32);
+
+&set_label("schedule_mangle_last_dec");
+ &add ($key,-16);
+ &pxor ("xmm0",&QWP($k_s63,$const));
+ &call ("_vpaes_schedule_transform"); # output transform
+ &movdqu (&QWP(0,$key),"xmm0"); # save last key
+
+ # cleanup
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+ &ret ();
+&function_end_B("_vpaes_schedule_core");
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+##	a zeroed scratch register (%xmm1 here; there is no %xmm13 on x86)
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+&function_begin_B("_vpaes_schedule_192_smear");
+ &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
+ &pxor ("xmm6","xmm0"); # -> c+d c 0 0
+ &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
+ &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
+ &movdqa ("xmm0","xmm6");
+ &pxor ("xmm1","xmm1");
+ &movhlps("xmm6","xmm1"); # clobber low side with zeros
+ &ret ();
+&function_end_B("_vpaes_schedule_192_smear");
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm5.
+##
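+## The "smear" is a dword prefix-xor, obtained below with two
+## pslldq/pxor pairs. In scalar form (a sketch; unpack_dwords is
+## illustrative only, lane 0 being the lowest dword):
+##
+##	my @w = unpack_dwords($xmm7);		# (w0, w1, w2, w3)
+##	$w[$_] ^= $w[$_ - 1] for 1 .. 3;	# (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3)
+##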
+&function_begin_B("_vpaes_schedule_round");
+ # extract rcon from xmm8
+ &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
+ &pxor ("xmm1","xmm1");
+ &palignr("xmm1","xmm2",15);
+ &palignr("xmm2","xmm2",15);
+ &pxor ("xmm7","xmm1");
+
+ # rotate
+ &pshufd ("xmm0","xmm0",0xFF);
+ &palignr("xmm0","xmm0",1);
+
+ # fall through...
+ &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
+
+ # low round: same as high round, but no rotation and no rcon.
+&set_label("_vpaes_schedule_low_round");
+ # smear xmm7
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",4);
+ &pxor ("xmm7","xmm1");
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",8);
+ &pxor ("xmm7","xmm1");
+ &pxor ("xmm7",&QWP($k_s63,$const));
+
+ # subbyte
+ &movdqa ("xmm4",&QWP($k_s0F,$const));
+ &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
+ &movdqa ("xmm1","xmm4");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm4"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm5"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm5"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm5"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm5"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = sbox output
+
+ # add in smeared stuff
+ &pxor ("xmm0","xmm7");
+ &movdqa ("xmm7","xmm0");
+ &ret ();
+&function_end_B("_vpaes_schedule_round");
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%ebx)
+##
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
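+## Concretely, per byte (a sketch):
+##
+##	$out[$i] = $lo[ $in[$i] & 0x0F ] ^ $hi[ $in[$i] >> 4 ];
+##
+## with the $lo table at (%ebx) and $hi at 16(%ebx), both applied via
+## pshufb.
+##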
+&function_begin_B("_vpaes_schedule_transform");
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm2");
+ &movdqa ("xmm2",&QWP(0,$base));
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP(16,$base));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_schedule_transform");
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%edx), and increments or decrements it
+## Keeps track of round number mod 4 in %ecx
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
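+## The 0,1,1,1 circulant multiply on the encrypt path is realized as
+## three successive byte rotations (pshufb by .Lk_mc_forward)
+## accumulated with xor (a sketch):
+##
+##	$t = rot($x);  $acc  = $t;
+##	$t = rot($t);  $acc ^= $t;
+##	$t = rot($t);  $acc ^= $t;	# acc = rot1(x)^rot2(x)^rot3(x)
+##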
+&function_begin_B("_vpaes_schedule_mangle");
+ &movdqa ("xmm4","xmm0"); # save xmm0 for later
+ &movdqa ("xmm5",&QWP($k_mc_forward,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_dec"));
+
+ # encrypting
+ &add ($key,16);
+ &pxor ("xmm4",&QWP($k_s63,$const));
+ &pshufb ("xmm4","xmm5");
+ &movdqa ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+
+ &jmp (&label("schedule_mangle_both"));
+
+&set_label("schedule_mangle_dec",16);
+ # inverse mix columns
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &lea ($inp,&DWP($k_dksd,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm4");
+ &psrld ("xmm1",4); # 1 = hi
+ &pand ("xmm4","xmm2"); # 4 = lo
+
+ &movdqa ("xmm2",&QWP(0,$inp));
+ &pshufb ("xmm2","xmm4");
+ &movdqa ("xmm3",&QWP(0x10,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x20,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x30,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x40,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x50,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x60,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x70,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+
+ &add ($key,-16);
+
+&set_label("schedule_mangle_both");
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &add ($magic,-16);
+ &and ($magic,0x30);
+ &movdqu (&QWP(0,$key),"xmm3");
+ &ret ();
+&function_end_B("_vpaes_schedule_mangle");
+
+#
+# Interface to OpenSSL
+#
+&function_begin("${PREFIX}_set_encrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &mov ($magic,0x30);
+ &mov ($out,0);
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_encrypt_key");
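+
+# For the three key sizes, the rounds value stored above works out as
+# follows (a sketch; by the cores' loop structure this is the number
+# of rounds before the final one, which is handled separately):
+#
+#	(128 >> 5) + 5 ==  9
+#	(192 >> 5) + 5 == 11
+#	(256 >> 5) + 5 == 13
+#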
+
+&function_begin("${PREFIX}_set_decrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &shl ($base,4);
+ &lea ($key,&DWP(16,$key,$base));
+
+ &mov ($out,1);
+ &mov ($magic,$round);
+ &shr ($magic,1);
+ &and ($magic,32);
+	&xor	($magic,32);		# nbits==192?0:32;
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_decrypt_key");
+
+&function_begin("${PREFIX}_encrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_encrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_encrypt");
+
+&function_begin("${PREFIX}_decrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_decrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_decrypt");
+
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0)); # inp
+ &mov ($out,&wparam(1)); # out
+ &mov ($round,&wparam(2)); # len
+ &mov ($key,&wparam(3)); # key
+ &sub ($round,16);
+ &jc (&label("cbc_abort"));
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($const,&wparam(4)); # ivp
+ &and ($base,-16);
+ &mov ($magic,&wparam(5)); # enc
+ &xchg ($base,"esp"); # alloca
+ &movdqu ("xmm1",&QWP(0,$const)); # load IV
+ &sub ($out,$inp);
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov (&DWP(0,"esp"),$out); # save out
+	&mov	(&DWP(4,"esp"),$key);		# save key
+ &mov (&DWP(8,"esp"),$const); # save ivp
+ &mov ($out,$round); # $out works as $len
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &cmp ($magic,0);
+ &je (&label("cbc_dec_loop"));
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &pxor ("xmm0","xmm1"); # inp^=iv
+ &call ("_vpaes_encrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &movdqa ("xmm1","xmm0");
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_enc_loop"));
+ &jmp (&label("cbc_done"));
+
+&set_label("cbc_dec_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
+ &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
+ &call ("_vpaes_decrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
+ &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_dec_loop"));
+
+&set_label("cbc_done");
+ &mov ($base,&DWP(8,"esp")); # restore ivp
+ &mov ("esp",&DWP(48,"esp"));
+ &movdqu (&QWP(0,$base),"xmm1"); # write IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+&asm_finish();
diff --git a/crypto/aes/asm/vpaes-x86_64.S b/crypto/aes/asm/vpaes-x86_64.S
new file mode 100644
index 0000000..2b68e61
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86_64.S
@@ -0,0 +1,828 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa %xmm15,%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%r11,%r10,1),%xmm1
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm3,%xmm0
+ subq $1,%rax
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+ movdqu (%r9),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_dipt+16(%rip),%xmm0
+ xorq $48,%r11
+ leaq .Lk_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm5,%xmm2
+ movdqa .Lk_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+
+.byte 102,15,56,0,197
+ movdqa 0(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subq $1,%rax
+
+.byte 102,15,56,0,197
+ movdqa 32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,56,0,197
+ movdqa 64(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,58,15,237,12
+
+.Ldec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%r9),%xmm0
+ jnz .Ldec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ .byte 0xf3,0xc3
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa .Lk_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq .Lk_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq .Lk_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz .Lschedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $48,%r8
+
+.Lschedule_go:
+ cmpl $192,%esi
+ ja .Lschedule_256
+ je .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+ movl $10,%esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_mangle_last:
+
+ leaq .Lk_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq .Lk_opt(%rip),%r11
+ addq $32,%rdx
+
+.Lschedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor .Lk_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor .Lk_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor .Lk_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+
+ leaq .Lk_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $48,%r8
+ movdqu %xmm3,(%rdx)
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $48,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_decrypt,.-vpaes_decrypt
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,(%r8)
+.Lcbc_abort:
+ .byte 0xf3,0xc3
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+
+
+
+
+
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ leaq .Lk_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ .byte 0xf3,0xc3
+.size _vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type _vpaes_consts,@object
+.align 64
+_vpaes_consts:
+.Lk_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 0000000..41f2e46
--- /dev/null
+++ b/crypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1207 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original, nor does it make assumptions
+# about its alignment...
+#
+# Performance summary. aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86_64.pl column -
+# [also large-block CBC] encrypt/decrypt.
+#
+# aes-x86_64.pl vpaes-x86_64.pl
+#
+# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
+# Nehalem 30.5/42.2/14.6 9.8/11.8
+# Atom 63.9/79.0/32.1 64.0/84.8(***)
+#
+# (*) "Hyper-threading" in the context refers rather to cache shared
+# among multiple cores, than to specifically Intel HTT. As vast
+# majority of contemporary cores share cache, slower code path
+# is common place. In other words "with-hyper-threading-off"
+# results are presented mostly for reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***) The less impressive improvement on Core 2 and Atom is due to
+#	slow pshufb, yet it is a respectable +40%/78% improvement on
+#	Core 2 (as implied, over the "hyper-threading-safe" code path).
+#
+#
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type _vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+ mov %rdx, %r9
+ mov \$16, %r11
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor %xmm5, %xmm2
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ lea .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm5 # 4 : sb2u
+ pshufb %xmm2, %xmm5 # 4 = sb2u
+ movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm5, %xmm2 # 2 = 2A
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ add \$16, %r9 # next key
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb %xmm4, %xmm3 # 3 = D
+ add \$16, %r11 # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ and \$0x30, %r11 # ... mod 4
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ sub \$1,%rax # nr--
+
+.Lenc_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm5 # 2 : a/k
+ pshufb %xmm0, %xmm5 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ movdqu (%r9), %xmm5
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Lenc_loop
+
+ # middle of last round
+ movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm1, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_decrypt_core:
+ mov %rdx, %r9 # load key
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ mov %rax, %r11
+ psrld \$4, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ shl \$4, %r11
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
+ xor \$0x30, %r11
+ lea .Lk_dsbd(%rip),%r10
+ pshufb %xmm1, %xmm0
+ and \$0x30, %r11
+ pxor %xmm5, %xmm2
+ movdqa .Lk_mc_forward+48(%rip), %xmm5
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ add %r10, %r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+## Inverse mix columns
+##
+ movdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor %xmm0, %xmm4
+ movdqa -0x10(%r10),%xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ pxor %xmm4, %xmm0 # 0 = ch
+ add \$16, %r9 # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x10(%r10),%xmm0 # 0 : sbdt
+ pshufb %xmm3, %xmm0 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x30(%r10),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x50(%r10),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ palignr \$12, %xmm5, %xmm5
+
+.Ldec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqu (%r9), %xmm0
+ jnz .Ldec_loop
+
+ # middle of last round
+ movdqa 0x60(%r10), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm0, %xmm4 # 4 = sb1u + k
+ movdqa 0x70(%r10), %xmm0 # 0 : sbot
+ movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm2, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_schedule_core,\@abi-omnipotent
+.align 16
+_vpaes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+
+ call _vpaes_preheat # load the tables
+ movdqa .Lk_rcon(%rip), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqa %xmm0, %xmm3
+ lea .Lk_ipt(%rip), %r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0, %xmm7
+
+ lea .Lk_sr(%rip),%r10
+ test %rcx, %rcx
+ jnz .Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqu %xmm0, (%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm3
+ movdqu %xmm3, (%rdx)
+ xor \$0x30, %r8
+
+.Lschedule_go:
+ cmp \$192, %esi
+ ja .Lschedule_256
+ je .Lschedule_192
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov \$10, %esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # write output
+ jmp .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call _vpaes_schedule_transform # input transform
+ movdqa %xmm0, %xmm6 # save short part
+ pxor %xmm4, %xmm4 # clear 4
+ movhlps %xmm4, %xmm6 # clobber low side with zeros
+ mov \$4, %esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+ palignr \$8,%xmm6,%xmm0
+ call _vpaes_schedule_mangle # save key n
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle # save key n+1
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # save key n+2
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call _vpaes_schedule_transform # input transform
+ mov \$7, %esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd \$0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Loop_schedule_256
+
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 16
+.Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_last_dec
+
+ # encrypting
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm0 # output permute
+ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add \$32, %rdx
+
+.Lschedule_mangle_last_dec:
+ add \$-16, %rdx
+ pxor .Lk_s63(%rip), %xmm0
+ call _vpaes_schedule_transform # output transform
+ movdqu %xmm0, (%rdx) # save last key
+
+ # cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,\@abi-omnipotent
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm0, %xmm6 # -> b+c+d b+c b a
+ movdqa %xmm6, %xmm0
+ pxor %xmm1, %xmm1
+ movhlps %xmm1, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,\@abi-omnipotent
+.align 16
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr \$15, %xmm8, %xmm1
+ palignr \$15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd \$0xFF, %xmm0, %xmm0
+ palignr \$1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%rip), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa %xmm13, %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa %xmm12, %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,\@abi-omnipotent
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,\@abi-omnipotent
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_dec
+
+ # encrypting
+ add \$16, %rdx
+ pxor .Lk_s63(%rip),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+ # inverse mix columns
+ lea .Lk_dksd(%rip),%r11
+ movdqa %xmm9, %xmm1
+ pandn %xmm4, %xmm1
+ psrld \$4, %xmm1 # 1 = hi
+ pand %xmm9, %xmm4 # 4 = lo
+
+ movdqa 0x00(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa 0x10(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x20(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x30(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x40(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x50(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x60(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x70(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+
+ add \$-16, %rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1,%xmm3
+ add \$-16, %r8
+ and \$0x30, %r8
+ movdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+#
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@function,3
+.align 16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
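+					# (nbits/32+5 = 9/11/13 for
+					# 128/192/256-bit keys; the cores run
+					# that many middle rounds and handle
+					# the last round separately)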
+
+ mov \$0,%ecx
+ mov \$0x30,%r8d
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@function,3
+.align 16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ shl \$4,%eax
+ lea 16(%rdx,%rax),%rdx
+
+ mov \$1,%ecx
+ mov %esi,%r8d
+ shr \$1,%r8d
+ and \$32,%r8d
+ xor \$32,%r8d # nbits==192?0:32
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@function,3
+.align 16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@function,3
+.align 16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ xchg $key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+ sub \$16,$len
+ jc .Lcbc_abort
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+ movdqu ($ivp),%xmm6 # load IV
+ sub $inp,$out
+ call _vpaes_preheat
+ cmp \$0,${enc}d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu ($inp),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu ($inp),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,($ivp) # save IV
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
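+# For reference, .Lcbc_enc_loop/.Lcbc_dec_loop above implement plain CBC
+# chaining on whole 16-byte blocks only. A C sketch of the same logic,
+# with hypothetical xor_16()/encrypt_block()/decrypt_block() helpers
+# standing in for the vpaes cores:
+#
+#	for (; len >= 16; in += 16, out += 16, len -= 16) {
+#		if (enc) {
+#			xor_16(iv, in);		/* iv ^= P        */
+#			encrypt_block(iv, key);	/* iv  = E(P^IV)  */
+#			memcpy(out, iv, 16);	/* emit C, chain  */
+#		} else {
+#			memcpy(tmp, in, 16);
+#			decrypt_block(tmp, key);
+#			xor_16(tmp, iv);	/* P = D(C) ^ IV  */
+#			memcpy(iv, in, 16);	/* chain on C     */
+#			memcpy(out, tmp, 16);
+#		}
+#	}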
+$code.=<<___;
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_preheat,\@abi-omnipotent
+.align 16
+_vpaes_preheat:
+ lea .Lk_s0F(%rip), %r10
+ movdqa -0x20(%r10), %xmm10 # .Lk_inv
+ movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
+ movdqa 0x00(%r10), %xmm9 # .Lk_s0F
+ movdqa 0x30(%r10), %xmm13 # .Lk_sb1
+ movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
+ movdqa 0x50(%r10), %xmm15 # .Lk_sb2
+ movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+.type _vpaes_consts,\@object
+.align 64
+_vpaes_consts:
+.Lk_inv: # inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F: # s0F
+ .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt: # input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1: # sb1u, sb1t
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2: # sb2u, sb2t
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo: # sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward: # mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr: # sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon: # rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63: # s63: all equal to 0x63 transformed
+ .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt: # output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+.Lk_dksd: # decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: # decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: # decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+## Decryption stuff
+## Round function constants
+##
+.Lk_dipt: # decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo: # decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xb8(%rax),%rax # adjust stack pointer
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_${PREFIX}_set_encrypt_key
+ .rva .LSEH_info_${PREFIX}_set_encrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_${PREFIX}_set_decrypt_key
+ .rva .LSEH_info_${PREFIX}_set_decrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_encrypt
+ .rva .LSEH_end_${PREFIX}_encrypt
+ .rva .LSEH_info_${PREFIX}_encrypt
+
+ .rva .LSEH_begin_${PREFIX}_decrypt
+ .rva .LSEH_end_${PREFIX}_decrypt
+ .rva .LSEH_info_${PREFIX}_decrypt
+
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_${PREFIX}_cbc_encrypt
+
+.section .xdata
+.align 8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
new file mode 100644
index 0000000..5a83107
--- /dev/null
+++ b/crypto/arm_arch.h
@@ -0,0 +1,51 @@
+#ifndef __ARM_ARCH_H__
+#define __ARM_ARCH_H__
+
+#if !defined(__ARM_ARCH__)
+# if defined(__CC_ARM)
+# define __ARM_ARCH__ __TARGET_ARCH_ARM
+# if defined(__BIG_ENDIAN)
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
+# elif defined(__GNUC__)
+ /*
+ * Why doesn't gcc define __ARM_ARCH__? Instead it defines a
+ * bunch of the macros below. See the all_architectures[] table in
+ * gcc/config/arm/arm.c. On a side note, it defines
+ * __ARMEL__/__ARMEB__ for little-/big-endian.
+ */
+# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
+ defined(__ARM_ARCH_7EM__)
+# define __ARM_ARCH__ 7
+# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
+ defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
+ defined(__ARM_ARCH_6T2__)
+# define __ARM_ARCH__ 6
+# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
+ defined(__ARM_ARCH_5TEJ__)
+# define __ARM_ARCH__ 5
+# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+# define __ARM_ARCH__ 4
+# else
+# error "unsupported ARM architecture"
+# endif
+# endif
+#endif
+
+#ifdef OPENSSL_FIPSCANISTER
+#include <openssl/fipssyms.h>
+#endif
+
+#if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+
+#define ARMV7_NEON (1<<0)
+#define ARMV7_TICK (1<<1)
+#endif
+
+#endif
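+
+/* Usage sketch (an illustration only, with neon_path()/generic_path()
+ * as hypothetical dispatch targets): code can branch on the capability
+ * bits at run time, e.g.
+ *
+ *	if (OPENSSL_armcap_P & ARMV7_NEON)
+ *		neon_path();
+ *	else
+ *		generic_path();
+ */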
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
new file mode 100644
index 0000000..2d618de
--- /dev/null
+++ b/crypto/armv4cpuid.S
@@ -0,0 +1,154 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ .word 0xf26ee1fe @ vorr q15,q15,q15
+ .word 0xe12fff1e @ bx lr
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+ mrc p15,0,r0,c9,c13,0
+ .word 0xe12fff1e @ bx lr
+.size _armv7_tick,.-_armv7_tick
+
+.global OPENSSL_atomic_add
+.type OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd: ldrex r2,[r0]
+ add r3,r2,r1
+ strex r2,r3,[r0]
+ cmp r2,#0
+ bne .Ladd
+ mov r0,r3
+ .word 0xe12fff1e @ bx lr
+#else
+ stmdb sp!,{r4-r6,lr}
+ ldr r2,.Lspinlock
+ adr r3,.Lspinlock
+ mov r4,r0
+ mov r5,r1
+ add r6,r3,r2 @ &spinlock
+ b .+8
+.Lspin: bl sched_yield
+ mov r0,#-1
+ swp r0,r0,[r6]
+ cmp r0,#0
+ bne .Lspin
+
+ ldr r2,[r4]
+ add r2,r2,r5
+ str r2,[r4]
+ str r0,[r6] @ release spinlock
+ ldmia sp!,{r4-r6,lr}
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
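+
+@ The ARMv6+ path above is the standard load-exclusive/store-exclusive
+@ retry loop, while the pre-ARMv6 path falls back to a SWP-based
+@ spinlock. As a C sketch, the function behaves like the GCC builtin:
+@
+@	int OPENSSL_atomic_add(int *addr, int amount)
+@	{ return __sync_add_and_fetch(addr, amount); }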
+
+.global OPENSSL_cleanse
+.type OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+ eor ip,ip,ip
+ cmp r1,#7
+ subhs r1,r1,#4
+ bhs .Lot
+ cmp r1,#0
+ beq .Lcleanse_done
+.Little:
+ strb ip,[r0],#1
+ subs r1,r1,#1
+ bhi .Little
+ b .Lcleanse_done
+
+.Lot: tst r0,#3
+ beq .Laligned
+ strb ip,[r0],#1
+ sub r1,r1,#1
+ b .Lot
+.Laligned:
+ str ip,[r0],#4
+ subs r1,r1,#4
+ bhs .Laligned
+ adds r1,r1,#4
+ bne .Little
+.Lcleanse_done:
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
+
+.global OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+ ldr r0,.LOPENSSL_armcap
+ adr r1,.LOPENSSL_armcap
+ ldr r0,[r1,r0]
+ eor r2,r2,r2
+ eor r3,r3,r3
+ eor ip,ip,ip
+ tst r0,#1
+ beq .Lwipe_done
+ .word 0xf3000150 @ veor q0, q0, q0
+ .word 0xf3022152 @ veor q1, q1, q1
+ .word 0xf3044154 @ veor q2, q2, q2
+ .word 0xf3066156 @ veor q3, q3, q3
+ .word 0xf34001f0 @ veor q8, q8, q8
+ .word 0xf34221f2 @ veor q9, q9, q9
+ .word 0xf34441f4 @ veor q10, q10, q10
+ .word 0xf34661f6 @ veor q11, q11, q11
+ .word 0xf34881f8 @ veor q12, q12, q12
+ .word 0xf34aa1fa @ veor q13, q13, q13
+ .word 0xf34cc1fc @ veor q14, q14, q14
+ .word 0xf34ee1fe @ veor q15, q15, q15
+.Lwipe_done:
+ mov r0,sp
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LOPENSSL_armcap
+#if __ARM_ARCH__>=6
+.align 5
+#else
+.Lspinlock:
+.word atomic_add_spinlock-.Lspinlock
+.align 5
+
+.data
+.align 2
+atomic_add_spinlock:
+.word 0
+#endif
+
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
diff --git a/crypto/asn1/a_d2i_fp.c b/crypto/asn1/a_d2i_fp.c
index ece40bc..52b2ebd 100644
--- a/crypto/asn1/a_d2i_fp.c
+++ b/crypto/asn1/a_d2i_fp.c
@@ -57,6 +57,7 @@
*/
#include <stdio.h>
+#include <limits.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include <openssl/asn1_mac.h>
@@ -143,17 +144,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
BUF_MEM *b;
unsigned char *p;
int i;
- int ret=-1;
ASN1_const_CTX c;
- int want=HEADER_SIZE;
+ size_t want=HEADER_SIZE;
int eos=0;
-#if defined(__GNUC__) && defined(__ia64)
- /* pathetic compiler bug in all known versions as of Nov. 2002 */
- long off=0;
-#else
- int off=0;
-#endif
- int len=0;
+ size_t off=0;
+ size_t len=0;
b=BUF_MEM_new();
if (b == NULL)
@@ -169,7 +164,7 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
want-=(len-off);
- if (!BUF_MEM_grow_clean(b,len+want))
+ if (len + want < len || !BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
goto err;
@@ -181,7 +176,14 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
goto err;
}
if (i > 0)
+ {
+ if (len+i < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
len+=i;
+ }
}
/* else data already loaded */
@@ -206,6 +208,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
/* no data body so go round again */
eos++;
+ if (eos < 0)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_HEADER_TOO_LONG);
+ goto err;
+ }
want=HEADER_SIZE;
}
else if (eos && (c.slen == 0) && (c.tag == V_ASN1_EOC))
@@ -220,10 +227,16 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
else
{
/* suck in c.slen bytes of data */
- want=(int)c.slen;
+ want=c.slen;
if (want > (len-off))
{
want-=(len-off);
+ if (want > INT_MAX /* BIO_read takes an int length */ ||
+ len+want < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
if (!BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
@@ -238,11 +251,18 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
ASN1_R_NOT_ENOUGH_DATA);
goto err;
}
+ /* This can't overflow because
+ * |len+want| didn't overflow. */
len+=i;
- want -= i;
+ want-=i;
}
}
- off+=(int)c.slen;
+ if (off + c.slen < off)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+ off+=c.slen;
if (eos <= 0)
{
break;
@@ -252,9 +272,15 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
}
}
+ if (off > INT_MAX)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+
*pb = b;
return off;
err:
if (b != NULL) BUF_MEM_free(b);
- return(ret);
+ return -1;
}
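
The overflow checks added above share one idiom: with unsigned types,
a + b wrapped around exactly when the sum compares smaller than an
operand. A minimal sketch of the guard, assuming size_t operands as in
the patched function:

	size_t len, want;	/* as in asn1_d2i_read_bio() */
	if (len + want < len)	/* sum wrapped past SIZE_MAX */
		return -1;	/* reject rather than under-allocate */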
diff --git a/crypto/asn1/a_digest.c b/crypto/asn1/a_digest.c
index d00d9e2..cbdeea6 100644
--- a/crypto/asn1/a_digest.c
+++ b/crypto/asn1/a_digest.c
@@ -87,7 +87,8 @@ int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data,
p=str;
i2d(data,&p);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
@@ -104,7 +105,8 @@ int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn,
i=ASN1_item_i2d(asn,&str, it);
if (!str) return(0);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
diff --git a/crypto/asn1/a_int.c b/crypto/asn1/a_int.c
index 3348b87..ad0d250 100644
--- a/crypto/asn1/a_int.c
+++ b/crypto/asn1/a_int.c
@@ -386,8 +386,8 @@ long ASN1_INTEGER_get(const ASN1_INTEGER *a)
if (a->length > (int)sizeof(long))
{
- /* hmm... a bit ugly */
- return(0xffffffffL);
+ /* hmm... a bit ugly, return all ones */
+ return -1;
}
if (a->data == NULL)
return 0;
diff --git a/crypto/asn1/a_sign.c b/crypto/asn1/a_sign.c
index ff63bfc..7b4a193 100644
--- a/crypto/asn1/a_sign.c
+++ b/crypto/asn1/a_sign.c
@@ -184,9 +184,9 @@ int ASN1_sign(i2d_of_void *i2d, X509_ALGOR *algor1, X509_ALGOR *algor2,
p=buf_in;
i2d(data,&p);
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
+ if (!EVP_SignInit_ex(&ctx,type, NULL)
+ || !EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl)
+ || !EVP_SignFinal(&ctx,(unsigned char *)buf_out,
(unsigned int *)&outl,pkey))
{
outl=0;
@@ -218,65 +218,100 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
const EVP_MD *type)
{
EVP_MD_CTX ctx;
+ EVP_MD_CTX_init(&ctx);
+ if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey))
+ {
+ EVP_MD_CTX_cleanup(&ctx);
+ return 0;
+ }
+ return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx);
+ }
+
+
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+ X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx)
+ {
+ const EVP_MD *type;
+ EVP_PKEY *pkey;
unsigned char *buf_in=NULL,*buf_out=NULL;
- int inl=0,outl=0,outll=0;
+ size_t inl=0,outl=0,outll=0;
int signid, paramtype;
+ int rv;
+
+ type = EVP_MD_CTX_md(ctx);
+ pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx);
- if (type == NULL)
+ if (!type || !pkey)
{
- int def_nid;
- if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
- type = EVP_get_digestbynid(def_nid);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED);
+ return 0;
}
- if (type == NULL)
+ if (pkey->ameth->item_sign)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN, ASN1_R_NO_DEFAULT_DIGEST);
- return 0;
+ rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2,
+ signature);
+ if (rv == 1)
+ outl = signature->length;
+ /* Return value meanings:
+ * <=0: error.
+ * 1: method does everything.
+ * 2: carry on as normal.
+ * 3: ASN1 method sets algorithm identifiers: just sign.
+ */
+ if (rv <= 0)
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+ if (rv <= 1)
+ goto err;
}
+ else
+ rv = 2;
- if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
+ if (rv == 2)
{
- if (!pkey->ameth ||
- !OBJ_find_sigid_by_algs(&signid, EVP_MD_nid(type),
- pkey->ameth->pkey_id))
+ if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,
- ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
- return 0;
+ if (!pkey->ameth ||
+ !OBJ_find_sigid_by_algs(&signid,
+ EVP_MD_nid(type),
+ pkey->ameth->pkey_id))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,
+ ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+ return 0;
+ }
}
- }
- else
- signid = type->pkey_type;
+ else
+ signid = type->pkey_type;
- if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
- paramtype = V_ASN1_NULL;
- else
- paramtype = V_ASN1_UNDEF;
+ if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
+ paramtype = V_ASN1_NULL;
+ else
+ paramtype = V_ASN1_UNDEF;
- if (algor1)
- X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
- if (algor2)
- X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor1)
+ X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor2)
+ X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+
+ }
- EVP_MD_CTX_init(&ctx);
inl=ASN1_item_i2d(asn,&buf_in, it);
outll=outl=EVP_PKEY_size(pkey);
- buf_out=(unsigned char *)OPENSSL_malloc((unsigned int)outl);
+ buf_out=OPENSSL_malloc((unsigned int)outl);
if ((buf_in == NULL) || (buf_out == NULL))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_MALLOC_FAILURE);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_MALLOC_FAILURE);
goto err;
}
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
- (unsigned int *)&outl,pkey))
+ if (!EVP_DigestSignUpdate(ctx, buf_in, inl)
+ || !EVP_DigestSignFinal(ctx, buf_out, &outl))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_EVP_LIB);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_EVP_LIB);
goto err;
}
if (signature->data != NULL) OPENSSL_free(signature->data);
@@ -289,7 +324,7 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
signature->flags&= ~(ASN1_STRING_FLAG_BITS_LEFT|0x07);
signature->flags|=ASN1_STRING_FLAG_BITS_LEFT;
err:
- EVP_MD_CTX_cleanup(&ctx);
+ EVP_MD_CTX_cleanup(ctx);
if (buf_in != NULL)
{ OPENSSL_cleanse((char *)buf_in,(unsigned int)inl); OPENSSL_free(buf_in); }
if (buf_out != NULL)
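
The new ASN1_item_sign_ctx() entry point takes a caller-configured
digest context instead of a bare EVP_MD, and ASN1_item_sign() above is
now a thin wrapper around it. A usage sketch (error handling elided;
assumes pkey is an initialised EVP_PKEY and EVP_sha256() is just an
example digest):

	EVP_MD_CTX ctx;
	EVP_MD_CTX_init(&ctx);
	if (EVP_DigestSignInit(&ctx, NULL, EVP_sha256(), NULL, pkey)
	    && ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx))
		; /* signature now holds the result */
	/* ctx is cleaned up inside ASN1_item_sign_ctx() */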
diff --git a/crypto/asn1/a_strex.c b/crypto/asn1/a_strex.c
index 264ebf2..ead37ac 100644
--- a/crypto/asn1/a_strex.c
+++ b/crypto/asn1/a_strex.c
@@ -567,6 +567,7 @@ int ASN1_STRING_to_UTF8(unsigned char **out, ASN1_STRING *in)
if(mbflag == -1) return -1;
mbflag |= MBSTRING_FLAG;
stmp.data = NULL;
+ stmp.length = 0;
ret = ASN1_mbstring_copy(&str, in->data, in->length, mbflag, B_ASN1_UTF8STRING);
if(ret < 0) return ret;
*out = stmp.data;
diff --git a/crypto/asn1/a_verify.c b/crypto/asn1/a_verify.c
index cecdb13..fc84cd3 100644
--- a/crypto/asn1/a_verify.c
+++ b/crypto/asn1/a_verify.c
@@ -101,8 +101,13 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
p=buf_in;
i2d(data,&p);
- EVP_VerifyInit_ex(&ctx,type, NULL);
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_VerifyInit_ex(&ctx,type, NULL)
+ || !EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
@@ -126,16 +131,21 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
#endif
-int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signature,
- void *asn, EVP_PKEY *pkey)
+int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a,
+ ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey)
{
EVP_MD_CTX ctx;
- const EVP_MD *type = NULL;
unsigned char *buf_in=NULL;
int ret= -1,inl;
int mdnid, pknid;
+ if (!pkey)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_PASSED_NULL_PARAMETER);
+ return -1;
+ }
+
EVP_MD_CTX_init(&ctx);
/* Convert signature OID into digest and public key OIDs */
@@ -144,25 +154,47 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
goto err;
}
- type=EVP_get_digestbynid(mdnid);
- if (type == NULL)
+ if (mdnid == NID_undef)
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
- goto err;
+ if (!pkey->ameth || !pkey->ameth->item_verify)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+ goto err;
+ }
+ ret = pkey->ameth->item_verify(&ctx, it, asn, a,
+ signature, pkey);
+		/* Return value of 2 means carry on; anything else means we
+		 * exit straight away: either a fatal error occurred or the
+		 * underlying verification routine handles all verification.
+ */
+ if (ret != 2)
+ goto err;
+ ret = -1;
}
-
- /* Check public key OID matches public key type */
- if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ else
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
- goto err;
- }
+ const EVP_MD *type;
+ type=EVP_get_digestbynid(mdnid);
+ if (type == NULL)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+ goto err;
+ }
+
+ /* Check public key OID matches public key type */
+ if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
+ goto err;
+ }
+
+ if (!EVP_DigestVerifyInit(&ctx, NULL, type, NULL, pkey))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
- if (!EVP_VerifyInit_ex(&ctx,type, NULL))
- {
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
- ret=0;
- goto err;
}
inl = ASN1_item_i2d(asn, &buf_in, it);
@@ -173,13 +205,18 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
goto err;
}
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_DigestVerifyUpdate(&ctx,buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
- if (EVP_VerifyFinal(&ctx,(unsigned char *)signature->data,
- (unsigned int)signature->length,pkey) <= 0)
+ if (EVP_DigestVerifyFinal(&ctx,signature->data,
+ (size_t)signature->length) <= 0)
{
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
ret=0;
diff --git a/crypto/asn1/ameth_lib.c b/crypto/asn1/ameth_lib.c
index 5a581b9..a19e058 100644
--- a/crypto/asn1/ameth_lib.c
+++ b/crypto/asn1/ameth_lib.c
@@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
/* Keep this sorted in type order !! */
static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
@@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
#ifndef OPENSSL_NO_EC
&eckey_asn1_meth,
#endif
- &hmac_asn1_meth
+ &hmac_asn1_meth,
+ &cmac_asn1_meth
};
typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
@@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
if (!ameth)
return NULL;
+ memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD));
+
ameth->pkey_id = id;
ameth->pkey_base_id = id;
ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
@@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
ameth->old_priv_encode = 0;
ameth->old_priv_decode = 0;
+ ameth->item_verify = 0;
+ ameth->item_sign = 0;
+
ameth->pkey_size = 0;
ameth->pkey_bits = 0;
@@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst,
dst->pkey_free = src->pkey_free;
dst->pkey_ctrl = src->pkey_ctrl;
+ dst->item_sign = src->item_sign;
+ dst->item_verify = src->item_verify;
+
}
void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
diff --git a/crypto/asn1/asn1.h b/crypto/asn1/asn1.h
index 59540e4..220a0c8 100644
--- a/crypto/asn1/asn1.h
+++ b/crypto/asn1/asn1.h
@@ -235,7 +235,7 @@ typedef struct asn1_object_st
*/
#define ASN1_STRING_FLAG_MSTRING 0x040
/* This is the base type that holds just about everything :-) */
-typedef struct asn1_string_st
+struct asn1_string_st
{
int length;
int type;
@@ -245,7 +245,7 @@ typedef struct asn1_string_st
* input data has a non-zero 'unused bits' value, it will be
* handled correctly */
long flags;
- } ASN1_STRING;
+ };
/* ASN1_ENCODING structure: this is used to save the received
* encoding of an ASN1 type. This is useful to get round
@@ -293,7 +293,6 @@ DECLARE_STACK_OF(ASN1_STRING_TABLE)
* see asn1t.h
*/
typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE;
-typedef struct ASN1_ITEM_st ASN1_ITEM;
typedef struct ASN1_TLC_st ASN1_TLC;
/* This is just an opaque pointer */
typedef struct ASN1_VALUE_st ASN1_VALUE;
@@ -1194,6 +1193,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_ASN1_ITEM_I2D_FP 193
#define ASN1_F_ASN1_ITEM_PACK 198
#define ASN1_F_ASN1_ITEM_SIGN 195
+#define ASN1_F_ASN1_ITEM_SIGN_CTX 220
#define ASN1_F_ASN1_ITEM_UNPACK 199
#define ASN1_F_ASN1_ITEM_VERIFY 197
#define ASN1_F_ASN1_MBSTRING_NCOPY 122
@@ -1266,6 +1266,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_PKCS5_PBE2_SET_IV 167
#define ASN1_F_PKCS5_PBE_SET 202
#define ASN1_F_PKCS5_PBE_SET0_ALGOR 215
+#define ASN1_F_PKCS5_PBKDF2_SET 219
#define ASN1_F_SMIME_READ_ASN1 212
#define ASN1_F_SMIME_TEXT 213
#define ASN1_F_X509_CINF_NEW 168
@@ -1291,6 +1292,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_R_BOOLEAN_IS_WRONG_LENGTH 106
#define ASN1_R_BUFFER_TOO_SMALL 107
#define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER 108
+#define ASN1_R_CONTEXT_NOT_INITIALISED 217
#define ASN1_R_DATA_IS_WRONG 109
#define ASN1_R_DECODE_ERROR 110
#define ASN1_R_DECODING_ERROR 111
diff --git a/crypto/asn1/asn1_err.c b/crypto/asn1/asn1_err.c
index 6e04d08..1a30bf1 100644
--- a/crypto/asn1/asn1_err.c
+++ b/crypto/asn1/asn1_err.c
@@ -1,6 +1,6 @@
/* crypto/asn1/asn1_err.c */
/* ====================================================================
- * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -107,6 +107,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP), "ASN1_item_i2d_fp"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_PACK), "ASN1_item_pack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN), "ASN1_item_sign"},
+{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX), "ASN1_item_sign_ctx"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK), "ASN1_item_unpack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY), "ASN1_item_verify"},
{ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY), "ASN1_mbstring_ncopy"},
@@ -179,6 +180,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV), "PKCS5_pbe2_set_iv"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET), "PKCS5_pbe_set"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR), "PKCS5_pbe_set0_algor"},
+{ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET), "PKCS5_pbkdf2_set"},
{ERR_FUNC(ASN1_F_SMIME_READ_ASN1), "SMIME_read_ASN1"},
{ERR_FUNC(ASN1_F_SMIME_TEXT), "SMIME_text"},
{ERR_FUNC(ASN1_F_X509_CINF_NEW), "X509_CINF_NEW"},
@@ -207,6 +209,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]=
{ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"},
{ERR_REASON(ASN1_R_BUFFER_TOO_SMALL) ,"buffer too small"},
{ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),"cipher has no object identifier"},
+{ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED),"context not initialised"},
{ERR_REASON(ASN1_R_DATA_IS_WRONG) ,"data is wrong"},
{ERR_REASON(ASN1_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(ASN1_R_DECODING_ERROR) ,"decoding error"},
diff --git a/crypto/asn1/asn1_locl.h b/crypto/asn1/asn1_locl.h
index 5aa65e2..9fcf0d9 100644
--- a/crypto/asn1/asn1_locl.h
+++ b/crypto/asn1/asn1_locl.h
@@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st
int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
ASN1_PCTX *pctx);
+ int (*sig_print)(BIO *out,
+ const X509_ALGOR *sigalg, const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx);
+
void (*pkey_free)(EVP_PKEY *pkey);
int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
@@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st
int (*old_priv_decode)(EVP_PKEY *pkey,
const unsigned char **pder, int derlen);
int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+ /* Custom ASN1 signature verification */
+ int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *a, ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey);
+ int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *alg1, X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig);
} /* EVP_PKEY_ASN1_METHOD */;
diff --git a/crypto/asn1/asn_mime.c b/crypto/asn1/asn_mime.c
index c1d1b12..54a704a 100644
--- a/crypto/asn1/asn_mime.c
+++ b/crypto/asn1/asn_mime.c
@@ -377,8 +377,12 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
BIO *tmpbio;
const ASN1_AUX *aux = it->funcs;
ASN1_STREAM_ARG sarg;
+ int rv = 1;
- if (!(flags & SMIME_DETACHED))
+	/* If the data is not detached, or we are resigning, then the output BIO is
+ * already set up to finalise when it is written through.
+ */
+ if (!(flags & SMIME_DETACHED) || (flags & PKCS7_REUSE_DIGEST))
{
SMIME_crlf_copy(data, out, flags);
return 1;
@@ -405,7 +409,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
/* Finalize structure */
if (aux->asn1_cb(ASN1_OP_DETACHED_POST, &val, it, &sarg) <= 0)
- return 0;
+ rv = 0;
/* Now remove any digests prepended to the BIO */
@@ -416,7 +420,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
sarg.ndef_bio = tmpbio;
}
- return 1;
+ return rv;
}
@@ -486,9 +490,9 @@ ASN1_VALUE *SMIME_read_ASN1(BIO *bio, BIO **bcont, const ASN1_ITEM *it)
if(strcmp(hdr->value, "application/x-pkcs7-signature") &&
strcmp(hdr->value, "application/pkcs7-signature")) {
- sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
ASN1err(ASN1_F_SMIME_READ_ASN1,ASN1_R_SIG_INVALID_MIME_TYPE);
ERR_add_error_data(2, "type: ", hdr->value);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
sk_BIO_pop_free(parts, BIO_vfree);
return NULL;
}
@@ -801,7 +805,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(name) {
if(!(tmpname = BUF_strdup(name))) return NULL;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -811,7 +815,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(value) {
if(!(tmpval = BUF_strdup(value))) return NULL;
for(p = tmpval ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -835,7 +839,7 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
tmpname = BUF_strdup(name);
if(!tmpname) return 0;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -858,12 +862,17 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
static int mime_hdr_cmp(const MIME_HEADER * const *a,
const MIME_HEADER * const *b)
{
+ if (!(*a)->name || !(*b)->name)
+ return !!(*a)->name - !!(*b)->name;
+
return(strcmp((*a)->name, (*b)->name));
}
static int mime_param_cmp(const MIME_PARAM * const *a,
const MIME_PARAM * const *b)
{
+ if (!(*a)->param_name || !(*b)->param_name)
+ return !!(*a)->param_name - !!(*b)->param_name;
return(strcmp((*a)->param_name, (*b)->param_name));
}
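The guards added to mime_hdr_cmp and mime_param_cmp use the `!!a - !!b` idiom: `!!p` normalises a pointer to 0 or 1, so entries with a NULL name sort before named ones and strcmp is never handed a NULL pointer. A standalone sketch of the idiom (not OpenSSL code):

    #include <stdio.h>
    #include <string.h>

    /* Order NULL names first, otherwise compare lexically. */
    static int name_cmp(const char *a, const char *b)
    {
        if (a == NULL || b == NULL)
            return !!a - !!b;   /* 0-0=0, 0-1=-1, 1-0=1 */
        return strcmp(a, b);
    }

    int main(void)
    {
        printf("%d\n", name_cmp(NULL, "ct"));  /* negative: NULL sorts first */
        printf("%d\n", name_cmp(NULL, NULL));  /* zero */
        printf("%d\n", name_cmp("a", "b"));    /* negative */
        return 0;
    }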
diff --git a/crypto/asn1/n_pkey.c b/crypto/asn1/n_pkey.c
index e7d0439..e251739 100644
--- a/crypto/asn1/n_pkey.c
+++ b/crypto/asn1/n_pkey.c
@@ -129,6 +129,7 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
unsigned char buf[256],*zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
if (a == NULL) return(0);
@@ -206,24 +207,28 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
i = strlen((char *)buf);
/* If the key is used for SGC the algorithm is modified a little. */
if(sgckey) {
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
/* Encrypt private key in place */
zz = enckey->enckey->digest->data;
- EVP_CIPHER_CTX_init(&ctx);
- EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL);
- EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen);
- EVP_EncryptFinal_ex(&ctx,zz + i,&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL))
+ goto err;
+ if (!EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen))
+ goto err;
+ if (!EVP_EncryptFinal_ex(&ctx,zz + i,&j))
+ goto err;
ret = i2d_NETSCAPE_ENCRYPTED_PKEY(enckey, pp);
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_ENCRYPTED_PKEY_free(enckey);
NETSCAPE_PKEY_free(pkey);
return(ret);
@@ -288,6 +293,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
const unsigned char *zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
i=cb((char *)buf,256,"Enter Private Key password:",0);
if (i != 0)
@@ -298,19 +304,22 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
i = strlen((char *)buf);
if(sgckey){
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
- EVP_CIPHER_CTX_init(&ctx);
- EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL);
- EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length);
- EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL))
+ goto err;
+ if (!EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j))
+ goto err;
os->length=i+j;
zz=os->data;
@@ -328,6 +337,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
goto err;
}
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_PKEY_free(pkey);
return(ret);
}
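The i2d_RSA_NET and d2i_RSA_NET_2 changes all follow one shape: initialise the EVP_CIPHER_CTX before the first goto err can fire, check the return value of every EVP call, and perform the cleanup once in the error path so it runs on success and failure alike. A condensed sketch of that shape using the 1.0.x stack-allocated API; the function itself is illustrative:

    #include <openssl/evp.h>

    /* RC4-encrypt buf in place with a 16-byte key; returns output length or 0. */
    static int encrypt_in_place(unsigned char *buf, int len,
                                const unsigned char key[16])
    {
        EVP_CIPHER_CTX ctx;
        int outl = 0, tmpl = 0, ret = 0;

        EVP_CIPHER_CTX_init(&ctx);      /* from here on, cleanup is always safe */
        if (!EVP_EncryptInit_ex(&ctx, EVP_rc4(), NULL, key, NULL))
            goto err;
        if (!EVP_EncryptUpdate(&ctx, buf, &outl, buf, len))
            goto err;
        if (!EVP_EncryptFinal_ex(&ctx, buf + outl, &tmpl))
            goto err;
        ret = outl + tmpl;
    err:
        EVP_CIPHER_CTX_cleanup(&ctx);   /* runs on every path */
        return ret;
    }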
diff --git a/crypto/asn1/p5_pbev2.c b/crypto/asn1/p5_pbev2.c
index cb49b66..4ea6830 100644
--- a/crypto/asn1/p5_pbev2.c
+++ b/crypto/asn1/p5_pbev2.c
@@ -91,12 +91,10 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
unsigned char *aiv, int prf_nid)
{
X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL;
- int alg_nid;
+ int alg_nid, keylen;
EVP_CIPHER_CTX ctx;
unsigned char iv[EVP_MAX_IV_LENGTH];
- PBKDF2PARAM *kdf = NULL;
PBE2PARAM *pbe2 = NULL;
- ASN1_OCTET_STRING *osalt = NULL;
ASN1_OBJECT *obj;
alg_nid = EVP_CIPHER_type(cipher);
@@ -127,7 +125,8 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
EVP_CIPHER_CTX_init(&ctx);
/* Dummy cipherinit to just setup the IV, and PRF */
- EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0);
+ if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0))
+ goto err;
if(EVP_CIPHER_param_to_asn1(&ctx, scheme->parameter) < 0) {
ASN1err(ASN1_F_PKCS5_PBE2_SET_IV,
ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
@@ -145,55 +144,21 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
}
EVP_CIPHER_CTX_cleanup(&ctx);
- if(!(kdf = PBKDF2PARAM_new())) goto merr;
- if(!(osalt = M_ASN1_OCTET_STRING_new())) goto merr;
-
- if (!saltlen) saltlen = PKCS5_SALT_LEN;
- if (!(osalt->data = OPENSSL_malloc (saltlen))) goto merr;
- osalt->length = saltlen;
- if (salt) memcpy (osalt->data, salt, saltlen);
- else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) goto merr;
-
- if(iter <= 0) iter = PKCS5_DEFAULT_ITER;
- if(!ASN1_INTEGER_set(kdf->iter, iter)) goto merr;
-
- /* Now include salt in kdf structure */
- kdf->salt->value.octet_string = osalt;
- kdf->salt->type = V_ASN1_OCTET_STRING;
- osalt = NULL;
-
/* If its RC2 then we'd better setup the key length */
- if(alg_nid == NID_rc2_cbc) {
- if(!(kdf->keylength = M_ASN1_INTEGER_new())) goto merr;
- if(!ASN1_INTEGER_set (kdf->keylength,
- EVP_CIPHER_key_length(cipher))) goto merr;
- }
-
- /* prf can stay NULL if we are using hmacWithSHA1 */
- if (prf_nid != NID_hmacWithSHA1)
- {
- kdf->prf = X509_ALGOR_new();
- if (!kdf->prf)
- goto merr;
- X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
- V_ASN1_NULL, NULL);
- }
-
- /* Now setup the PBE2PARAM keyfunc structure */
+ if(alg_nid == NID_rc2_cbc)
+ keylen = EVP_CIPHER_key_length(cipher);
+ else
+ keylen = -1;
- pbe2->keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+ /* Setup keyfunc */
- /* Encode PBKDF2PARAM into parameter of pbe2 */
+ X509_ALGOR_free(pbe2->keyfunc);
- if(!(pbe2->keyfunc->parameter = ASN1_TYPE_new())) goto merr;
+ pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
- if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
- &pbe2->keyfunc->parameter->value.sequence)) goto merr;
- pbe2->keyfunc->parameter->type = V_ASN1_SEQUENCE;
-
- PBKDF2PARAM_free(kdf);
- kdf = NULL;
+ if (!pbe2->keyfunc)
+ goto merr;
/* Now set up top level AlgorithmIdentifier */
@@ -219,8 +184,6 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
err:
PBE2PARAM_free(pbe2);
/* Note 'scheme' is freed as part of pbe2 */
- M_ASN1_OCTET_STRING_free(osalt);
- PBKDF2PARAM_free(kdf);
X509_ALGOR_free(kalg);
X509_ALGOR_free(ret);
@@ -233,3 +196,85 @@ X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
{
return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
}
+
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+ int prf_nid, int keylen)
+ {
+ X509_ALGOR *keyfunc = NULL;
+ PBKDF2PARAM *kdf = NULL;
+ ASN1_OCTET_STRING *osalt = NULL;
+
+ if(!(kdf = PBKDF2PARAM_new()))
+ goto merr;
+ if(!(osalt = M_ASN1_OCTET_STRING_new()))
+ goto merr;
+
+ kdf->salt->value.octet_string = osalt;
+ kdf->salt->type = V_ASN1_OCTET_STRING;
+
+ if (!saltlen)
+ saltlen = PKCS5_SALT_LEN;
+ if (!(osalt->data = OPENSSL_malloc (saltlen)))
+ goto merr;
+
+ osalt->length = saltlen;
+
+ if (salt)
+ memcpy (osalt->data, salt, saltlen);
+ else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0)
+ goto merr;
+
+ if(iter <= 0)
+ iter = PKCS5_DEFAULT_ITER;
+
+ if(!ASN1_INTEGER_set(kdf->iter, iter))
+ goto merr;
+
+ /* If we have a key length, set it up */
+
+ if(keylen > 0)
+ {
+ if(!(kdf->keylength = M_ASN1_INTEGER_new()))
+ goto merr;
+ if(!ASN1_INTEGER_set (kdf->keylength, keylen))
+ goto merr;
+ }
+
+ /* prf can stay NULL if we are using hmacWithSHA1 */
+ if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1)
+ {
+ kdf->prf = X509_ALGOR_new();
+ if (!kdf->prf)
+ goto merr;
+ X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
+ V_ASN1_NULL, NULL);
+ }
+
+ /* Finally setup the keyfunc structure */
+
+ keyfunc = X509_ALGOR_new();
+ if (!keyfunc)
+ goto merr;
+
+ keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+
+ /* Encode PBKDF2PARAM into parameter of pbe2 */
+
+ if(!(keyfunc->parameter = ASN1_TYPE_new()))
+ goto merr;
+
+ if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
+ &keyfunc->parameter->value.sequence))
+ goto merr;
+ keyfunc->parameter->type = V_ASN1_SEQUENCE;
+
+ PBKDF2PARAM_free(kdf);
+ return keyfunc;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_PBKDF2_SET,ERR_R_MALLOC_FAILURE);
+ PBKDF2PARAM_free(kdf);
+ X509_ALGOR_free(keyfunc);
+ return NULL;
+ }
+
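A short usage sketch for the PKCS5_pbkdf2_set() helper introduced above, matching its parameter conventions: salt == NULL requests random salt bytes, saltlen == 0 selects PKCS5_SALT_LEN, prf_nid <= 0 keeps the hmacWithSHA1 default, and keylen <= 0 omits the optional keyLength field.

    #include <openssl/x509.h>

    /* PBKDF2 AlgorithmIdentifier: random default-length salt, 2048
     * iterations, default PRF, no explicit key length. */
    static X509_ALGOR *default_pbkdf2(void)
    {
        return PKCS5_pbkdf2_set(2048, NULL, 0, -1, -1);
    }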
diff --git a/crypto/asn1/t_crl.c b/crypto/asn1/t_crl.c
index ee5a687..c611692 100644
--- a/crypto/asn1/t_crl.c
+++ b/crypto/asn1/t_crl.c
@@ -94,8 +94,7 @@ int X509_CRL_print(BIO *out, X509_CRL *x)
l = X509_CRL_get_version(x);
BIO_printf(out, "%8sVersion %lu (0x%lx)\n", "", l+1, l);
i = OBJ_obj2nid(x->sig_alg->algorithm);
- BIO_printf(out, "%8sSignature Algorithm: %s\n", "",
- (i == NID_undef) ? "NONE" : OBJ_nid2ln(i));
+ X509_signature_print(out, x->sig_alg, NULL);
p=X509_NAME_oneline(X509_CRL_get_issuer(x),NULL,0);
BIO_printf(out,"%8sIssuer: %s\n","",p);
OPENSSL_free(p);
diff --git a/crypto/asn1/t_x509.c b/crypto/asn1/t_x509.c
index e061f2f..edbb39a 100644
--- a/crypto/asn1/t_x509.c
+++ b/crypto/asn1/t_x509.c
@@ -72,6 +72,7 @@
#include <openssl/objects.h>
#include <openssl/x509.h>
#include <openssl/x509v3.h>
+#include "asn1_locl.h"
#ifndef OPENSSL_NO_FP_API
int X509_print_fp(FILE *fp, X509 *x)
@@ -137,10 +138,10 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if (BIO_write(bp," Serial Number:",22) <= 0) goto err;
bs=X509_get_serialNumber(x);
- if (bs->length <= 4)
+ if (bs->length <= (int)sizeof(long))
{
l=ASN1_INTEGER_get(bs);
- if (l < 0)
+ if (bs->type == V_ASN1_NEG_INTEGER)
{
l= -l;
neg="-";
@@ -167,12 +168,16 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if(!(cflag & X509_FLAG_NO_SIGNAME))
{
+ if(X509_signature_print(bp, x->sig_alg, NULL) <= 0)
+ goto err;
+#if 0
if (BIO_printf(bp,"%8sSignature Algorithm: ","") <= 0)
goto err;
if (i2a_ASN1_OBJECT(bp, ci->signature->algorithm) <= 0)
goto err;
if (BIO_puts(bp, "\n") <= 0)
goto err;
+#endif
}
if(!(cflag & X509_FLAG_NO_ISSUER))
@@ -255,7 +260,8 @@ int X509_ocspid_print (BIO *bp, X509 *x)
goto err;
i2d_X509_NAME(x->cert_info->subject, &dertmp);
- EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) goto err;
@@ -268,8 +274,10 @@ int X509_ocspid_print (BIO *bp, X509 *x)
if (BIO_printf(bp,"\n Public key OCSP hash: ") <= 0)
goto err;
- EVP_Digest(x->cert_info->key->public_key->data,
- x->cert_info->key->public_key->length, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(x->cert_info->key->public_key->data,
+ x->cert_info->key->public_key->length,
+ SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0)
@@ -283,23 +291,50 @@ int X509_ocspid_print (BIO *bp, X509 *x)
return(0);
}
-int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+int X509_signature_dump(BIO *bp, const ASN1_STRING *sig, int indent)
{
- unsigned char *s;
+ const unsigned char *s;
int i, n;
- if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
- if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
n=sig->length;
s=sig->data;
+ for (i=0; i<n; i++)
+ {
+ if ((i%18) == 0)
+ {
+ if (BIO_write(bp,"\n",1) <= 0) return 0;
+ if (BIO_indent(bp, indent, indent) <= 0) return 0;
+ }
+ if (BIO_printf(bp,"%02x%s",s[i],
+ ((i+1) == n)?"":":") <= 0) return 0;
+ }
+ if (BIO_write(bp,"\n",1) != 1) return 0;
+
+ return 1;
+ }
+
+int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+ {
+ int sig_nid;
+ if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
+ if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
+
+ sig_nid = OBJ_obj2nid(sigalg->algorithm);
+ if (sig_nid != NID_undef)
+ {
+ int pkey_nid, dig_nid;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ if (OBJ_find_sigid_algs(sig_nid, &dig_nid, &pkey_nid))
+ {
+ ameth = EVP_PKEY_asn1_find(NULL, pkey_nid);
+ if (ameth && ameth->sig_print)
+ return ameth->sig_print(bp, sigalg, sig, 9, 0);
+ }
+ }
+ if (sig)
+ return X509_signature_dump(bp, sig, 9);
+ else if (BIO_puts(bp, "\n") <= 0)
+ return 0;
return 1;
}
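With the rework above, X509_signature_print() first looks for an algorithm-specific sig_print handler and only then falls back to X509_signature_dump(); passing sig == NULL prints just the algorithm line. A small caller sketch against the 1.0.1 structure layout used elsewhere in this patch:

    #include <openssl/bio.h>
    #include <openssl/x509.h>

    /* Print a certificate's signature algorithm and value to out. */
    static int print_cert_signature(BIO *out, X509 *x)
    {
        return X509_signature_print(out, x->sig_alg, x->signature);
    }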
diff --git a/crypto/asn1/tasn_prn.c b/crypto/asn1/tasn_prn.c
index 4536980..542a091 100644
--- a/crypto/asn1/tasn_prn.c
+++ b/crypto/asn1/tasn_prn.c
@@ -446,11 +446,11 @@ static int asn1_print_fsname(BIO *out, int indent,
return 1;
}
-static int asn1_print_boolean_ctx(BIO *out, const int bool,
+static int asn1_print_boolean_ctx(BIO *out, int boolval,
const ASN1_PCTX *pctx)
{
const char *str;
- switch (bool)
+ switch (boolval)
{
case -1:
str = "BOOL ABSENT";
@@ -574,10 +574,10 @@ static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld,
{
case V_ASN1_BOOLEAN:
{
- int bool = *(int *)fld;
- if (bool == -1)
- bool = it->size;
- ret = asn1_print_boolean_ctx(out, bool, pctx);
+ int boolval = *(int *)fld;
+ if (boolval == -1)
+ boolval = it->size;
+ ret = asn1_print_boolean_ctx(out, boolval, pctx);
}
break;
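The bool to boolval rename exists because bool is a keyword in C++ and a macro once <stdbool.h> is in effect, so the old parameter name broke builds that pull these sources into such translation units. A minimal illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* static int print_bool(int bool);  -- error: "bool" expands to _Bool */
    static int print_bool(int boolval)   /* renamed identifier always compiles */
    {
        return printf(boolval ? "TRUE\n" : "FALSE\n") > 0;
    }

    int main(void)
    {
        bool v = true;   /* the macro the old parameter name collided with */
        return !print_bool(v);
    }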
diff --git a/crypto/asn1/x_algor.c b/crypto/asn1/x_algor.c
index 99e5342..274e456 100644
--- a/crypto/asn1/x_algor.c
+++ b/crypto/asn1/x_algor.c
@@ -128,3 +128,17 @@ void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
}
}
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md)
+ {
+ int param_type;
+
+ if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+ param_type = V_ASN1_UNDEF;
+ else
+ param_type = V_ASN1_NULL;
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+ }
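A usage sketch for the new X509_ALGOR_set_md() helper: it fills in the digest OID and chooses between an absent parameter field and an explicit ASN.1 NULL based on the digest's EVP_MD_FLAG_DIGALGID_ABSENT flag.

    #include <openssl/evp.h>
    #include <openssl/x509.h>

    /* DigestAlgorithmIdentifier for SHA-256 (parameters = ASN.1 NULL). */
    static X509_ALGOR *sha256_algor(void)
    {
        X509_ALGOR *alg = X509_ALGOR_new();
        if (alg != NULL)
            X509_ALGOR_set_md(alg, EVP_sha256());
        return alg;
    }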
diff --git a/crypto/asn1/x_name.c b/crypto/asn1/x_name.c
index 49be08b..d7c2318 100644
--- a/crypto/asn1/x_name.c
+++ b/crypto/asn1/x_name.c
@@ -399,8 +399,7 @@ static int asn1_string_canon(ASN1_STRING *out, ASN1_STRING *in)
/* If type not in bitmask just copy string across */
if (!(ASN1_tag2bit(in->type) & ASN1_MASK_CANON))
{
- out->type = in->type;
- if (!ASN1_STRING_set(out, in->data, in->length))
+ if (!ASN1_STRING_copy(out, in))
return 0;
return 1;
}
diff --git a/crypto/asn1/x_pubkey.c b/crypto/asn1/x_pubkey.c
index d42b6a2..b649e1f 100644
--- a/crypto/asn1/x_pubkey.c
+++ b/crypto/asn1/x_pubkey.c
@@ -171,7 +171,19 @@ EVP_PKEY *X509_PUBKEY_get(X509_PUBKEY *key)
goto error;
}
- key->pkey = ret;
+ /* Check to see if another thread set key->pkey first */
+ CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY);
+ if (key->pkey)
+ {
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ EVP_PKEY_free(ret);
+ ret = key->pkey;
+ }
+ else
+ {
+ key->pkey = ret;
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ }
CRYPTO_add(&ret->references, 1, CRYPTO_LOCK_EVP_PKEY);
return ret;
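The X509_PUBKEY_get() change is a decode-then-recheck race fix: the expensive decode happens without the lock held, and the cache slot is re-tested under CRYPTO_LOCK_EVP_PKEY before publishing, discarding the duplicate if another thread won. A generic sketch of the pattern using pthreads rather than the OpenSSL locking API:

    #include <pthread.h>

    static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Publish mine into *slot unless another thread beat us to it. */
    static void *publish_or_adopt(void **slot, void *mine,
                                  void (*discard)(void *))
    {
        void *winner;

        pthread_mutex_lock(&slot_lock);
        if (*slot != NULL)
            {
            winner = *slot;                 /* lost the race */
            pthread_mutex_unlock(&slot_lock);
            discard(mine);                  /* free our duplicate */
            return winner;
            }
        *slot = mine;                       /* won: install our result */
        pthread_mutex_unlock(&slot_lock);
        return mine;
    }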
diff --git a/crypto/bf/asm/bf-586.S b/crypto/bf/asm/bf-586.S
new file mode 100644
index 0000000..aa718d4
--- /dev/null
+++ b/crypto/bf/asm/bf-586.S
@@ -0,0 +1,896 @@
+.file "bf-586.s"
+.text
+.globl BF_encrypt
+.type BF_encrypt,@function
+.align 16
+BF_encrypt:
+.L_BF_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl (%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
+ movl 4(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 8(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 12(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 16(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 20(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 24(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 28(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 32(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 36(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 40(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 44(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 48(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 52(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 56(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 60(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 64(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl 68(%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_encrypt,.-.L_BF_encrypt_begin
+.globl BF_decrypt
+.type BF_decrypt,@function
+.align 16
+BF_decrypt:
+.L_BF_decrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl 68(%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
+ movl 64(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 60(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 56(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 52(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 48(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 44(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 40(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 36(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 32(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 28(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 24(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 20(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 16(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 12(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 8(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 4(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl (%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_decrypt,.-.L_BF_decrypt_begin
+.globl BF_cbc_encrypt
+.type BF_cbc_encrypt,@function
+.align 16
+BF_cbc_encrypt:
+.L_BF_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ movl 48(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L000decrypt
+ andl $4294967288,%ebp
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ jz .L001encrypt_finish
+.L002encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L002encrypt_loop
+.L001encrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ call .L004PIC_point
+.L004PIC_point:
+ popl %edx
+ leal .L005cbc_enc_jmp_table-.L004PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L006ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L007ej6:
+ movb 5(%esi),%dh
+.L008ej5:
+ movb 4(%esi),%dl
+.L009ej4:
+ movl (%esi),%ecx
+ jmp .L010ejend
+.L011ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L012ej2:
+ movb 1(%esi),%ch
+.L013ej1:
+ movb (%esi),%cl
+.L010ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L003finish
+.L000decrypt:
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L014decrypt_finish
+.L015decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L015decrypt_loop
+.L014decrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L016dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L017dj6:
+ movb %dh,5(%edi)
+.L018dj5:
+ movb %dl,4(%edi)
+.L019dj4:
+ movl %ecx,(%edi)
+ jmp .L020djend
+.L021dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L022dj2:
+ movb %ch,1(%esi)
+.L023dj1:
+ movb %cl,(%esi)
+.L020djend:
+ jmp .L003finish
+.L003finish:
+ movl 60(%esp),%ecx
+ addl $24,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L005cbc_enc_jmp_table:
+.long 0
+.long .L013ej1-.L004PIC_point
+.long .L012ej2-.L004PIC_point
+.long .L011ej3-.L004PIC_point
+.long .L009ej4-.L004PIC_point
+.long .L008ej5-.L004PIC_point
+.long .L007ej6-.L004PIC_point
+.long .L006ej7-.L004PIC_point
+.align 64
+.size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin
diff --git a/crypto/bf/bf_skey.c b/crypto/bf/bf_skey.c
index 3673cde..3b0bca4 100644
--- a/crypto/bf/bf_skey.c
+++ b/crypto/bf/bf_skey.c
@@ -58,11 +58,19 @@
#include <stdio.h>
#include <string.h>
+#include <openssl/crypto.h>
#include <openssl/blowfish.h>
#include "bf_locl.h"
#include "bf_pi.h"
void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(BLOWFISH);
+ private_BF_set_key(key, len, data);
+ }
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#endif
{
int i;
BF_LONG *p,ri,in[2];
diff --git a/crypto/bf/blowfish.h b/crypto/bf/blowfish.h
index b97e76f..4b6c892 100644
--- a/crypto/bf/blowfish.h
+++ b/crypto/bf/blowfish.h
@@ -104,7 +104,9 @@ typedef struct bf_key_st
BF_LONG S[4*256];
} BF_KEY;
-
+#ifdef OPENSSL_FIPS
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data);
+#endif
void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
void BF_encrypt(BF_LONG *data,const BF_KEY *key);
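The bf_skey.c and blowfish.h changes use the FIPS wrapper idiom: under OPENSSL_FIPS the public BF_set_key() becomes a small stub that rejects non-approved use and forwards to private_BF_set_key(), which carries the real body; without OPENSSL_FIPS the public name compiles directly over the body. A self-contained sketch of the trick with toy names and a toy key schedule:

    typedef struct { unsigned char k[16]; } TOY_KEY;

    #define TOY_FIPS            /* build with the wrapper enabled */
    static int toy_fips_mode;   /* stand-in for the runtime FIPS switch */

    void private_toy_set_key(TOY_KEY *key, int len, const unsigned char *d);

    void toy_set_key(TOY_KEY *key, int len, const unsigned char *d)
    #ifdef TOY_FIPS
        {
        if (toy_fips_mode)
            return;             /* refuse non-approved cipher in FIPS mode */
        private_toy_set_key(key, len, d);
        }
    void private_toy_set_key(TOY_KEY *key, int len, const unsigned char *d)
    #endif
        {
        int i;
        for (i = 0; i < (int)sizeof(key->k); i++)
            key->k[i] = d[i % (len > 0 ? len : 1)];   /* toy schedule */
        }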
diff --git a/crypto/bio/b_sock.c b/crypto/bio/b_sock.c
index d47310d..470b1a0 100644
--- a/crypto/bio/b_sock.c
+++ b/crypto/bio/b_sock.c
@@ -629,7 +629,8 @@ int BIO_get_accept_socket(char *host, int bind_mode)
struct sockaddr_in6 sa_in6;
#endif
} server,client;
- int s=INVALID_SOCKET,cs,addrlen;
+ int s=INVALID_SOCKET,cs;
+ socklen_t addrlen;
unsigned char ip[4];
unsigned short port;
char *str=NULL,*e;
@@ -704,10 +705,10 @@ int BIO_get_accept_socket(char *host, int bind_mode)
if ((*p_getaddrinfo.f)(h,p,&hint,&res)) break;
- addrlen = res->ai_addrlen<=sizeof(server) ?
+ addrlen = res->ai_addrlen <= (socklen_t)sizeof(server) ?
res->ai_addrlen :
- sizeof(server);
- memcpy(&server, res->ai_addr, addrlen);
+ (socklen_t)sizeof(server);
+ memcpy(&server, res->ai_addr, (size_t)addrlen);
(*p_freeaddrinfo.f)(res);
goto again;
@@ -719,7 +720,7 @@ int BIO_get_accept_socket(char *host, int bind_mode)
memset((char *)&server,0,sizeof(server));
server.sa_in.sin_family=AF_INET;
server.sa_in.sin_port=htons(port);
- addrlen = sizeof(server.sa_in);
+ addrlen = (socklen_t)sizeof(server.sa_in);
if (h == NULL || strcmp(h,"*") == 0)
server.sa_in.sin_addr.s_addr=INADDR_ANY;
@@ -960,7 +961,6 @@ int BIO_set_tcp_ndelay(int s, int on)
#endif
return(ret == 0);
}
-#endif
int BIO_socket_nbio(int s, int mode)
{
@@ -973,3 +973,4 @@ int BIO_socket_nbio(int s, int mode)
#endif
return(ret == 0);
}
+#endif
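The switch from int to socklen_t in BIO_get_accept_socket() matters wherever the two types differ in size or signedness: POSIX accept() and getsockopt() take a socklen_t *, so passing an int * is not portable. A minimal sketch of the correct shape:

    #include <sys/socket.h>
    #include <netinet/in.h>

    /* Accept one connection, using the POSIX-mandated length type. */
    static int accept_one(int listen_fd)
    {
        struct sockaddr_in peer;
        socklen_t addrlen = (socklen_t)sizeof(peer);  /* not plain int */

        return accept(listen_fd, (struct sockaddr *)&peer, &addrlen);
    }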
diff --git a/crypto/bio/bf_buff.c b/crypto/bio/bf_buff.c
index c1fd75a..4b5a132 100644
--- a/crypto/bio/bf_buff.c
+++ b/crypto/bio/bf_buff.c
@@ -209,7 +209,7 @@ static int buffer_write(BIO *b, const char *in, int inl)
/* add to buffer and return */
if (i >= inl)
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,inl);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,inl);
ctx->obuf_len+=inl;
return(num+inl);
}
@@ -219,7 +219,7 @@ static int buffer_write(BIO *b, const char *in, int inl)
{
if (i > 0) /* lets fill it up if we can */
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,i);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,i);
in+=i;
inl-=i;
num+=i;
@@ -294,9 +294,9 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_C_GET_BUFF_NUM_LINES:
ret=0;
p1=ctx->ibuf;
- for (i=ctx->ibuf_off; i<ctx->ibuf_len; i++)
+ for (i=0; i<ctx->ibuf_len; i++)
{
- if (p1[i] == '\n') ret++;
+ if (p1[ctx->ibuf_off + i] == '\n') ret++;
}
break;
case BIO_CTRL_WPENDING:
@@ -399,17 +399,18 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
for (;;)
{
BIO_clear_retry_flags(b);
- if (ctx->obuf_len > ctx->obuf_off)
+ if (ctx->obuf_len > 0)
{
r=BIO_write(b->next_bio,
&(ctx->obuf[ctx->obuf_off]),
- ctx->obuf_len-ctx->obuf_off);
+ ctx->obuf_len);
#if 0
-fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len-ctx->obuf_off,r);
+fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len,r);
#endif
BIO_copy_next_retry(b);
if (r <= 0) return((long)r);
ctx->obuf_off+=r;
+ ctx->obuf_len-=r;
}
else
{
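The bf_buff.c fixes change what obuf_len means: it now counts only the bytes still pending rather than the end offset (the diagram added to bio.h further down in this patch spells this out), so appends must land at obuf_off+obuf_len and a flush of r bytes advances the offset and shrinks the length together. A standalone sketch of that bookkeeping:

    #include <string.h>

    struct outbuf { char buf[64]; int off; int len; };  /* off=consumed, len=pending */

    static int outbuf_append(struct outbuf *b, const char *in, int n)
    {
        if (b->off + b->len + n > (int)sizeof(b->buf))
            return 0;                                /* would overflow */
        memcpy(b->buf + b->off + b->len, in, n);     /* append after pending data */
        b->len += n;
        return n;
    }

    static void outbuf_flushed(struct outbuf *b, int r)
    {
        b->off += r;    /* r bytes were written downstream */
        b->len -= r;    /* that much less remains pending */
    }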
diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h
index 152802f..05699ab 100644
--- a/crypto/bio/bio.h
+++ b/crypto/bio/bio.h
@@ -68,6 +68,14 @@
#include <openssl/crypto.h>
+#ifndef OPENSSL_NO_SCTP
+# ifndef OPENSSL_SYS_VMS
+# include <stdint.h>
+# else
+# include <inttypes.h>
+# endif
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -95,6 +103,9 @@ extern "C" {
#define BIO_TYPE_BIO (19|0x0400) /* (half a) BIO pair */
#define BIO_TYPE_LINEBUFFER (20|0x0200) /* filter */
#define BIO_TYPE_DGRAM (21|0x0400|0x0100)
+#ifndef OPENSSL_NO_SCTP
+#define BIO_TYPE_DGRAM_SCTP (24|0x0400|0x0100)
+#endif
#define BIO_TYPE_ASN1 (22|0x0200) /* filter */
#define BIO_TYPE_COMP (23|0x0200) /* filter */
@@ -146,6 +157,7 @@ extern "C" {
/* #endif */
#define BIO_CTRL_DGRAM_QUERY_MTU 40 /* as kernel for current MTU */
+#define BIO_CTRL_DGRAM_GET_FALLBACK_MTU 47
#define BIO_CTRL_DGRAM_GET_MTU 41 /* get cached value for MTU */
#define BIO_CTRL_DGRAM_SET_MTU 42 /* set cached value for
* MTU. want to use this
@@ -161,7 +173,22 @@ extern "C" {
#define BIO_CTRL_DGRAM_SET_PEER 44 /* Destination for the data */
#define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT 45 /* Next DTLS handshake timeout to
- * adjust socket timeouts */
+ * adjust socket timeouts */
+
+#ifndef OPENSSL_NO_SCTP
+/* SCTP stuff */
+#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE 50
+#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY 51
+#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY 52
+#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD 53
+#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO 60
+#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO 61
+#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO 62
+#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO 63
+#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO 64
+#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO 65
+#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN 70
+#endif
/* modifiers */
#define BIO_FP_READ 0x02
@@ -306,6 +333,15 @@ DECLARE_STACK_OF(BIO)
typedef struct bio_f_buffer_ctx_struct
{
+ /* Buffers are setup like this:
+ *
+ * <---------------------- size ----------------------->
+ * +---------------------------------------------------+
+ * | consumed | remaining | free space |
+ * +---------------------------------------------------+
+ * <-- off --><------- len ------->
+ */
+
/* BIO *bio; */ /* this is now in the BIO struct */
int ibuf_size; /* how big is the input buffer */
int obuf_size; /* how big is the output buffer */
@@ -322,6 +358,34 @@ typedef struct bio_f_buffer_ctx_struct
/* Prefix and suffix callback in ASN1 BIO */
typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+#ifndef OPENSSL_NO_SCTP
+/* SCTP parameter structs */
+struct bio_dgram_sctp_sndinfo
+ {
+ uint16_t snd_sid;
+ uint16_t snd_flags;
+ uint32_t snd_ppid;
+ uint32_t snd_context;
+ };
+
+struct bio_dgram_sctp_rcvinfo
+ {
+ uint16_t rcv_sid;
+ uint16_t rcv_ssn;
+ uint16_t rcv_flags;
+ uint32_t rcv_ppid;
+ uint32_t rcv_tsn;
+ uint32_t rcv_cumtsn;
+ uint32_t rcv_context;
+ };
+
+struct bio_dgram_sctp_prinfo
+ {
+ uint16_t pr_policy;
+ uint32_t pr_value;
+ };
+#endif
+
/* connect BIO stuff */
#define BIO_CONN_S_BEFORE 1
#define BIO_CONN_S_GET_IP 2
@@ -619,6 +683,9 @@ BIO_METHOD *BIO_f_linebuffer(void);
BIO_METHOD *BIO_f_nbio_test(void);
#ifndef OPENSSL_NO_DGRAM
BIO_METHOD *BIO_s_datagram(void);
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void);
+#endif
#endif
/* BIO_METHOD *BIO_f_ber(void); */
@@ -661,6 +728,15 @@ int BIO_set_tcp_ndelay(int sock,int turn_on);
BIO *BIO_new_socket(int sock, int close_flag);
BIO *BIO_new_dgram(int fd, int close_flag);
+#ifndef OPENSSL_NO_SCTP
+BIO *BIO_new_dgram_sctp(int fd, int close_flag);
+int BIO_dgram_is_sctp(BIO *bio);
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context);
+int BIO_dgram_sctp_wait_for_dry(BIO *b);
+int BIO_dgram_sctp_msg_waiting(BIO *b);
+#endif
BIO *BIO_new_fd(int fd, int close_flag);
BIO *BIO_new_connect(char *host_port);
BIO *BIO_new_accept(char *host_port);
@@ -725,6 +801,7 @@ void ERR_load_BIO_strings(void);
#define BIO_F_BUFFER_CTRL 114
#define BIO_F_CONN_CTRL 127
#define BIO_F_CONN_STATE 115
+#define BIO_F_DGRAM_SCTP_READ 132
#define BIO_F_FILE_CTRL 116
#define BIO_F_FILE_READ 130
#define BIO_F_LINEBUFFER_CTRL 129
diff --git a/crypto/bio/bio_err.c b/crypto/bio/bio_err.c
index a224edd..0dbfbd8 100644
--- a/crypto/bio/bio_err.c
+++ b/crypto/bio/bio_err.c
@@ -1,6 +1,6 @@
/* crypto/bio/bio_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -95,6 +95,7 @@ static ERR_STRING_DATA BIO_str_functs[]=
{ERR_FUNC(BIO_F_BUFFER_CTRL), "BUFFER_CTRL"},
{ERR_FUNC(BIO_F_CONN_CTRL), "CONN_CTRL"},
{ERR_FUNC(BIO_F_CONN_STATE), "CONN_STATE"},
+{ERR_FUNC(BIO_F_DGRAM_SCTP_READ), "DGRAM_SCTP_READ"},
{ERR_FUNC(BIO_F_FILE_CTRL), "FILE_CTRL"},
{ERR_FUNC(BIO_F_FILE_READ), "FILE_READ"},
{ERR_FUNC(BIO_F_LINEBUFFER_CTRL), "LINEBUFFER_CTRL"},
diff --git a/crypto/bio/bio_lib.c b/crypto/bio/bio_lib.c
index e12bc3a..9c9646a 100644
--- a/crypto/bio/bio_lib.c
+++ b/crypto/bio/bio_lib.c
@@ -521,40 +521,40 @@ void BIO_free_all(BIO *bio)
BIO *BIO_dup_chain(BIO *in)
{
- BIO *ret=NULL,*eoc=NULL,*bio,*new;
+ BIO *ret=NULL,*eoc=NULL,*bio,*new_bio;
for (bio=in; bio != NULL; bio=bio->next_bio)
{
- if ((new=BIO_new(bio->method)) == NULL) goto err;
- new->callback=bio->callback;
- new->cb_arg=bio->cb_arg;
- new->init=bio->init;
- new->shutdown=bio->shutdown;
- new->flags=bio->flags;
+ if ((new_bio=BIO_new(bio->method)) == NULL) goto err;
+ new_bio->callback=bio->callback;
+ new_bio->cb_arg=bio->cb_arg;
+ new_bio->init=bio->init;
+ new_bio->shutdown=bio->shutdown;
+ new_bio->flags=bio->flags;
/* This will let SSL_s_sock() work with stdin/stdout */
- new->num=bio->num;
+ new_bio->num=bio->num;
- if (!BIO_dup_state(bio,(char *)new))
+ if (!BIO_dup_state(bio,(char *)new_bio))
{
- BIO_free(new);
+ BIO_free(new_bio);
goto err;
}
/* copy app data */
- if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new->ex_data,
+ if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new_bio->ex_data,
&bio->ex_data))
goto err;
if (ret == NULL)
{
- eoc=new;
+ eoc=new_bio;
ret=eoc;
}
else
{
- BIO_push(eoc,new);
- eoc=new;
+ BIO_push(eoc,new_bio);
+ eoc=new_bio;
}
}
return(ret);
diff --git a/crypto/bio/bss_bio.c b/crypto/bio/bss_bio.c
index 76bd48e..52ef0eb 100644
--- a/crypto/bio/bss_bio.c
+++ b/crypto/bio/bss_bio.c
@@ -277,10 +277,10 @@ static int bio_read(BIO *bio, char *buf, int size_)
*/
/* WARNING: The non-copying interface is largely untested as of yet
* and may contain bugs. */
-static ssize_t bio_nread0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nread0(BIO *bio, char **buf)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num;
+ ossl_ssize_t num;
BIO_clear_retry_flags(bio);
@@ -315,15 +315,15 @@ static ssize_t bio_nread0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num, available;
+ ossl_ssize_t num, available;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
available = bio_nread0(bio, buf);
if (num > available)
@@ -428,7 +428,7 @@ static int bio_write(BIO *bio, const char *buf, int num_)
* (example usage: bio_nwrite0(), write to buffer, bio_nwrite()
* or just bio_nwrite(), write to buffer)
*/
-static ssize_t bio_nwrite0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf)
{
struct bio_bio_st *b;
size_t num;
@@ -476,15 +476,15 @@ static ssize_t bio_nwrite0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b;
- ssize_t num, space;
+ ossl_ssize_t num, space;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
space = bio_nwrite0(bio, buf);
if (num > space)
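The ossl_ssize_t changes keep the clamp-before-cast pattern visible in bio_nread() and bio_nwrite(): an unsigned request is capped at SSIZE_MAX first, so the conversion to the signed type can never overflow. A sketch, with long standing in for the platform's signed size type:

    #include <limits.h>
    #include <stddef.h>

    #ifndef SSIZE_MAX
    #define SSIZE_MAX LONG_MAX   /* illustrative fallback only */
    #endif

    /* Cap an unsigned count so the signed cast is always in range. */
    static long clamp_request(size_t n)
    {
        return n > (size_t)SSIZE_MAX ? (long)SSIZE_MAX : (long)n;
    }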
diff --git a/crypto/bio/bss_dgram.c b/crypto/bio/bss_dgram.c
index 71ebe98..8990909 100644
--- a/crypto/bio/bss_dgram.c
+++ b/crypto/bio/bss_dgram.c
@@ -70,10 +70,27 @@
#include <sys/timeb.h>
#endif
-#ifdef OPENSSL_SYS_LINUX
+#ifndef OPENSSL_NO_SCTP
+#include
+#include
+#define OPENSSL_SCTP_DATA_CHUNK_TYPE 0x00
+#define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0
+#endif
+
+#if defined(OPENSSL_SYS_LINUX) && !defined(IP_MTU)
#define IP_MTU 14 /* linux is lame */
#endif
+#if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
+/* Standard definition causes type-punning problems. */
+#undef IN6_IS_ADDR_V4MAPPED
+#define s6_addr32 __u6_addr.__u6_addr32
+#define IN6_IS_ADDR_V4MAPPED(a) \
+ (((a)->s6_addr32[0] == 0) && \
+ ((a)->s6_addr32[1] == 0) && \
+ ((a)->s6_addr32[2] == htonl(0x0000ffff)))
+#endif
+
#ifdef WATT32
#define sock_write SockWrite /* Watt-32 uses same names */
#define sock_read SockRead
@@ -88,6 +105,18 @@ static int dgram_new(BIO *h);
static int dgram_free(BIO *data);
static int dgram_clear(BIO *bio);
+#ifndef OPENSSL_NO_SCTP
+static int dgram_sctp_write(BIO *h, const char *buf, int num);
+static int dgram_sctp_read(BIO *h, char *buf, int size);
+static int dgram_sctp_puts(BIO *h, const char *str);
+static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_sctp_new(BIO *h);
+static int dgram_sctp_free(BIO *data);
+#ifdef SCTP_AUTHENTICATION_EVENT
+static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp);
+#endif
+#endif
+
static int BIO_dgram_should_retry(int s);
static void get_current_time(struct timeval *t);
@@ -106,6 +135,22 @@ static BIO_METHOD methods_dgramp=
NULL,
};
+#ifndef OPENSSL_NO_SCTP
+static BIO_METHOD methods_dgramp_sctp=
+ {
+ BIO_TYPE_DGRAM_SCTP,
+ "datagram sctp socket",
+ dgram_sctp_write,
+ dgram_sctp_read,
+ dgram_sctp_puts,
+ NULL, /* dgram_gets, */
+ dgram_sctp_ctrl,
+ dgram_sctp_new,
+ dgram_sctp_free,
+ NULL,
+ };
+#endif
+
typedef struct bio_dgram_data_st
{
union {
@@ -122,6 +167,40 @@ typedef struct bio_dgram_data_st
struct timeval socket_timeout;
} bio_dgram_data;
+#ifndef OPENSSL_NO_SCTP
+typedef struct bio_dgram_sctp_save_message_st
+ {
+ BIO *bio;
+ char *data;
+ int length;
+ } bio_dgram_sctp_save_message;
+
+typedef struct bio_dgram_sctp_data_st
+ {
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sa_in;
+#if OPENSSL_USE_IPV6
+ struct sockaddr_in6 sa_in6;
+#endif
+ } peer;
+ unsigned int connected;
+ unsigned int _errno;
+ unsigned int mtu;
+ struct bio_dgram_sctp_sndinfo sndinfo;
+ struct bio_dgram_sctp_rcvinfo rcvinfo;
+ struct bio_dgram_sctp_prinfo prinfo;
+ void (*handle_notifications)(BIO *bio, void *context, void *buf);
+ void* notification_context;
+ int in_handshake;
+ int ccs_rcvd;
+ int ccs_sent;
+ int save_shutdown;
+ int peer_auth_tested;
+ bio_dgram_sctp_save_message saved_message;
+ } bio_dgram_sctp_data;
+#endif
+
BIO_METHOD *BIO_s_datagram(void)
{
return(&methods_dgramp);
@@ -186,7 +265,7 @@ static void dgram_adjust_rcv_timeout(BIO *b)
{
#if defined(SO_RCVTIMEO)
bio_dgram_data *data = (bio_dgram_data *)b->ptr;
- int sz = sizeof(int);
+ union { size_t s; int i; } sz = {0};
/* Is a timer active? */
if (data->next_timeout.tv_sec > 0 || data->next_timeout.tv_usec > 0)
@@ -196,8 +275,10 @@ static void dgram_adjust_rcv_timeout(BIO *b)
/* Read current socket timeout */
#ifdef OPENSSL_SYS_WINDOWS
int timeout;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); }
else
{
@@ -205,9 +286,12 @@ static void dgram_adjust_rcv_timeout(BIO *b)
data->socket_timeout.tv_usec = (timeout % 1000) * 1000;
}
#else
+ sz.i = sizeof(data->socket_timeout);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
&(data->socket_timeout), (void *)&sz) < 0)
{ perror("getsockopt"); }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ OPENSSL_assert(sz.s<=sizeof(data->socket_timeout));
#endif
/* Get current time */
@@ -376,11 +460,10 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
int *ip;
struct sockaddr *to = NULL;
bio_dgram_data *data = NULL;
-#if defined(IP_MTU_DISCOVER) || defined(IP_MTU)
- long sockopt_val = 0;
- unsigned int sockopt_len = 0;
-#endif
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
+ int sockopt_val = 0;
+ socklen_t sockopt_len; /* assume that a system supporting IP_MTU is
+ * modern enough to define socklen_t */
socklen_t addr_len;
union {
struct sockaddr sa;
@@ -462,7 +545,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
/* (Linux)kernel sets DF bit on outgoing IP packets */
case BIO_CTRL_DGRAM_MTU_DISCOVER:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DO)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -470,7 +553,6 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
break;
}
- sockopt_len = sizeof(sockopt_val);
switch (addr.sa.sa_family)
{
case AF_INET:
@@ -479,7 +561,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
&sockopt_val, sizeof(sockopt_val))) < 0)
perror("setsockopt");
break;
-#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER)
+#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DO)
case AF_INET6:
sockopt_val = IPV6_PMTUDISC_DO;
if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
@@ -496,7 +578,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
#endif
case BIO_CTRL_DGRAM_QUERY_MTU:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -547,6 +629,27 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
#endif
break;
+ case BIO_CTRL_DGRAM_GET_FALLBACK_MTU:
+ switch (data->peer.sa.sa_family)
+ {
+ case AF_INET:
+ ret = 576 - 20 - 8;
+ break;
+#if OPENSSL_USE_IPV6
+ case AF_INET6:
+#ifdef IN6_IS_ADDR_V4MAPPED
+ if (IN6_IS_ADDR_V4MAPPED(&data->peer.sa_in6.sin6_addr))
+ ret = 576 - 20 - 8;
+ else
+#endif
+ ret = 1280 - 40 - 8;
+ break;
+#endif
+ default:
+ ret = 576 - 20 - 8;
+ break;
+ }
+ break;
case BIO_CTRL_DGRAM_GET_MTU:
return data->mtu;
break;
@@ -637,12 +740,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_RECV_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -650,12 +756,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
#if defined(SO_SNDTIMEO)
@@ -675,12 +789,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_SEND_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -688,12 +805,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
case BIO_CTRL_DGRAM_GET_SEND_TIMER_EXP:
@@ -738,6 +863,910 @@ static int dgram_puts(BIO *bp, const char *str)
return(ret);
}
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void)
+ {
+ return(&methods_dgramp_sctp);
+ }
+
+BIO *BIO_new_dgram_sctp(int fd, int close_flag)
+ {
+ BIO *bio;
+ int ret, optval = 20000;
+ int auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunk auth;
+ struct sctp_authchunks *authchunks;
+ socklen_t sockopt_len;
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+#endif
+#endif
+
+ bio=BIO_new(BIO_s_datagram_sctp());
+ if (bio == NULL) return(NULL);
+ BIO_set_fd(bio,fd,close_flag);
+
+ /* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */
+ auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+ auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+
+ /* Test if activation was successful. When using accept(),
+ * SCTP-AUTH has to be activated for the listening socket
+ * already, otherwise the connected socket won't use it. */
+ sockopt_len = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(sockopt_len);
+ memset(authchunks, 0, sockopt_len);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+ p < (unsigned char*) authchunks + sockopt_len;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ OPENSSL_assert(auth_data);
+ OPENSSL_assert(auth_forward);
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_AUTHENTICATION_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(ret >= 0);
+#else
+ sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ event.sctp_authentication_event = 1;
+
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(ret >= 0);
+#endif
+#endif
+
+ /* Disable partial delivery by setting the min size
+ * larger than the max record size of 2^14 + 2048 + 13
+ */
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval, sizeof(optval));
+ OPENSSL_assert(ret >= 0);
+
+ return(bio);
+ }
+
+int BIO_dgram_is_sctp(BIO *bio)
+ {
+ return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP);
+ }
+
+static int dgram_sctp_new(BIO *bi)
+ {
+ bio_dgram_sctp_data *data = NULL;
+
+ bi->init=0;
+ bi->num=0;
+ data = OPENSSL_malloc(sizeof(bio_dgram_sctp_data));
+ if (data == NULL)
+ return 0;
+ memset(data, 0x00, sizeof(bio_dgram_sctp_data));
+#ifdef SCTP_PR_SCTP_NONE
+ data->prinfo.pr_policy = SCTP_PR_SCTP_NONE;
+#endif
+ bi->ptr = data;
+
+ bi->flags=0;
+ return(1);
+ }
+
+static int dgram_sctp_free(BIO *a)
+ {
+ bio_dgram_sctp_data *data;
+
+ if (a == NULL) return(0);
+ if ( ! dgram_clear(a))
+ return 0;
+
+ data = (bio_dgram_sctp_data *)a->ptr;
+ if(data != NULL) OPENSSL_free(data);
+
+ return(1);
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp)
+ {
+ int ret;
+ struct sctp_authkey_event* authkeyevent = &snp->sn_auth_event;
+
+ if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY)
+ {
+ struct sctp_authkeyid authkeyid;
+
+ /* delete key */
+ authkeyid.scact_keynumber = authkeyevent->auth_keynumber;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ }
+ }
+#endif
+
+static int dgram_sctp_read(BIO *b, char *out, int outl)
+ {
+ int ret = 0, n = 0, i, optval;
+ socklen_t optlen;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ union sctp_notification *snp;
+ struct msghdr msg;
+ struct iovec iov;
+ struct cmsghdr *cmsg;
+ char cmsgbuf[512];
+
+ if (out != NULL)
+ {
+ clear_socket_error();
+
+ do
+ {
+ memset(&data->rcvinfo, 0x00, sizeof(struct bio_dgram_sctp_rcvinfo));
+ iov.iov_base = out;
+ iov.iov_len = outl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = 512;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (msg.msg_controllen > 0)
+ {
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+ {
+ if (cmsg->cmsg_level != IPPROTO_SCTP)
+ continue;
+#ifdef SCTP_RCVINFO
+ if (cmsg->cmsg_type == SCTP_RCVINFO)
+ {
+ struct sctp_rcvinfo *rcvinfo;
+
+ rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = rcvinfo->rcv_sid;
+ data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn;
+ data->rcvinfo.rcv_flags = rcvinfo->rcv_flags;
+ data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid;
+ data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn;
+ data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn;
+ data->rcvinfo.rcv_context = rcvinfo->rcv_context;
+ }
+#endif
+#ifdef SCTP_SNDRCV
+ if (cmsg->cmsg_type == SCTP_SNDRCV)
+ {
+ struct sctp_sndrcvinfo *sndrcvinfo;
+
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream;
+ data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn;
+ data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags;
+ data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid;
+ data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn;
+ data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn;
+ data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context;
+ }
+#endif
+ }
+ }
+
+ if (n <= 0)
+ {
+ if (n < 0)
+ ret = n;
+ break;
+ }
+
+ if (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ snp = (union sctp_notification*) out;
+ if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ /* If a message has been delayed until the socket
+ * is dry, it can be sent now.
+ */
+ if (data->saved_message.length > 0)
+ {
+ dgram_sctp_write(data->saved_message.bio, data->saved_message.data,
+ data->saved_message.length);
+ OPENSSL_free(data->saved_message.data);
+ data->saved_message.length = 0;
+ }
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(i >= 0);
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ OPENSSL_assert(i >= 0);
+
+ event.sctp_sender_dry_event = 0;
+
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(i >= 0);
+#endif
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) out);
+
+ memset(out, 0, outl);
+ }
+ else
+ ret += n;
+ }
+ while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR) && (ret < outl));
+
+ if (ret > 0 && !(msg.msg_flags & MSG_EOR))
+ {
+ /* Partial message read, this should never happen! */
+
+ /* The buffer was too small, this means the peer sent
+ * a message that was larger than allowed. */
+ if (ret == outl)
+ return -1;
+
+ /* Test if socket buffer can handle max record
+ * size (2^14 + 2048 + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+		/* Test that the SCTP partial delivery point is not
+		 * below the max record size (2^14 + 2048 + 13)
+		 */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+ &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+ /* Partially delivered notification??? Probably a bug.... */
+ OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Everything seems ok till now, so it's most likely
+ * a message dropped by PR-SCTP.
+ */
+ memset(out, 0, outl);
+ BIO_set_retry_read(b);
+ return -1;
+ }
+
+ BIO_clear_retry_flags(b);
+ if (ret < 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_read(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+
+ /* Test if peer uses SCTP-AUTH before continuing */
+ if (!data->peer_auth_tested)
+ {
+ int ii, auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunks *authchunks;
+
+ optlen = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(optlen);
+		memset(authchunks, 0, optlen);
+ ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS, authchunks, &optlen);
+ OPENSSL_assert(ii >= 0);
+
+ for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+ p < (unsigned char*) authchunks + optlen;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ if (!auth_data || !auth_forward)
+ {
+ BIOerr(BIO_F_DGRAM_SCTP_READ,BIO_R_CONNECT_ERROR);
+ return -1;
+ }
+
+ data->peer_auth_tested = 1;
+ }
+ }
+ return(ret);
+ }
+
+static int dgram_sctp_write(BIO *b, const char *in, int inl)
+ {
+ int ret;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo);
+ struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo);
+ struct bio_dgram_sctp_sndinfo handshake_sinfo;
+ struct iovec iov[1];
+ struct msghdr msg;
+ struct cmsghdr *cmsg;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) + CMSG_SPACE(sizeof(struct sctp_prinfo))];
+ struct sctp_sndinfo *sndinfo;
+ struct sctp_prinfo *prinfo;
+#else
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct sctp_sndrcvinfo *sndrcvinfo;
+#endif
+
+ clear_socket_error();
+
+	/* If we're sending anything other than application data
+	 * (TLS content type 23), disable all user parameters and flags.
+	 */
+ if (in[0] != 23) {
+ memset(&handshake_sinfo, 0x00, sizeof(struct bio_dgram_sctp_sndinfo));
+#ifdef SCTP_SACK_IMMEDIATELY
+ handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY;
+#endif
+ sinfo = &handshake_sinfo;
+ }
+
+ /* If we have to send a shutdown alert message and the
+ * socket is not dry yet, we have to save it and send it
+ * as soon as the socket gets dry.
+ */
+ if (data->save_shutdown && !BIO_dgram_sctp_wait_for_dry(b))
+ {
+ data->saved_message.bio = b;
+ data->saved_message.length = inl;
+ data->saved_message.data = OPENSSL_malloc(inl);
+ memcpy(data->saved_message.data, in, inl);
+ return inl;
+ }
+
+ iov[0].iov_base = (char *)in;
+ iov[0].iov_len = inl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = (caddr_t)cmsgbuf;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+ sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+ memset(sndinfo, 0, sizeof(struct sctp_sndinfo));
+ sndinfo->snd_sid = sinfo->snd_sid;
+ sndinfo->snd_flags = sinfo->snd_flags;
+ sndinfo->snd_ppid = sinfo->snd_ppid;
+ sndinfo->snd_context = sinfo->snd_context;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo));
+
+ cmsg = (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_PRINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo));
+ prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg);
+ memset(prinfo, 0, sizeof(struct sctp_prinfo));
+ prinfo->pr_policy = pinfo->pr_policy;
+ prinfo->pr_value = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo));
+#else
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo->sinfo_stream = sinfo->snd_sid;
+ sndrcvinfo->sinfo_flags = sinfo->snd_flags;
+#ifdef __FreeBSD__
+ sndrcvinfo->sinfo_flags |= pinfo->pr_policy;
+#endif
+ sndrcvinfo->sinfo_ppid = sinfo->snd_ppid;
+ sndrcvinfo->sinfo_context = sinfo->snd_context;
+ sndrcvinfo->sinfo_timetolive = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo));
+#endif
+
+ ret = sendmsg(b->num, &msg, 0);
+
+ BIO_clear_retry_flags(b);
+ if (ret <= 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_write(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+ return(ret);
+ }
+
+static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr)
+ {
+ long ret=1;
+ bio_dgram_sctp_data *data = NULL;
+ socklen_t sockopt_len = 0;
+ struct sctp_authkeyid authkeyid;
+ struct sctp_authkey *authkey;
+
+ data = (bio_dgram_sctp_data *)b->ptr;
+
+ switch (cmd)
+ {
+ case BIO_CTRL_DGRAM_QUERY_MTU:
+		/* Set to the maximum (2^14) and ignore user input
+		 * to enable transport protocol fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_MTU:
+		/* Set to the maximum (2^14) and ignore input
+		 * to enable transport protocol fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_CONNECTED:
+ case BIO_CTRL_DGRAM_CONNECT:
+		/* Always returns -1. */
+ ret = -1;
+ break;
+ case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+		/* SCTP doesn't need the DTLS timer.
+		 * Always returns 1.
+		 */
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+ if (num > 0)
+ data->in_handshake = 1;
+ else
+ data->in_handshake = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY, &data->in_handshake, sizeof(int));
+ break;
+ case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY:
+ /* New shared key for SCTP AUTH.
+ * Returns 0 on success, -1 otherwise.
+ */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Add new key */
+ sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t);
+ authkey = OPENSSL_malloc(sockopt_len);
+ memset(authkey, 0x00, sockopt_len);
+ authkey->sca_keynumber = authkeyid.scact_keynumber + 1;
+#ifndef __FreeBSD__
+ /* This field is missing in FreeBSD 8.2 and earlier,
+ * and FreeBSD 8.3 and higher work without it.
+ */
+ authkey->sca_keylength = 64;
+#endif
+ memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t));
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len);
+ if (ret < 0) break;
+
+ /* Reset active key */
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ break;
+ case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Set active key */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ /* CCS has been sent, so remember that and fall through
+ * to check if we need to deactivate an old key
+ */
+ data->ccs_sent = 1;
+
+ case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Has this command really been called or is this just a fall-through? */
+ if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD)
+ data->ccs_rcvd = 1;
+
+		/* CCS has been both received and sent, so deactivate an old key */
+ if (data->ccs_rcvd == 1 && data->ccs_sent == 1)
+ {
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Deactivate key or delete second last key if
+ * SCTP_AUTHENTICATION_EVENT is not available.
+ */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+#ifdef SCTP_AUTH_DEACTIVATE_KEY
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
+ &authkeyid, sockopt_len);
+ if (ret < 0) break;
+#endif
+#ifndef SCTP_AUTHENTICATION_EVENT
+ if (authkeyid.scact_keynumber > 0)
+ {
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+ }
+#endif
+
+ data->ccs_rcvd = 0;
+ data->ccs_sent = 0;
+ }
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(ptr, &(data->sndinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(&(data->sndinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(ptr, &data->rcvinfo, num);
+
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(&(data->rcvinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(ptr, &(data->prinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(&(data->prinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN:
+		/* Always returns 1. */
+ if (num > 0)
+ data->save_shutdown = 1;
+ else
+ data->save_shutdown = 0;
+ break;
+
+ default:
+		/* Pass to the default ctrl function to
+		 * process commands that are not SCTP-specific.
+		 */
+ ret=dgram_ctrl(b, cmd, num, ptr);
+ break;
+ }
+ return(ret);
+ }
+
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context)
+ {
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+ if (handle_notifications != NULL)
+ {
+ data->handle_notifications = handle_notifications;
+ data->notification_context = context;
+ }
+ else
+ return -1;
+
+ return 0;
+ }
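+
+/* Usage sketch (illustrative only): register an application-level
+ * handler that is invoked for every SCTP notification the BIO reads.
+ * "my_handle_notification" and "my_ctx" are hypothetical caller names.
+ *
+ *	static void my_handle_notification(BIO *bio, void *ctx, void *buf)
+ *		{
+ *		union sctp_notification *snp = buf;
+ *		// inspect snp->sn_header.sn_type here
+ *		}
+ *
+ *	BIO_dgram_sctp_notification_cb(bio, my_handle_notification, my_ctx);
+ */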
+
+int BIO_dgram_sctp_wait_for_dry(BIO *b)
+{
+ int is_dry = 0;
+ int n, sockflags, ret;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* set sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 1;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+
+ /* peek for notification */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return 0;
+ }
+
+ /* if we find a notification, process it and try again if necessary */
+ while (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, 0);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+
+ if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+ is_dry = 1;
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+
+ /* found notification, peek again */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+		/* if we have already seen the dry event, don't wait */
+ if (is_dry)
+ {
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ }
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+
+ if (is_dry)
+ {
+ fcntl(b->num, F_SETFL, sockflags);
+ }
+
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+ }
+
+ /* read anything else */
+ return is_dry;
+}
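+
+/* Usage sketch (illustrative only): poll until the association is dry,
+ * e.g. before deleting an old SCTP-AUTH key.  On a non-blocking socket
+ * a return of 0 simply means "not dry yet"; -1 indicates a socket error.
+ *
+ *	int dry;
+ *	while ((dry = BIO_dgram_sctp_wait_for_dry(bio)) == 0)
+ *		; // retry, possibly after select()/poll() on the socket
+ *	if (dry < 0)
+ *		// handle error
+ */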
+
+int BIO_dgram_sctp_msg_waiting(BIO *b)
+ {
+ int n, sockflags;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* Check if there are any messages waiting to be read */
+ do
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ fcntl(b->num, F_SETFL, sockflags);
+
+ /* if notification, process and try again */
+ if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION))
+ {
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+ }
+
+ } while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Return 1 if there is a message to be read, return 0 otherwise. */
+ if (n > 0)
+ return 1;
+ else
+ return 0;
+ }
+
+static int dgram_sctp_puts(BIO *bp, const char *str)
+ {
+ int n,ret;
+
+ n=strlen(str);
+ ret=dgram_sctp_write(bp,str,n);
+ return(ret);
+ }
+#endif
+
static int BIO_dgram_should_retry(int i)
{
int err;
diff --git a/crypto/bn/asm/armv4-gf2m.S b/crypto/bn/asm/armv4-gf2m.S
new file mode 100644
index 0000000..038f086
--- /dev/null
+++ b/crypto/bn/asm/armv4-gf2m.S
@@ -0,0 +1,213 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	d2,d16,#8		@ q1-q3 are shifted copies of a
+ vmull.p8 q0,d16,d17 @ a·bb
+ vshl.u64 d4,d16,#16
+ vmull.p8 q1,d2,d17 @ a<<8·bb
+ vshl.u64 d6,d16,#24
+ vmull.p8 q2,d4,d17 @ a<<16·bb
+ vshr.u64 d2,#8
+ vmull.p8 q3,d6,d17 @ a<<24·bb
+ vshl.u64 d3,#24
+ veor d0,d2
+ vshr.u64 d4,#16
+ veor d0,d3
+ vshl.u64 d5,#16
+ veor d0,d4
+ vshr.u64 d6,#24
+ veor d0,d5
+ vshl.u64 d7,#8
+ veor d0,d6
+ veor d0,d7
+ .word 0xe12fff1e
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
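+@ mul_1x1_ialu: carry-less 32x32->64-bit multiply (lo in r5, hi in r4).
+@ An 8-entry table of small multiples of a (0, a1, a2, a1^a2, a4, ...)
+@ is built on the stack and indexed by successive 3-bit windows of b;
+@ the top two bits of a, masked off so that the shifted table entries
+@ fit in one word, are folded back in by the conditional "eorne" pairs.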
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov r4,#0
+ bic r5,r1,#3<<30 @ a1=a&0x3fffffff
+ str r4,[sp,#0] @ tab[0]=0
+ add r6,r5,r5 @ a2=a1<<1
+ str r5,[sp,#4] @ tab[1]=a1
+ eor r7,r5,r6 @ a1^a2
+ str r6,[sp,#8] @ tab[2]=a2
+ mov r8,r5,lsl#2 @ a4=a1<<2
+ str r7,[sp,#12] @ tab[3]=a1^a2
+ eor r9,r5,r8 @ a1^a4
+ str r8,[sp,#16] @ tab[4]=a4
+ eor r4,r6,r8 @ a2^a4
+ str r9,[sp,#20] @ tab[5]=a1^a4
+ eor r7,r7,r8 @ a1^a2^a4
+ str r4,[sp,#24] @ tab[6]=a2^a4
+ and r8,r12,r0,lsl#2
+ str r7,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and r9,r12,r0,lsr#1
+ ldr r5,[sp,r8] @ tab[b & 0x7]
+ and r8,r12,r0,lsr#4
+ ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
+ and r9,r12,r0,lsr#7
+ ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
+ eor r5,r5,r7,lsl#3 @ stall
+ mov r4,r7,lsr#29
+ ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
+
+ and r8,r12,r0,lsr#10
+ eor r5,r5,r6,lsl#6
+ eor r4,r4,r6,lsr#26
+ ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
+
+ and r9,r12,r0,lsr#13
+ eor r5,r5,r7,lsl#9
+ eor r4,r4,r7,lsr#23
+ ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
+
+ and r8,r12,r0,lsr#16
+ eor r5,r5,r6,lsl#12
+ eor r4,r4,r6,lsr#20
+ ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
+
+ and r9,r12,r0,lsr#19
+ eor r5,r5,r7,lsl#15
+ eor r4,r4,r7,lsr#17
+ ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
+
+ and r8,r12,r0,lsr#22
+ eor r5,r5,r6,lsl#18
+ eor r4,r4,r6,lsr#14
+ ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
+
+ and r9,r12,r0,lsr#25
+ eor r5,r5,r7,lsl#21
+ eor r4,r4,r7,lsr#11
+ ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
+
+ tst r1,#1<<30
+ and r8,r12,r0,lsr#28
+ eor r5,r5,r6,lsl#24
+ eor r4,r4,r6,lsr#8
+ ldr r6,[sp,r8] @ tab[b >> 30 ]
+
+ eorne r5,r5,r0,lsl#30
+ eorne r4,r4,r0,lsr#2
+ tst r1,#1<<31
+ eor r5,r5,r7,lsl#27
+ eor r4,r4,r7,lsr#5
+ eorne r5,r5,r0,lsl#31
+ eorne r4,r4,r0,lsr#1
+ eor r5,r5,r6,lsl#30
+ eor r4,r4,r6,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
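+@ void bn_GF2m_mul_2x2(BN_ULONG *r,
+@                      BN_ULONG a1, BN_ULONG a0,
+@                      BN_ULONG b1, BN_ULONG b0);  @ r[3..0]=a1a0·b1b0
+@ On entry: r0=r, r1=a1, r2=a0, r3=b1; b0 is passed on the stack.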
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor d18,d18
+ vmov.32 d19,r3,r3 @ two copies of b1
+ vmov.32 d18[0],r1 @ a1
+
+ veor d20,d20
+ vld1.32 d21[],[sp,:32] @ two copies of b0
+ vmov.32 d20[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,d18
+ vmov d17,d19
+ bl mul_1x1_neon @ a1·b1
+ vmov d22,d0
+
+ vmov d16,d20
+ vmov d17,d21
+ bl mul_1x1_neon @ a0·b0
+ vmov d23,d0
+
+ veor d16,d20,d18
+ veor d17,d21,d19
+ veor d20,d23,d22
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,d20 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor d23,d1
+ veor d22,d0
+ vst1.32 {d23[0]},[r0,:32]!
+ vst1.32 {d23[1]},[r0,:32]!
+ vst1.32 {d22[0]},[r0,:32]!
+ vst1.32 {d22[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+ stmdb sp!,{r4-r10,lr}
+ mov r10,r0 @ reassign 1st argument
+ mov r0,r3 @ r0=b1
+ ldr r3,[sp,#32] @ load b0
+ mov r12,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str r5,[r10,#8]
+ str r4,[r10,#12]
+
+ eor r0,r0,r3 @ flip b0 and b1
+ eor r1,r1,r2 @ flip a0 and a1
+ eor r3,r3,r0
+ eor r2,r2,r1
+ eor r0,r0,r3
+ eor r1,r1,r2
+ bl mul_1x1_ialu @ a0·b0
+ str r5,[r10]
+ str r4,[r10,#4]
+
+ eor r1,r1,r2
+ eor r0,r0,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+ ldmia r10,{r6-r9}
+ eor r5,r5,r4
+ eor r4,r4,r7
+ eor r5,r5,r6
+ eor r4,r4,r8
+ eor r5,r5,r9
+ eor r4,r4,r9
+ str r4,[r10,#8]
+ eor r5,r5,r4
+ add sp,sp,#32 @ destroy tab[8]
+ str r5,[r10,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by "
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 0000000..22ad1f8
--- /dev/null
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,278 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... Except that it has two code paths: pure
+# integer code suitable for any ARMv4 and later CPU and NEON code
+# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
+# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
+# faster than compiler-generated code. For ECDH and ECDSA verify (but
+# not for ECDSA sign) it means 25%-45% improvement depending on key
+# length, more for longer keys. Even though NEON 1x1 multiplication
+# runs in even fewer cycles, ~30, the improvement is measurable only on
+# longer keys. One has to optimize code elsewhere to get NEON glow...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
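+# e.g. Dlo("q1") is "d2" and Dhi("q1") is "d3": each NEON quad register
+# qN aliases the doubleword pair d(2N) (low half) and d(2N+1) (high half).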
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are shifted copies of $a
+ vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
+ vshl.u64 `&Dlo("q2")`,d16,#16
+ vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
+ vshl.u64 `&Dlo("q3")`,d16,#24
+ vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
+ vshr.u64 `&Dlo("q1")`,#8
+ vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
+ vshl.u64 `&Dhi("q1")`,#24
+ veor d0,`&Dlo("q1")`
+ vshr.u64 `&Dlo("q2")`,#16
+ veor d0,`&Dhi("q1")`
+ vshl.u64 `&Dhi("q2")`,#16
+ veor d0,`&Dlo("q2")`
+ vshr.u64 `&Dlo("q3")`,#24
+ veor d0,`&Dhi("q2")`
+ vshl.u64 `&Dhi("q3")`,#8
+ veor d0,`&Dlo("q3")`
+ veor d0,`&Dhi("q3")`
+ bx lr
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
+
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov $a0,#0
+ bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
+ str $a0,[sp,#0] @ tab[0]=0
+ add $a2,$a1,$a1 @ a2=a1<<1
+ str $a1,[sp,#4] @ tab[1]=a1
+ eor $a12,$a1,$a2 @ a1^a2
+ str $a2,[sp,#8] @ tab[2]=a2
+ mov $a4,$a1,lsl#2 @ a4=a1<<2
+ str $a12,[sp,#12] @ tab[3]=a1^a2
+ eor $a14,$a1,$a4 @ a1^a4
+ str $a4,[sp,#16] @ tab[4]=a4
+ eor $a0,$a2,$a4 @ a2^a4
+ str $a14,[sp,#20] @ tab[5]=a1^a4
+ eor $a12,$a12,$a4 @ a1^a2^a4
+ str $a0,[sp,#24] @ tab[6]=a2^a4
+ and $i0,$mask,$b,lsl#2
+ str $a12,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and $i1,$mask,$b,lsr#1
+ ldr $lo,[sp,$i0] @ tab[b & 0x7]
+ and $i0,$mask,$b,lsr#4
+ ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
+ and $i1,$mask,$b,lsr#7
+ ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
+ eor $lo,$lo,$t1,lsl#3 @ stall
+ mov $hi,$t1,lsr#29
+ ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
+
+ and $i0,$mask,$b,lsr#10
+ eor $lo,$lo,$t0,lsl#6
+ eor $hi,$hi,$t0,lsr#26
+ ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
+
+ and $i1,$mask,$b,lsr#13
+ eor $lo,$lo,$t1,lsl#9
+ eor $hi,$hi,$t1,lsr#23
+ ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
+
+ and $i0,$mask,$b,lsr#16
+ eor $lo,$lo,$t0,lsl#12
+ eor $hi,$hi,$t0,lsr#20
+ ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
+
+ and $i1,$mask,$b,lsr#19
+ eor $lo,$lo,$t1,lsl#15
+ eor $hi,$hi,$t1,lsr#17
+ ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
+
+ and $i0,$mask,$b,lsr#22
+ eor $lo,$lo,$t0,lsl#18
+ eor $hi,$hi,$t0,lsr#14
+ ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
+
+ and $i1,$mask,$b,lsr#25
+ eor $lo,$lo,$t1,lsl#21
+ eor $hi,$hi,$t1,lsr#11
+ ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
+
+ tst $a,#1<<30
+ and $i0,$mask,$b,lsr#28
+ eor $lo,$lo,$t0,lsl#24
+ eor $hi,$hi,$t0,lsr#8
+ ldr $t0,[sp,$i0] @ tab[b >> 30 ]
+
+ eorne $lo,$lo,$b,lsl#30
+ eorne $hi,$hi,$b,lsr#2
+ tst $a,#1<<31
+ eor $lo,$lo,$t1,lsl#27
+ eor $hi,$hi,$t1,lsr#5
+ eorne $lo,$lo,$b,lsl#31
+ eorne $hi,$hi,$b,lsr#1
+ eor $lo,$lo,$t0,lsl#30
+ eor $hi,$hi,$t0,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void bn_GF2m_mul_2x2(BN_ULONG *r,
+# BN_ULONG a1,BN_ULONG a0,
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+
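+# The code below performs one level of Karatsuba over GF(2)[x]:
+#
+#	(a1·x^32 + a0)·(b1·x^32 + b0) =
+#	      a1·b1·x^64
+#	    + ((a0+a1)·(b0+b1) - a0·b0 - a1·b1)·x^32
+#	    + a0·b0
+#
+# Over GF(2), + and - are both XOR and carry-less, so only three 1x1
+# multiplications are needed.
+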
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+
+$code.=<<___;
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor $A1,$A1
+ vmov.32 $B1,r3,r3 @ two copies of b1
+ vmov.32 ${A1}[0],r1 @ a1
+
+ veor $A0,$A0
+ vld1.32 ${B0}[],[sp,:32] @ two copies of b0
+ vmov.32 ${A0}[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,$A1
+ vmov d17,$B1
+ bl mul_1x1_neon @ a1·b1
+ vmov $A1B1,d0
+
+ vmov d16,$A0
+ vmov d17,$B0
+ bl mul_1x1_neon @ a0·b0
+ vmov $A0B0,d0
+
+ veor d16,$A0,$A1
+ veor d17,$B0,$B1
+ veor $A0,$A0B0,$A1B1
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor $A0B0,d1
+ veor $A1B1,d0
+ vst1.32 {${A0B0}[0]},[r0,:32]!
+ vst1.32 {${A0B0}[1]},[r0,:32]!
+ vst1.32 {${A1B1}[0]},[r0,:32]!
+ vst1.32 {${A1B1}[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+___
+$ret="r10"; # reassigned 1st argument
+$code.=<<___;
+ stmdb sp!,{r4-r10,lr}
+ mov $ret,r0 @ reassign 1st argument
+ mov $b,r3 @ $b=b1
+ ldr r3,[sp,#32] @ load b0
+ mov $mask,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str $lo,[$ret,#8]
+ str $hi,[$ret,#12]
+
+ eor $b,$b,r3 @ flip b0 and b1
+ eor $a,$a,r2 @ flip a0 and a1
+ eor r3,r3,$b
+ eor r2,r2,$a
+ eor $b,$b,r3
+ eor $a,$a,r2
+ bl mul_1x1_ialu @ a0·b0
+ str $lo,[$ret]
+ str $hi,[$ret,#4]
+
+ eor $a,$a,r2
+ eor $b,$b,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+ ldmia $ret,{@r[0]-@r[3]}
+ eor $lo,$lo,$hi
+ eor $hi,$hi,@r[1]
+ eor $lo,$lo,@r[0]
+ eor $hi,$hi,@r[2]
+ eor $lo,$lo,@r[3]
+ eor $hi,$hi,@r[3]
+ str $hi,[$ret,#8]
+ eor $lo,$lo,$hi
+ add sp,sp,#32 @ destroy tab[8]
+ str $lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by "
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/crypto/bn/asm/armv4-mont.s b/crypto/bn/asm/armv4-mont.S
similarity index 100%
rename from crypto/bn/asm/armv4-mont.s
rename to crypto/bn/asm/armv4-mont.S
index 0488455..64c220b 100644
--- a/crypto/bn/asm/armv4-mont.s
+++ b/crypto/bn/asm/armv4-mont.S
@@ -38,9 +38,9 @@ bn_mul_mont:
.L1st:
ldr r5,[r1],#4 @ ap[j],ap++
mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[0]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adds r12,r12,r10
@@ -50,21 +50,21 @@ bn_mul_mont:
bne .L1st
adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
.Louter:
sub r7,r0,sp @ "original" r0-1 value
sub r1,r1,r7 @ "rewind" ap to &ap[1]
- sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r5,[r1,#-4] @ ap[0]
- ldr r6,[r3,#-4] @ np[0]
ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
ldr r7,[sp,#4] @ tp[1]
mov r11,#0
@@ -78,13 +78,13 @@ bn_mul_mont:
.Linner:
ldr r5,[r1],#4 @ ap[j],ap++
adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[i]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
- ldr r7,[r4,#8] @ tp[j+1]
adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
@@ -93,13 +93,13 @@ bn_mul_mont:
adds r12,r12,r11
mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adds r12,r12,r7
- adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
cmp r4,r7
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index 14e0d2d..f78a8b5 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -23,6 +23,9 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
@@ -89,9 +92,9 @@
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
@@ -101,21 +104,21 @@
bne .L1st
adds $nlo,$nlo,$ahi
+ ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
- sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $bi,[$tp,#4]! @ *(++bp)
+ sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
- ldr $nj,[$np,#-4] @ np[0]
ldr $alo,[sp] @ tp[0]
+ ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
@@ -129,13 +132,13 @@
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
- ldr $tj,[$tp,#8] @ tp[j+1]
adc $ahi,$ahi,#0
+ ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
@@ -144,13 +147,13 @@
adds $nlo,$nlo,$ahi
mov $nhi,#0
+ ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
- adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
ldr $tj,[$_bpend] @ restore &bp[num]
+ adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
diff --git a/crypto/bn/asm/bn-586.S b/crypto/bn/asm/bn-586.S
new file mode 100644
index 0000000..fe873ce
--- /dev/null
+++ b/crypto/bn/asm/bn-586.S
@@ -0,0 +1,1384 @@
+.file "crypto/bn/asm/bn-586.s"
+.text
+.globl bn_mul_add_words
+.type bn_mul_add_words,@function
+.align 16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 28(%esp),%ecx
+ movl 24(%esp),%ebx
+ andl $4294967288,%ecx
+ movl 32(%esp),%ebp
+ pushl %ecx
+ jz .L000maw_finish
+.align 16
+.L001maw_loop:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 28(%edi),%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ subl $8,%ecx
+ leal 32(%ebx),%ebx
+ leal 32(%edi),%edi
+ jnz .L001maw_loop
+.L000maw_finish:
+ movl 32(%esp),%ecx
+ andl $7,%ecx
+ jnz .L002maw_finish2
+ jmp .L003maw_end
+.L002maw_finish2:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L003maw_end:
+ movl %esi,%eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl bn_mul_words
+.type bn_mul_words,@function
+.align 16
+bn_mul_words:
+.L_bn_mul_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ebp
+ movl 32(%esp),%ecx
+ andl $4294967288,%ebp
+ jz .L004mw_finish
+.L005mw_loop:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ addl $32,%ebx
+ addl $32,%edi
+ subl $8,%ebp
+ jz .L004mw_finish
+ jmp .L005mw_loop
+.L004mw_finish:
+ movl 28(%esp),%ebp
+ andl $7,%ebp
+ jnz .L006mw_finish2
+ jmp .L007mw_end
+.L006mw_finish2:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L007mw_end:
+ movl %esi,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_words,.-.L_bn_mul_words_begin
+.globl bn_sqr_words
+.type bn_sqr_words,@function
+.align 16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%ebx
+ andl $4294967288,%ebx
+ jz .L008sw_finish
+.L009sw_loop:
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ movl %edx,12(%esi)
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ movl %edx,20(%esi)
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ movl %edx,28(%esi)
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ movl %edx,36(%esi)
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ movl %edx,44(%esi)
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+
+ movl 28(%edi),%eax
+ mull %eax
+ movl %eax,56(%esi)
+ movl %edx,60(%esi)
+
+ addl $32,%edi
+ addl $64,%esi
+ subl $8,%ebx
+ jnz .L009sw_loop
+.L008sw_finish:
+ movl 28(%esp),%ebx
+ andl $7,%ebx
+ jz .L010sw_end
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ decl %ebx
+ movl %edx,4(%esi)
+ jz .L010sw_end
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ decl %ebx
+ movl %edx,12(%esi)
+ jz .L010sw_end
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ decl %ebx
+ movl %edx,20(%esi)
+ jz .L010sw_end
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ decl %ebx
+ movl %edx,28(%esi)
+ jz .L010sw_end
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ decl %ebx
+ movl %edx,36(%esi)
+ jz .L010sw_end
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ decl %ebx
+ movl %edx,44(%esi)
+ jz .L010sw_end
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+.L010sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl bn_div_words
+.type bn_div_words,@function
+.align 16
+bn_div_words:
+.L_bn_div_words_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ movl 12(%esp),%ecx
+ divl %ecx
+ ret
+.size bn_div_words,.-.L_bn_div_words_begin
+.globl bn_add_words
+.type bn_add_words,@function
+.align 16
+bn_add_words:
+.L_bn_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L011aw_finish
+.L012aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L012aw_loop
+.L011aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L013aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L013aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L013aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L013aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L013aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L013aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L013aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L013aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_add_words,.-.L_bn_add_words_begin
+.globl bn_sub_words
+.type bn_sub_words,@function
+.align 16
+bn_sub_words:
+.L_bn_sub_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L014aw_finish
+.L015aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L015aw_loop
+.L014aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L016aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L016aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L016aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L016aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L016aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L016aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L016aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L016aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_words,.-.L_bn_sub_words_begin
+.globl bn_sub_part_words
+.type bn_sub_part_words,@function
+.align 16
+bn_sub_part_words:
+.L_bn_sub_part_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L017aw_finish
+.L018aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L018aw_loop
+.L017aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+.L019aw_end:
+ cmpl $0,36(%esp)
+ je .L020pw_end
+ movl 36(%esp),%ebp
+ cmpl $0,%ebp
+ je .L020pw_end
+ jge .L021pw_pos
+
+ movl $0,%edx
+ subl %ebp,%edx
+ movl %edx,%ebp
+ andl $4294967288,%ebp
+ jz .L022pw_neg_finish
+.L023pw_neg_loop:
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl $0,%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L023pw_neg_loop
+.L022pw_neg_finish:
+ movl 36(%esp),%edx
+ movl $0,%ebp
+ subl %edx,%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+ jmp .L020pw_end
+.L021pw_pos:
+ andl $4294967288,%ebp
+ jz .L024pw_pos_finish
+.L025pw_pos_loop:
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L026pw_nc0
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L027pw_nc1
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L028pw_nc2
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L029pw_nc3
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L030pw_nc4
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L031pw_nc5
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L032pw_nc6
+
+ movl 28(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,28(%ebx)
+ jnc .L033pw_nc7
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L025pw_pos_loop
+.L024pw_pos_finish:
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L034pw_tail_nc0
+ decl %ebp
+ jz .L020pw_end
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L035pw_tail_nc1
+ decl %ebp
+ jz .L020pw_end
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L036pw_tail_nc2
+ decl %ebp
+ jz .L020pw_end
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L037pw_tail_nc3
+ decl %ebp
+ jz .L020pw_end
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L038pw_tail_nc4
+ decl %ebp
+ jz .L020pw_end
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L039pw_tail_nc5
+ decl %ebp
+ jz .L020pw_end
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L040pw_tail_nc6
+ movl $1,%eax
+ jmp .L020pw_end
+.L041pw_nc_loop:
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L026pw_nc0:
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L027pw_nc1:
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L028pw_nc2:
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L029pw_nc3:
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L030pw_nc4:
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L031pw_nc5:
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L032pw_nc6:
+ movl 28(%esi),%ecx
+ movl %ecx,28(%ebx)
+.L033pw_nc7:
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L041pw_nc_loop
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L042pw_nc_end
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L034pw_tail_nc0:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L035pw_tail_nc1:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L036pw_tail_nc2:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L037pw_tail_nc3:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L038pw_tail_nc4:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L039pw_tail_nc5:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L040pw_tail_nc6:
+.L042pw_nc_end:
+ movl $0,%eax
+.L020pw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_part_words,.-.L_bn_sub_part_words_begin
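The `.size` directive above closes `bn_sub_part_words`, whose unrolled tail this hunk shows: one path computes `0 - borrow - b[i]` when the subtrahend is the longer input (the `movl $0,%ecx; subl %eax,%ecx; ...; subl %edx,%ecx` pattern), the other copies `a[i] - borrow` with `jnc` exits into the no-carry (`pw_nc*`) block once the borrow dies. As an aid for tracing the flags, here is a minimal C sketch of the arithmetic, assuming the usual OpenSSL semantics for this routine (r = a - b with cl common limbs and a length difference of dl); the names and the smoke test are illustrative, not the library's code:

```c
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BN_ULONG;              /* 32-bit limbs, as in the x86 code */

/* Sketch of the assumed semantics: subtract b from a over cl common limbs,
 * then run the borrow through the |dl| extra limbs of the longer input;
 * returns the final borrow. */
static BN_ULONG sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                               const BN_ULONG *b, int cl, int dl)
{
    BN_ULONG borrow = 0;
    int i = 0;

    for (; i < cl; i++) {               /* common part: a[i] - b[i] - borrow */
        BN_ULONG t = a[i] - borrow;
        BN_ULONG nb = (a[i] < borrow);
        r[i] = t - b[i];
        borrow = nb | (t < b[i]);
    }
    for (; dl < 0; dl++, i++) {         /* b is longer: 0 - borrow - b[i] */
        r[i] = 0 - borrow - b[i];
        borrow = (borrow | b[i]) != 0;  /* underflows unless both are zero */
    }
    for (; dl > 0; dl--, i++) {         /* a is longer: a[i] - borrow */
        r[i] = a[i] - borrow;
        borrow = (a[i] < borrow);
    }
    return borrow;
}

int main(void)                           /* tiny smoke test: 2^64 - 1 */
{
    BN_ULONG a[3] = { 0, 0, 1 }, b[2] = { 1, 0 }, r[3];
    BN_ULONG bw = sub_part_words(r, a, b, 2, 1);
    printf("%u %u %u borrow=%u\n", r[0], r[1], r[2], bw);
    /* prints 4294967295 4294967295 0 borrow=0 */
    return 0;
}
```

The `jnc` fast exits pay off because once the borrow clears in the `a[i] - borrow` tail it can never reappear, so everything that remains is a plain copy — which is exactly what the `.L041pw_nc_loop` block and the `pw_tail_nc*` labels do.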
diff --git a/crypto/bn/asm/bn-mips.s b/crypto/bn/asm/bn-mips.S
similarity index 99%
rename from crypto/bn/asm/bn-mips.s
rename to crypto/bn/asm/bn-mips.S
index d1535b1..229c709 100644
--- a/crypto/bn/asm/bn-mips.s
+++ b/crypto/bn/asm/bn-mips.S
@@ -394,14 +394,14 @@ bn_add_words_internal:
sltu $2,$14,$10
sw $14,-2*4($4)
addu $2,$24
-
+
addu $11,$15
sltu $25,$11,$15
addu $15,$11,$2
sltu $2,$15,$11
sw $15,-4($4)
addu $2,$25
-
+
.set noreorder
bgtzl $1,.L_bn_add_words_loop
lw $12,0($5)
@@ -567,7 +567,7 @@ bn_div_3_words:
# so that we can save two arguments
# and return address in registers
# instead of stack:-)
-
+
lw $4,($7)
move $10,$5
bne $4,$6,bn_div_3_words_internal
@@ -582,7 +582,7 @@ bn_div_3_words:
bn_div_3_words_internal:
.set reorder
move $11,$31
- bal bn_div_words
+ bal bn_div_words_internal
move $31,$11
multu $10,$2
lw $14,-2*4($7)
diff --git a/crypto/bn/asm/co-586.S b/crypto/bn/asm/co-586.S
new file mode 100644
index 0000000..3cb8073
--- /dev/null
+++ b/crypto/bn/asm/co-586.S
@@ -0,0 +1,1254 @@
+.file "crypto/bn/asm/co-586.s"
+.text
+.globl bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 16(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 24(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,28(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,36(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,44(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,52(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%eax)
+
+
+ movl %ebx,60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+
+
+ movl %ecx,28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 12(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl (%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%eax
+ adcl $0,%ebx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,28(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 20(%esi),%eax
+ adcl $0,%ecx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 8(%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 28(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,36(%edi)
+ movl 12(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 20(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 28(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,44(%edi)
+ movl 20(%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,52(%edi)
+
+
+ xorl %ecx,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%edi)
+
+ movl %ebx,60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+
+ movl %ecx,28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
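All of co-586.S is Comba ("column-wise") multiplication and squaring: each result limb is one column, the sum of every partial product a[i]*b[j] with i+j fixed, kept in a three-limb accumulator that the code rotates through %ebx/%ecx/%ebp so no intermediate carry ever touches memory (the squaring variants fold the symmetric a[i]*a[j] terms by doubling, hence their `addl %eax,%eax; adcl %edx,%edx` pairs). A compact C sketch of the 4-limb multiply, with illustrative names and a 64-bit accumulator plus overflow counter standing in for the register triple:

```c
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BN_ULONG;

/* Comba multiplication, the shape of bn_mul_comba4 above:
 * r[0..7] = a[0..3] * b[0..3], one column per result limb. */
static void mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4])
{
    uint64_t acc = 0;                   /* low two limbs of the accumulator */
    BN_ULONG c2 = 0;                    /* third limb: counts 64-bit overflows */

    for (int k = 0; k < 7; k++) {
        for (int i = 0; i < 4; i++) {
            int j = k - i;
            if (j < 0 || j > 3)
                continue;
            uint64_t p = (uint64_t)a[i] * b[j];
            acc += p;
            if (acc < p)                /* carry out of the 64-bit part */
                c2++;
        }
        r[k] = (BN_ULONG)acc;           /* emit the finished column */
        acc = (acc >> 32) | ((uint64_t)c2 << 32);
        c2 = 0;
    }
    r[7] = (BN_ULONG)acc;               /* top column, no carry left */
}

int main(void)                           /* (2^64-1)^2 = 2^128 - 2^65 + 1 */
{
    BN_ULONG a[4] = { 0xffffffffu, 0xffffffffu, 0, 0 };
    BN_ULONG r[8];
    mul_comba4(r, a, a);
    for (int k = 7; k >= 0; k--)
        printf("%08x%s", r[k], k ? " " : "\n");
    return 0;
}
```

The assembly replaces this double loop with a fully unrolled straight line, which is the point of the technique: no loop control at all, and each column's `adcl $0` collects the overflow into the register that becomes the next column's third limb.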
diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 0000000..e258658
--- /dev/null
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+# stalls after ldf8, xma and getf.sig outside inner loop and
+# improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+# once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+# acute interest, because upcoming Tukwila's individual cores are
+# reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+# sign verify sign/s verify/s
+# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
+# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
+# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
+# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
+# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
+# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
+# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
+# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
+# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
+# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
+# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
+# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
+# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
+#
+# As can be seen, RSA sign performance improves by 130%-30% (the gain
+# shrinking for longer keys), while verify improves by 74%-13%.
+# DSA performance improves by 115%-30%.
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+// const BN_ULONG *bp,const BN_ULONG *np,
+// const BN_ULONG *n0p,int num);
+.align 64
+.global bn_mul_mont#
+.proc bn_mul_mont#
+bn_mul_mont:
+ .prologue
+ .body
+{ .mmi; cmp4.le p6,p7=2,r37;;
+(p6) cmp4.lt.unc p8,p9=8,r37
+ mov ret0=r0 };;
+{ .bbb;
+(p9) br.cond.dptk.many bn_mul_mont_8
+(p8) br.cond.dpnt.many bn_mul_mont_general
+(p7) br.ret.spnt.many b0 };;
+.endp bn_mul_mont#
+
+prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
+
+rptr=r8; aptr=r9; bptr=r14; nptr=r15;
+tptr=r16; // &tp[0]
+tp_1=r17; // &tp[-1]
+num=r18; len=r19; lc=r20;
+topbit=r21; // carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align 64
+.local bn_mul_mont_general#
+.proc bn_mul_mont_general#
+bn_mul_mont_general:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ $ADDP aptr=0,in1
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; .vframe prevsp
+ mov prevsp=sp
+ $ADDP bptr=0,in2
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotf alo[6],nlo[4],ahi[8],nhi[6]
+ .rotr a[3],n[3],t[2]
+
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 alo[4]=[aptr],16 // ap[0]
+ $ADDP r30=8,in1 };;
+{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
+ ldf8 alo[2]=[aptr],16 // ap[2]
+ $ADDP in4=0,in4 };;
+{ .mmi; ldf8 alo[1]=[r30] // ap[3]
+ ldf8 n0=[in4] // n0
+ $ADDP rptr=0,in0 }
+{ .mmi; $ADDP nptr=0,in3
+ mov r31=16
+ zxt4 num=in5 };;
+{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
+ shladd len=num,3,r0
+ shladd r31=num,3,r31 };;
+{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
+ add lc=-5,num
+ sub r31=sp,r31 };;
+{ .mfb; and sp=-16,r31 // alloca
+ xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
+ nop.b 0 }
+{ .mfb; nop.m 0
+ xmpy.lu alo[4]=alo[4],bi
+ brp.loop.imp .L1st_ctop,.L1st_cend-16
+ };;
+{ .mfi; nop.m 0
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
+ add tp_1=8,sp }
+{ .mfi; nop.m 0
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20001f<<16
+ // ------^----- (p40) at first (p23)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; nop.m 0
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
+ mov ar.lc=lc }
+{ .mfi; nop.m 0
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+.align 32
+.L1st_ctop:
+.pred.rel "mutex",p40,p42
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23) }
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p23) st8 [tp_1]=n[2],8
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mmb; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ br.ctop.sptk .L1st_ctop };;
+.L1st_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ add num=-1,num };; // num--
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ sub aptr=aptr,len };; // rewind
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+ sub nptr=nptr,len };;
+{ .mmi; .pred.rel "mutex",p39,p41
+(p39) add topbit=r0,r0
+(p41) add topbit=r0,r0,1
+ nop.i 0 }
+{ .mmi; st8 [tp_1]=n[0]
+ add tptr=16,sp
+ add tp_1=8,sp };;
+
+.Louter:
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 ahi[3]=[tptr] // tp[0]
+ add r30=8,aptr };;
+{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
+ ldf8 alo[3]=[r30],16 // ap[1]
+ add r31=8,nptr };;
+{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
+ xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
+ brp.loop.imp .Linner_ctop,.Linner_cend-16
+ }
+{ .mfb; ldf8 alo[1]=[r30] // ap[3]
+ xma.lu alo[4]=alo[4],bi,ahi[3]
+ clrrrb.pr };;
+{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
+ nop.i 0 }
+{ .mfi; ldf8 nlo[1]=[r31] // np[1]
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20101f<<16
+ // ------^----- (p40) at first (p23)
+ // --------^--- (p30) at first (p22)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
+ mov ar.lc=lc }
+{ .mfi;
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of
+// 7 in the latter case accounts for a two-tick pipeline stall, which
+// means its performance would be ~20% lower than optimal. No attempt
+// was made to address this, because the original Itanium is hardly
+// represented out in the wild...
+.align 32
+.Linner_ctop:
+.pred.rel "mutex",p40,p42
+.pred.rel "mutex",p30,p32
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23)
+{ .mfi; (p16) nop.m 0
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p16) nop.f 0
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p21) ld8 t[0]=[tptr],8
+ (p16) nop.f 0
+ (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p30) add a[1]=a[1],t[1] } // (p22)
+{ .mfi; (p16) nop.m 0
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p32) add a[1]=a[1],t[1],1 };; // (p22)
+{ .mmi; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
+{ .mmb; (p23) st8 [tp_1]=n[2],8
+ (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
+ br.ctop.sptk .Linner_ctop };;
+.Linner_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ nop.i 0 };;
+
+{ .mmi; .pred.rel "mutex",p31,p33
+(p31) add a[0]=a[0],topbit
+(p33) add a[0]=a[0],topbit,1
+ mov topbit=r0 };;
+{ .mfi; .pred.rel "mutex",p31,p33
+(p31) cmp.ltu p32,p30=a[0],topbit
+(p33) cmp.leu p32,p30=a[0],topbit
+ }
+{ .mfi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ };;
+{ .mmi; .pred.rel "mutex",p44,p46
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+(p32) add topbit=r0,r0,1 }
+
+{ .mmi; st8 [tp_1]=n[0],8
+ cmp4.ne p6,p0=1,num
+ sub aptr=aptr,len };; // rewind
+{ .mmi; sub nptr=nptr,len
+(p41) add topbit=r0,r0,1
+ add tptr=16,sp }
+{ .mmb; add tp_1=8,sp
+ add num=-1,num // num--
+(p6) br.cond.sptk.many .Louter };;
+
+{ .mbb; add lc=4,lc
+ brp.loop.imp .Lsub_ctop,.Lsub_cend-16
+ clrrrb.pr };;
+{ .mii; nop.m 0
+ mov pr.rot=0x10001<<16
+ // ------^---- (p33) at first (p17)
+ mov ar.lc=lc }
+{ .mii; nop.m 0
+ mov ar.ec=3
+ nop.i 0 };;
+
+.Lsub_ctop:
+.pred.rel "mutex",p33,p35
+{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
+ (p16) nop.f 0
+ (p33) sub n[1]=t[1],n[1] } // (p17)
+{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
+ (p16) nop.f 0
+ (p35) sub n[1]=t[1],n[1],1 };; // (p17)
+{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
+ (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
+ (p18) nop.b 0 }
+{ .mib; (p18) nop.m 0
+ (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
+ br.ctop.sptk .Lsub_ctop };;
+.Lsub_cend:
+
+{ .mmb; .pred.rel "mutex",p34,p36
+(p34) sub topbit=topbit,r0 // (p19)
+(p36) sub topbit=topbit,r0,1
+ brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
+ }
+{ .mmb; sub rptr=rptr,len // rewind
+ sub tptr=tptr,len
+ clrrrb.pr };;
+{ .mmi; and aptr=tptr,topbit
+ andcm bptr=rptr,topbit
+ mov pr.rot=1<<16 };;
+{ .mii; or nptr=aptr,bptr
+ mov ar.lc=lc
+ mov ar.ec=3 };;
+
+.Lcopy_ctop:
+{ .mmb; (p16) ld8 n[0]=[nptr],8
+ (p18) st8 [tptr]=r0,8
+ (p16) nop.b 0 }
+{ .mmb; (p16) nop.m 0
+ (p18) st8 [rptr]=n[2],8
+ br.ctop.sptk .Lcopy_ctop };;
+.Lcopy_cend:
+
+{ .mmi; mov ret0=1 // signal "handled"
+ rum 1<<5 // clear um.mfh
+ mov ar.lc=prevlc }
+{ .mib; .restore sp
+ mov sp=prevsp
+ mov pr=prevpr,0x1ffff
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_general#
+
+a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
+n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
+t0=r15;
+
+ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.align 64
+.skip 48 // aligns loop body
+.local bn_mul_mont_8#
+.proc bn_mul_mont_8#
+bn_mul_mont_8:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ .vframe prevsp
+ mov prevsp=sp
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; add r17=-6*16,sp
+ add sp=-7*16,sp
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+{ .mmi; .save.gf 0,0x10
+ stf.spill [sp]=f16,-16
+ .save.gf 0,0x20
+ stf.spill [r17]=f17,32
+ add r16=-5*16,prevsp};;
+{ .mmi; .save.gf 0,0x40
+ stf.spill [r16]=f18,32
+ .save.gf 0,0x80
+ stf.spill [r17]=f19,32
+ $ADDP aptr=0,in1 };;
+{ .mmi; .save.gf 0,0x100
+ stf.spill [r16]=f20,32
+ .save.gf 0,0x200
+ stf.spill [r17]=f21,32
+ $ADDP r29=8,in1 };;
+{ .mmi; .save.gf 0,0x400
+ stf.spill [r16]=f22
+ .save.gf 0,0x800
+ stf.spill [r17]=f23
+ $ADDP rptr=0,in0 };;
+
+ .body
+ .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+ .rotr t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
+ ldf8 ai1=[r29],16 // ap[1]
+ $ADDP bptr=0,in2 }
+{ .mmi; $ADDP r30=8,in2
+ $ADDP nptr=0,in3
+ $ADDP r31=8,in3 };;
+{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
+ ldf8 bj[6]=[r30],16 // bp[1]
+ cmp4.le p4,p5=3,in5 }
+{ .mmi; ldf8 ni0=[nptr],16 // np[0]
+ ldf8 ni1=[r31],16 // np[1]
+ cmp4.le p6,p7=4,in5 };;
+
+{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
+ (p5)fcvt.fxu ai2=f0
+ cmp4.le p8,p9=5,in5 }
+{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
+ (p7)fcvt.fxu ai3=f0
+ cmp4.le p10,p11=6,in5 }
+{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
+ (p5)fcvt.fxu bj[5]=f0
+ cmp4.le p12,p13=7,in5 }
+{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
+ (p7)fcvt.fxu bj[4]=f0
+ cmp4.le p14,p15=8,in5 }
+{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
+ (p5)fcvt.fxu ni2=f0
+ addp4 r28=-1,in5 }
+{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
+ (p7)fcvt.fxu ni3=f0
+ $ADDP in4=0,in4 };;
+
+{ .mfi; ldf8 n0=[in4]
+ fcvt.fxu tf[1]=f0
+ nop.i 0 }
+
+{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
+ (p9)fcvt.fxu ai4=f0
+ mov t[0]=r0 }
+{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
+ (p11)fcvt.fxu ai5=f0
+ mov t[1]=r0 }
+{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
+ (p9)fcvt.fxu bj[3]=f0
+ mov t[2]=r0 }
+{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
+ (p11)fcvt.fxu bj[2]=f0
+ mov t[3]=r0 }
+{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
+ (p9)fcvt.fxu ni4=f0
+ mov t[4]=r0 }
+{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
+ (p11)fcvt.fxu ni5=f0
+ mov t[5]=r0 };;
+
+{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
+ (p13)fcvt.fxu ai6=f0
+ mov t[6]=r0 }
+{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
+ (p15)fcvt.fxu ai7=f0
+ mov t[7]=r0 }
+{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
+ (p13)fcvt.fxu bj[1]=f0
+ mov ar.lc=r28 }
+{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
+ (p15)fcvt.fxu bj[0]=f0
+ mov ar.ec=1 }
+{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
+ (p13)fcvt.fxu ni6=f0
+ mov pr.rot=1<<16 }
+{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
+ (p15)fcvt.fxu ni7=f0
+ brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
+ };;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
+// to measure it with the help of the Interval Time Counter indicated
+// that the factor is a tad higher: 33 or 34, if not 35. Measuring it
+// exactly and addressing the issue is problematic, because I don't
+// have access to a platform-specific instruction-level profiler. On
+// Itanium it should run in 56*n ticks, because of higher xma latency...
+.Louter_8_ctop:
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 0:
+ (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
+ (p40) add a3=a3,n3 } // (p17) a3+=n3
+{ .mfi; (p42) add a3=a3,n3,1
+ (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 4:
+ (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
+ (p41) add a4=a4,n4 } // (p17) a4+=n4
+{ .mfi; (p43) add a4=a4,n4,1
+ (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
+ (p16) nop.i 0 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p16) nop.m 0 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 8:
+ (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
+ (p40) add a5=a5,n5 } // (p17) a5+=n5
+{ .mfi; (p42) add a5=a5,n5,1
+ (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a1=alo[1] // 9:
+ (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mfi; (p16) nop.m 0 // 10:
+ (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
+ (p40) cmp.ltu p43,p41=a5,n5 }
+{ .mfi; (p42) cmp.leu p43,p41=a5,n5
+ (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
+ (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
+ (p41) add a6=a6,n6 } // (p17) a6+=n6
+{ .mfi; (p43) add a6=a6,n6,1
+ (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a2=alo[2] // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mfi; (p16) nop.m 0 // 14:
+ (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
+ (p41) cmp.ltu p42,p40=a6,n6 }
+{ .mfi; (p43) cmp.leu p42,p40=a6,n6
+ (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
+ (p16) nop.i 0 };;
+{ .mii; (p16) nop.m 0 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 16:
+ (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
+ (p40) add a7=a7,n7 } // (p17) a7+=n7
+{ .mfi; (p42) add a7=a7,n7,1
+ (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a3=alo[3] // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mfi; (p16) nop.m 0 // 18:
+ (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
+ (p40) cmp.ltu p43,p41=a7,n7 }
+{ .mfi; (p42) cmp.leu p43,p41=a7,n7
+ (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n1=nlo[1] // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 20:
+ (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
+ (p41) add a8=a8,n8 } // (p17) a8+=n8
+{ .mfi; (p43) add a8=a8,n8,1
+ (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a4=alo[4] // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 };;
+{ .mfi; (p16) nop.m 0 // 22:
+ (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
+ (p41) cmp.ltu p42,p40=a8,n8 }
+{ .mfi; (p43) cmp.leu p42,p40=a8,n8
+ (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n2=nlo[2] // 23:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 };;
+{ .mfi; (p16) nop.m 0 // 24:
+ (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
+ (p16) add a1=a1,n1 } // (p16) a1+=n1
+{ .mfi; (p16) nop.m 0
+ (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
+ (p17) mov t[0]=r0 };;
+{ .mii; (p16) getf.sig a5=alo[5] // 25:
+ (p16) add t0=t[7],a1 // (p16) t[7]+=a1
+ (p42) add t[0]=t[0],r0,1 };;
+{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
+ (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
+ (p50) add t[0]=t[0],r0,1 }
+{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
+ (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n3=nlo[3] // 27:
+ (p16) cmp.ltu.unc p50,p48=t0,a1
+ (p16) nop.i 0 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 28:
+ (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
+ (p40) add a2=a2,n2 } // (p16) a2+=n2
+{ .mfi; (p42) add a2=a2,n2,1
+ (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a6=alo[6] // 29:
+ (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
+ (p50) add t[6]=t[6],a2,1 };;
+{ .mfi; (p16) nop.m 0 // 30:
+ (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
+ (p40) cmp.ltu p41,p39=a2,n2 }
+{ .mfi; (p42) cmp.leu p41,p39=a2,n2
+ (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
+ (p16) nop.i 0 };;
+{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
+ (p16) nop.f 0
+ (p48) cmp.ltu p49,p47=t[6],a2 }
+{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
+ (p16) nop.f 0
+ br.ctop.sptk.many .Louter_8_ctop };;
+.Louter_8_cend:
+
+// the loop above has to execute one more time, without (p16), which is
+// replaced with a merged move of np[8] to the GPR bank
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mmi; (p0) getf.sig n1=ni0 // 0:
+ (p40) add a3=a3,n3 // (p17) a3+=n3
+ (p42) add a3=a3,n3,1 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mmi; (p0) getf.sig n2=ni1 // 4:
+ (p41) add a4=a4,n4 // (p17) a4+=n4
+ (p43) add a4=a4,n4,1 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p0) nop.f 0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p0) getf.sig n3=ni2 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) getf.sig n4=ni3 // 8:
+ (p40) add a5=a5,n5 // (p17) a5+=n5
+ (p42) add a5=a5,n5,1 };;
+{ .mii; (p0) nop.m 0 // 9:
+ (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mii; (p0) nop.m 0 // 10:
+ (p40) cmp.ltu p43,p41=a5,n5
+ (p42) cmp.leu p43,p41=a5,n5 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p17) getf.sig n8=nhi[8] // 12:
+ (p41) add a6=a6,n6 // (p17) a6+=n6
+ (p43) add a6=a6,n6,1 };;
+{ .mii; (p0) getf.sig n5=ni4 // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mii; (p0) nop.m 0 // 14:
+ (p41) cmp.ltu p42,p40=a6,n6
+ (p43) cmp.leu p42,p40=a6,n6 };;
+{ .mii; (p0) getf.sig n6=ni5 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) nop.m 0 // 16:
+ (p40) add a7=a7,n7 // (p17) a7+=n7
+ (p42) add a7=a7,n7,1 };;
+{ .mii; (p0) nop.m 0 // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mii; (p0) nop.m 0 // 18:
+ (p40) cmp.ltu p43,p41=a7,n7
+ (p42) cmp.leu p43,p41=a7,n7 };;
+{ .mii; (p0) getf.sig n7=ni6 // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p0) nop.m 0 // 20:
+ (p41) add a8=a8,n8 // (p17) a8+=n8
+ (p43) add a8=a8,n8,1 };;
+{ .mmi; (p0) nop.m 0 // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 }
+{ .mmi; (p17) mov t[0]=r0
+ (p41) cmp.ltu p42,p40=a8,n8
+ (p43) cmp.leu p42,p40=a8,n8 };;
+{ .mmi; (p0) getf.sig n8=ni7 // 22:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 }
+{ .mmi; (p42) add t[0]=t[0],r0,1
+ (p0) add r16=-7*16,prevsp
+ (p0) add r17=-6*16,prevsp };;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
+{ .mmi; (p50)add t[0]=t[0],r0,1
+ add r18=-5*16,prevsp
+ sub n1=t0,n1 };;
+{ .mmi; cmp.gtu p34,p32=n1,t0;;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n2=t[7],n2
+ (p34)sub n2=t[7],n2,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
+ (p34)cmp.geu p35,p33=n2,t[7];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n3=t[6],n3 }
+{ .mmi; (p35)sub n3=t[6],n3,1;;
+ (p33)cmp.gtu p34,p32=n3,t[6]
+ (p35)cmp.geu p34,p32=n3,t[6] };;
+ .pred.rel "mutex",p32,p34
+{ .mii; (p32)sub n4=t[5],n4
+ (p34)sub n4=t[5],n4,1;;
+ (p32)cmp.gtu p35,p33=n4,t[5] }
+{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n5=t[4],n5
+ (p35)sub n5=t[4],n5,1 };;
+{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
+ (p35)cmp.geu p34,p32=n5,t[4];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n6=t[3],n6 }
+{ .mmi; (p34)sub n6=t[3],n6,1;;
+ (p32)cmp.gtu p35,p33=n6,t[3]
+ (p34)cmp.geu p35,p33=n6,t[3] };;
+ .pred.rel "mutex",p33,p35
+{ .mii; (p33)sub n7=t[2],n7
+ (p35)sub n7=t[2],n7,1;;
+ (p33)cmp.gtu p34,p32=n7,t[2] }
+{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n8=t[1],n8
+ (p34)sub n8=t[1],n8,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
+ (p34)cmp.geu p35,p33=n8,t[1];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub a8=t[0],r0 }
+{ .mmi; (p35)sub a8=t[0],r0,1;;
+ (p33)cmp.gtu p34,p32=a8,t[0]
+ (p35)cmp.geu p34,p32=a8,t[0] };;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+ .pred.rel "mutex",p32,p34
+{ .mmi; (p32)st8 [rptr]=n1,8
+ (p34)st8 [rptr]=t0,8
+ add r19=-4*16,prevsp};;
+{ .mmb; (p32)st8 [rptr]=n2,8
+ (p34)st8 [rptr]=t[7],8
+ (p5)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n3,8
+ (p34)st8 [rptr]=t[6],8
+ (p7)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n4,8
+ (p34)st8 [rptr]=t[5],8
+ (p9)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n5,8
+ (p34)st8 [rptr]=t[4],8
+ (p11)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n6,8
+ (p34)st8 [rptr]=t[3],8
+ (p13)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n7,8
+ (p34)st8 [rptr]=t[2],8
+ (p15)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n8,8
+ (p34)st8 [rptr]=t[1],8
+ nop.b 0 };;
+.Ldone: // epilogue
+{ .mmi; ldf.fill f16=[r16],64
+ ldf.fill f17=[r17],64
+ nop.i 0 }
+{ .mmi; ldf.fill f18=[r18],64
+ ldf.fill f19=[r19],64
+ mov pr=prevpr,0x1ffff };;
+{ .mmi; ldf.fill f20=[r16]
+ ldf.fill f21=[r17]
+ mov ar.lc=prevlc }
+{ .mmi; ldf.fill f22=[r18]
+ ldf.fill f23=[r19]
+ mov ret0=1 } // signal "handled"
+{ .mib; rum 1<<5
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_8#
+
+.type copyright#,\@object
+copyright:
+stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by "
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
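Behind all the rotation and predication, ia64-mont.pl computes exactly what the prototype quoted in the file says: rp = ap * bp * R^-1 mod np with R = 2^(64*num), where n0p points at -np[0]^-1 mod 2^64. As a reference for what the pipelined loops achieve, here is a word-serial C sketch — an assumed textbook CIOS-style formulation, not the module's IA-64 schedule; `unsigned __int128` is a GCC/Clang extension standing in for `xma`'s 64x64->128 multiply:

```c
#include <stdint.h>
#include <string.h>

typedef uint64_t BN_ULONG;              /* IA-64 limbs are 64 bits wide */
typedef unsigned __int128 u128;         /* assumption: GCC/Clang extension */

/* Word-serial sketch of rp = ap*bp*R^-1 mod np, R = 2^(64*num),
 * n0 = -np[0]^-1 mod 2^64. Requires num >= 1; returns 1 like the
 * "signal handled" paths above. Illustration only: the variable-length
 * array and data-dependent final selection are fine here, not in
 * production crypto. */
static int mul_mont_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                           const BN_ULONG *bp, const BN_ULONG *np,
                           const BN_ULONG *n0p, int num)
{
    BN_ULONG tp[num + 2];               /* tp[num] plays the "topbit" role */
    memset(tp, 0, sizeof(tp));

    for (int i = 0; i < num; i++) {
        u128 c = 0;
        for (int j = 0; j < num; j++) { /* tp += ap * bp[i] */
            c += (u128)ap[j] * bp[i] + tp[j];
            tp[j] = (BN_ULONG)c;
            c >>= 64;
        }
        c += tp[num];
        tp[num] = (BN_ULONG)c;
        tp[num + 1] = (BN_ULONG)(c >> 64);

        BN_ULONG m = tp[0] * n0p[0];    /* m = tp[0]*n0 mod 2^64 */
        c = ((u128)m * np[0] + tp[0]) >> 64;  /* limb 0 cancels by choice of m */
        for (int j = 1; j < num; j++) { /* tp = (tp + m*np) / 2^64 */
            c += (u128)m * np[j] + tp[j];
            tp[j - 1] = (BN_ULONG)c;
            c >>= 64;
        }
        c += tp[num];
        tp[num - 1] = (BN_ULONG)c;
        tp[num] = tp[num + 1] + (BN_ULONG)(c >> 64);
        tp[num + 1] = 0;
    }

    BN_ULONG borrow = 0;                /* rp = tp - np, tracking the borrow */
    for (int j = 0; j < num; j++) {
        u128 d = (u128)tp[j] - np[j] - borrow;
        rp[j] = (BN_ULONG)d;
        borrow = (BN_ULONG)(d >> 64) & 1;
    }
    /* keep tp - np only if tp >= np: the carry word is set, or no borrow
     * came out of the subtraction */
    if (!(tp[num] != 0 || borrow == 0))
        memcpy(rp, tp, num * sizeof(BN_ULONG));
    return 1;
}
```

Computing n0 and moving operands in and out of the Montgomery domain is the caller's job — in OpenSSL that state lives in a BN_MONT_CTX — and a real implementation would also make the final conditional subtraction constant-time, as the `and`/`andcm` copy loop at the end of bn_mul_mont_general does.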
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 951abc5..c0cee82 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -568,7 +568,7 @@ bn_sqr_comba8:
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/crypto/bn/asm/mips-mont.s b/crypto/bn/asm/mips-mont.S
similarity index 99%
rename from crypto/bn/asm/mips-mont.s
rename to crypto/bn/asm/mips-mont.S
index 867de6f..32ecee5 100644
--- a/crypto/bn/asm/mips-mont.s
+++ b/crypto/bn/asm/mips-mont.S
@@ -7,6 +7,8 @@
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
+ lw $8,16($29)
+ lw $9,20($29)
slt $1,$9,4
bnez $1,1f
li $2,0
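The two added `lw` instructions are the substance of this hunk: the MIPS o32 calling convention passes only the first four arguments in registers ($4-$7), so bn_mul_mont's fifth and sixth arguments arrive on the stack. A comment sketch of the mapping, assuming the same prototype the other bn_mul_mont implementations in this patch use:

```c
/* Assumed prototype, as quoted for the IA-64 bn_mul_mont above:
 *   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 *                   const BN_ULONG *np, const BN_ULONG *n0p, int num);
 *
 * MIPS o32 argument placement on entry:
 *   rp  -> $4 ($a0)    ap  -> $5 ($a1)    bp -> $6 ($a2)    np -> $7 ($a3)
 *   n0p -> 16($29)     num -> 20($29)           ($29 is the stack pointer)
 *
 * Hence the added loads: "lw $8,16($29)" and "lw $9,20($29)" pull n0p and
 * num into $8/$9 ($t0/$t1) before the "slt $1,$9,4" size check that
 * rejects num < 4. */
```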
diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
index f04b3b9..38b5164 100644
--- a/crypto/bn/asm/mips.pl
+++ b/crypto/bn/asm/mips.pl
@@ -583,14 +583,14 @@
sltu $v0,$t2,$ta2
$ST $t2,-2*$BNSZ($a0)
$ADDU $v0,$t8
-
+
$ADDU $ta3,$t3
sltu $t9,$ta3,$t3
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
$ADDU $v0,$t9
-
+
.set noreorder
bgtzl $at,.L_bn_add_words_loop
$LD $t0,0($a1)
@@ -790,7 +790,7 @@
# so that we can save two arguments
# and return address in registers
# instead of stack:-)
-
+
$LD $a0,($a3)
move $ta2,$a1
bne $a0,$a2,bn_div_3_words_internal
@@ -819,7 +819,7 @@
$code.=<<___;
.set reorder
move $ta3,$ra
- bal bn_div_words
+ bal bn_div_words_internal
move $ra,$ta3
$MULTU $ta2,$v0
$LD $t2,-2*$BNSZ($a3)
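In both MIPS files the substantive change is the same one-liner: bn_div_3_words reaches its division helper via `bal`, and the target moves from the public `bn_div_words` to the local `bn_div_words_internal`. The retained comment ("save two arguments and return address in registers instead of stack") describes the register-based protocol the internal entry points share; branching to the public symbol would go through a prologue written for the standard calling convention, so the internal label is presumably the intended target. The remaining `-`/`+` pairs in these hunks appear to differ only in trailing whitespace, which the re-import strips.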
diff --git a/crypto/bn/asm/modexp512-x86_64.S b/crypto/bn/asm/modexp512-x86_64.S
new file mode 100644
index 0000000..6cccafb
--- /dev/null
+++ b/crypto/bn/asm/modexp512-x86_64.S
@@ -0,0 +1,1773 @@
+.text
+
+.type MULADD_128x512,@function
+.align 16
+MULADD_128x512:
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ .byte 0xf3,0xc3
+.size MULADD_128x512,.-MULADD_128x512
+.type mont_reduce,@function
+.align 16
+mont_reduce:
+ leaq 192(%rsp),%rdi
+ movq 32(%rsp),%rsi
+ addq $576,%rsi
+ leaq 520(%rsp),%rcx
+
+ movq 96(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ movq (%rcx),%r8
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ movq 8(%rcx),%r9
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ movq 16(%rcx),%r10
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ movq 24(%rcx),%r11
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ movq 32(%rcx),%r12
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ movq 40(%rcx),%r13
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ movq 48(%rcx),%r14
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ movq 56(%rcx),%r15
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 104(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 112(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,16(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 120(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,24(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ xorq %rax,%rax
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+ adcq 80(%rcx),%r10
+ adcq 88(%rcx),%r11
+ adcq $0,%rax
+
+
+
+
+ movq %r8,64(%rdi)
+ movq %r9,72(%rdi)
+ movq %r10,%rbp
+ movq %r11,88(%rdi)
+
+ movq %rax,384(%rsp)
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+
+
+
+
+
+
+
+
+ addq $80,%rdi
+
+ addq $64,%rsi
+ leaq 296(%rsp),%rcx
+
+ call MULADD_128x512
+
+ movq 384(%rsp),%rax
+
+
+ addq -16(%rdi),%r8
+ adcq -8(%rdi),%r9
+ movq %r8,64(%rcx)
+ movq %r9,72(%rcx)
+
+ adcq %rax,%rax
+ movq %rax,384(%rsp)
+
+ leaq 192(%rsp),%rdi
+ addq $64,%rsi
+
+
+
+
+
+ movq (%rsi),%r8
+ movq 8(%rsi),%rbx
+
+ movq (%rcx),%rax
+ mulq %r8
+ movq %rax,%rbp
+ movq %rdx,%r9
+
+ movq 8(%rcx),%rax
+ mulq %r8
+ addq %rax,%r9
+
+ movq (%rcx),%rax
+ mulq %rbx
+ addq %rax,%r9
+
+ movq %r9,8(%rdi)
+
+
+ subq $192,%rsi
+
+ movq (%rcx),%r8
+ movq 8(%rcx),%r9
+
+ call MULADD_128x512
+
+
+
+
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdi
+ movq 24(%rsi),%rdx
+
+
+ movq 384(%rsp),%rbp
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+
+
+ adcq %rbp,%rbp
+
+
+
+ shlq $3,%rbp
+ movq 32(%rsp),%rcx
+ addq %rcx,%rbp
+
+
+ xorq %rsi,%rsi
+
+ addq 0(%rbp),%r10
+ adcq 64(%rbp),%r11
+ adcq 128(%rbp),%r12
+ adcq 192(%rbp),%r13
+ adcq 256(%rbp),%r14
+ adcq 320(%rbp),%r15
+ adcq 384(%rbp),%r8
+ adcq 448(%rbp),%r9
+
+
+
+ sbbq $0,%rsi
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+ movq $1,%rbp
+ subq %rax,%r10
+ sbbq %rbx,%r11
+ sbbq %rdi,%r12
+ sbbq %rdx,%r13
+
+
+
+
+ sbbq $0,%rbp
+
+
+
+ addq $512,%rcx
+ movq 32(%rcx),%rax
+ movq 40(%rcx),%rbx
+ movq 48(%rcx),%rdi
+ movq 56(%rcx),%rdx
+
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+
+
+ subq $1,%rbp
+
+ sbbq %rax,%r14
+ sbbq %rbx,%r15
+ sbbq %rdi,%r8
+ sbbq %rdx,%r9
+
+
+
+ movq 144(%rsp),%rsi
+ movq %r10,0(%rsi)
+ movq %r11,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq %r8,48(%rsi)
+ movq %r9,56(%rsi)
+
+ .byte 0xf3,0xc3
+.size mont_reduce,.-mont_reduce
+.type mont_mul_a3b,@function
+.align 16
+mont_mul_a3b:
+
+
+
+
+ movq 0(%rdi),%rbp
+
+ movq %r10,%rax
+ mulq %rbp
+ movq %rax,520(%rsp)
+ movq %rdx,%r10
+ movq %r11,%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r12,%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r13,%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r14,%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r15,%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r8,%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,528(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 16(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,536(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq 24(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,544(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq 32(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,552(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq 40(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %r14,560(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq 48(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,568(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq 56(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,576(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,584(%rsp)
+ movq %r10,592(%rsp)
+ movq %r11,600(%rsp)
+ movq %r12,608(%rsp)
+ movq %r13,616(%rsp)
+ movq %r14,624(%rsp)
+ movq %r15,632(%rsp)
+ movq %r8,640(%rsp)
+
+
+
+
+
+ jmp mont_reduce
+
+
+.size mont_mul_a3b,.-mont_mul_a3b
+.type sqr_reduce,@function
+.align 16
+sqr_reduce:
+ movq 16(%rsp),%rcx
+
+
+
+ movq %r10,%rbx
+
+ movq %r11,%rax
+ mulq %rbx
+ movq %rax,528(%rsp)
+ movq %rdx,%r10
+ movq %r12,%rax
+ mulq %rbx
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r13,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r14,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r15,%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%rsi
+
+ movq %r10,536(%rsp)
+
+
+
+
+
+ movq 8(%rcx),%rbx
+
+ movq 16(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,544(%rsp)
+
+ movq %rdx,%r10
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,552(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 16(%rcx),%rbx
+
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,560(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+ movq %r14,568(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r12
+
+
+
+
+
+ movq 24(%rcx),%rbx
+
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,576(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+ movq %rsi,584(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+
+ movq %rdx,%r15
+
+
+
+
+ movq 32(%rcx),%rbx
+
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,592(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,600(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 40(%rcx),%rbx
+
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,608(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+ movq %r11,616(%rsp)
+
+ movq %rdx,%r12
+
+
+
+
+ movq %r8,%rbx
+
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,624(%rsp)
+
+ movq %rdx,632(%rsp)
+
+
+ movq 528(%rsp),%r10
+ movq 536(%rsp),%r11
+ movq 544(%rsp),%r12
+ movq 552(%rsp),%r13
+ movq 560(%rsp),%r14
+ movq 568(%rsp),%r15
+
+ movq 24(%rcx),%rax
+ mulq %rax
+ movq %rax,%rdi
+ movq %rdx,%r8
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq $0,%r8
+
+ movq 0(%rcx),%rax
+ mulq %rax
+ movq %rax,520(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+ movq %r10,528(%rsp)
+ movq %r11,536(%rsp)
+
+ movq 16(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+
+ movq %r12,544(%rsp)
+ movq %r13,552(%rsp)
+
+ xorq %rbp,%rbp
+ addq %rbx,%r14
+ adcq %rdi,%r15
+ adcq $0,%rbp
+
+ movq %r14,560(%rsp)
+ movq %r15,568(%rsp)
+
+
+
+
+ movq 576(%rsp),%r10
+ movq 584(%rsp),%r11
+ movq 592(%rsp),%r12
+ movq 600(%rsp),%r13
+ movq 608(%rsp),%r14
+ movq 616(%rsp),%r15
+ movq 624(%rsp),%rdi
+ movq 632(%rsp),%rsi
+
+ movq %r9,%rax
+ mulq %rax
+ movq %rax,%r9
+ movq %rdx,%rbx
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq %rdi,%rdi
+ adcq %rsi,%rsi
+ adcq $0,%rbx
+
+ addq %rbp,%r10
+
+ movq 32(%rcx),%rax
+ mulq %rax
+
+ addq %r8,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r10,576(%rsp)
+ movq %r11,584(%rsp)
+
+ movq 40(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r12,592(%rsp)
+ movq %r13,600(%rsp)
+
+ movq 48(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r14
+ adcq %rax,%r15
+ adcq $0,%rdx
+
+ movq %r14,608(%rsp)
+ movq %r15,616(%rsp)
+
+ addq %rdx,%rdi
+ adcq %r9,%rsi
+ adcq $0,%rbx
+
+ movq %rdi,624(%rsp)
+ movq %rsi,632(%rsp)
+ movq %rbx,640(%rsp)
+
+ jmp mont_reduce
+
+
+.size sqr_reduce,.-sqr_reduce
+.globl mod_exp_512
+.type mod_exp_512,@function
+mod_exp_512:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r8
+ subq $2688,%rsp
+ andq $-64,%rsp
+
+
+ movq %r8,0(%rsp)
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rcx,24(%rsp)
+.Lbody:
+
+
+
+ pxor %xmm4,%xmm4
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqa %xmm4,512(%rsp)
+ movdqa %xmm4,528(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,544(%rsp)
+ movdqa %xmm1,560(%rsp)
+ movdqa %xmm2,576(%rsp)
+ movdqa %xmm3,592(%rsp)
+
+
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+
+ leaq 384(%rsp),%rbx
+ movq %rbx,136(%rsp)
+ call mont_reduce
+
+
+ leaq 448(%rsp),%rcx
+ xorq %rax,%rax
+ movq %rax,0(%rcx)
+ movq %rax,8(%rcx)
+ movq %rax,24(%rcx)
+ movq %rax,32(%rcx)
+ movq %rax,40(%rcx)
+ movq %rax,48(%rcx)
+ movq %rax,56(%rcx)
+ movq %rax,128(%rsp)
+ movq $1,16(%rcx)
+
+ leaq 640(%rsp),%rbp
+ movq %rcx,%rsi
+ movq %rbp,%rdi
+ movq $8,%rax
+loop_0:
+ movq (%rcx),%rbx
+ movw %bx,(%rdi)
+ shrq $16,%rbx
+ movw %bx,64(%rdi)
+ shrq $16,%rbx
+ movw %bx,128(%rdi)
+ shrq $16,%rbx
+ movw %bx,192(%rdi)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rdi),%rdi
+ decq %rax
+ jnz loop_0
+ movq $31,%rax
+ movq %rax,32(%rsp)
+ movq %rbp,40(%rsp)
+
+ movq %rsi,136(%rsp)
+ movq 0(%rsi),%r10
+ movq 8(%rsi),%r11
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+ movq 48(%rsi),%r8
+ movq 56(%rsi),%r9
+init_loop:
+ leaq 384(%rsp),%rdi
+ call mont_mul_a3b
+ leaq 448(%rsp),%rsi
+ movq 40(%rsp),%rbp
+ addq $2,%rbp
+ movq %rbp,40(%rsp)
+ movq %rsi,%rcx
+ movq $8,%rax
+loop_1:
+ movq (%rcx),%rbx
+ movw %bx,(%rbp)
+ shrq $16,%rbx
+ movw %bx,64(%rbp)
+ shrq $16,%rbx
+ movw %bx,128(%rbp)
+ shrq $16,%rbx
+ movw %bx,192(%rbp)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rbp),%rbp
+ decq %rax
+ jnz loop_1
+ movq 32(%rsp),%rax
+ subq $1,%rax
+ movq %rax,32(%rsp)
+ jne init_loop
+
+
+
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm1,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm3,112(%rsp)
+
+
+
+
+
+ movl 126(%rsp),%eax
+ movq %rax,%rdx
+ shrq $11,%rax
+ andl $2047,%edx
+ movl %edx,126(%rsp)
+ leaq 640(%rsp,%rax,2),%rsi
+ movq 8(%rsp),%rdx
+ movq $4,%rbp
+loop_2:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_2
+ movq $505,48(%rsp)
+
+ movq 8(%rsp),%rcx
+ movq %rcx,136(%rsp)
+ movq 0(%rcx),%r10
+ movq 8(%rcx),%r11
+ movq 16(%rcx),%r12
+ movq 24(%rcx),%r13
+ movq 32(%rcx),%r14
+ movq 40(%rcx),%r15
+ movq 48(%rcx),%r8
+ movq 56(%rcx),%r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+
+
+ movq 48(%rsp),%rcx
+ movq %rcx,%rax
+ shrq $4,%rax
+ movl 64(%rsp,%rax,2),%edx
+ andq $15,%rcx
+ shrq %cl,%rdx
+ andq $31,%rdx
+
+ leaq 640(%rsp,%rdx,2),%rsi
+ leaq 448(%rsp),%rdx
+ movq %rdx,%rdi
+ movq $4,%rbp
+loop_3:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_3
+ movq 8(%rsp),%rsi
+ call mont_mul_a3b
+
+
+
+ movq 48(%rsp),%rcx
+ subq $5,%rcx
+ movq %rcx,48(%rsp)
+ jge main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+ movq 8(%rsp),%rdx
+ pxor %xmm4,%xmm4
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+ movdqa %xmm4,576(%rsp)
+ movdqa %xmm4,592(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,512(%rsp)
+ movdqa %xmm1,528(%rsp)
+ movdqa %xmm2,544(%rsp)
+ movdqa %xmm3,560(%rsp)
+ call mont_reduce
+
+
+
+ movq 8(%rsp),%rax
+ movq 0(%rax),%r8
+ movq 8(%rax),%r9
+ movq 16(%rax),%r10
+ movq 24(%rax),%r11
+ movq 32(%rax),%r12
+ movq 40(%rax),%r13
+ movq 48(%rax),%r14
+ movq 56(%rax),%r15
+
+
+ movq 24(%rsp),%rbx
+ addq $512,%rbx
+
+ subq 0(%rbx),%r8
+ sbbq 8(%rbx),%r9
+ sbbq 16(%rbx),%r10
+ sbbq 24(%rbx),%r11
+ sbbq 32(%rbx),%r12
+ sbbq 40(%rbx),%r13
+ sbbq 48(%rbx),%r14
+ sbbq 56(%rbx),%r15
+
+
+ movq 0(%rax),%rsi
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rcx
+ movq 24(%rax),%rdx
+ cmovncq %r8,%rsi
+ cmovncq %r9,%rdi
+ cmovncq %r10,%rcx
+ cmovncq %r11,%rdx
+ movq %rsi,0(%rax)
+ movq %rdi,8(%rax)
+ movq %rcx,16(%rax)
+ movq %rdx,24(%rax)
+
+ movq 32(%rax),%rsi
+ movq 40(%rax),%rdi
+ movq 48(%rax),%rcx
+ movq 56(%rax),%rdx
+ cmovncq %r12,%rsi
+ cmovncq %r13,%rdi
+ cmovncq %r14,%rcx
+ cmovncq %r15,%rdx
+ movq %rsi,32(%rax)
+ movq %rdi,40(%rax)
+ movq %rcx,48(%rax)
+ movq %rdx,56(%rax)
+
+ movq 0(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbx
+ movq 40(%rsi),%rbp
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size mod_exp_512, . - mod_exp_512
diff --git a/crypto/bn/asm/modexp512-x86_64.pl b/crypto/bn/asm/modexp512-x86_64.pl
new file mode 100644
index 0000000..bfd6e97
--- /dev/null
+++ b/crypto/bn/asm/modexp512-x86_64.pl
@@ -0,0 +1,1497 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2010-2011 Intel Corp.
+# Author: Vinodh.Gopal@intel.com
+# Jim Guilford
+# Erdinc.Ozturk@intel.com
+# Maxim.Perminov@intel.com
+#
+# More information about the algorithm used can be found at:
+# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
+#
+# ====================================================================
+# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. All advertising materials mentioning features or use of this
+# software must display the following acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+#
+# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+# endorse or promote products derived from this software without
+# prior written permission. For written permission, please contact
+# licensing@OpenSSL.org.
+#
+# 5. Products derived from this software may not be called "OpenSSL"
+# nor may "OpenSSL" appear in their names without prior written
+# permission of the OpenSSL Project.
+#
+# 6. Redistributions of any form whatsoever must retain the following
+# acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+#
+# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ====================================================================
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+use strict;
+my $code=".text\n\n";
+my $m=0;
+
+#
+# Define x512 macros
+#
+
+#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
+#
+# uses rax, rdx, and args
+sub MULSTEP_512_ADD
+{
+ my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov ($ASRC), $X[0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ mov (+8*$i)($ASRC), $X[$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
+
+#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
+#
+# uses rax, rdx, and args
+sub MULSTEP_512
+{
+ my ($x, $DST, $SRC2, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
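+
+# A value-level C model of MULSTEP_512 may help when reading the rows
+# below (hypothetical reference code, not part of the build; assumes
+# stdint.h and the GCC/Clang unsigned __int128 extension):
+#
+#   /* t = x + s2*op (576 bits); limb 0 of t retires to *dst and
+#      x[] becomes t >> 64 (the caller's register rotation included) */
+#   void mulstep_512(uint64_t *dst, const uint64_t s2[8],
+#                    uint64_t op, uint64_t x[8])
+#   {
+#       uint64_t c = 0;
+#       for (int i = 0; i < 8; i++) {
+#           unsigned __int128 t = (unsigned __int128)s2[i]*op + x[i] + c;
+#           if (i == 0) *dst = (uint64_t)t;
+#           else        x[i-1] = (uint64_t)t;
+#           c = (uint64_t)(t >> 64);
+#       }
+#       x[7] = c;               /* final carry limb */
+#   }
+#
+# MULSTEP_512_ADD is the same row with asrc[] in place of x[] as the
+# 512-bit addend.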
+
+#
+# Swizzle Macros
+#
+
+# macro to copy data from flat space to swizzled table
+#MACRO swizzle pDst, pSrc, tmp1, tmp2
+# pDst and pSrc are modified
+sub swizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0)=@_;
+$code.=<<___;
+ mov \$8, $cnt
+loop_$m:
+ mov ($pSrc), $d0
+ mov $d0#w, ($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*1)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*2)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*3)($pDst)
+ lea 8($pSrc), $pSrc
+ lea 64*4($pDst), $pDst
+ dec $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
+
+# macro to copy data from swizzled table to flat space
+#MACRO unswizzle pDst, pSrc, tmp*3
+sub unswizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
+$code.=<<___;
+ mov \$4, $cnt
+loop_$m:
+ movzxw (+64*3+256*0)($pSrc), $d0
+ movzxw (+64*3+256*1)($pSrc), $d1
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*2+256*0)($pSrc), $d0#w
+ mov (+64*2+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*1+256*0)($pSrc), $d0#w
+ mov (+64*1+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*0+256*0)($pSrc), $d0#w
+ mov (+64*0+256*1)($pSrc), $d1#w
+ mov $d0, (+8*0)($pDst)
+ mov $d1, (+8*1)($pDst)
+ lea 256*2($pSrc), $pSrc
+ lea 8*2($pDst), $pDst
+ sub \$1, $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
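+
+# Layout note (a sketch; the strides follow from the code above): entry
+# e of the 32-entry table starts at garray + 2*e, and 16-bit word w of
+# qword q within an entry lives at byte offset 256*q + 64*w, i.e.
+#
+#   /* scatter one 8-qword entry into its 16-bit columns;
+#      dst already points at garray + e (hypothetical model) */
+#   void swizzle_model(uint16_t *dst, const uint64_t src[8])
+#   {
+#       for (int q = 0; q < 8; q++)
+#           for (int w = 0; w < 4; w++)
+#               dst[(q*4 + w) * 32] = (uint16_t)(src[q] >> (16*w));
+#   }
+#
+# Every 64-byte line of the table therefore holds the same 16-bit piece
+# of all 32 entries, so a table lookup touches the same cache lines
+# regardless of the (secret) index -- presumably the point of the
+# scattering.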
+
+#
+# Data Structures
+#
+
+# Reduce Data
+#
+#
+# Offset Value
+# 0C0 Carries
+# 0B8 X2[10]
+# 0B0 X2[9]
+# 0A8 X2[8]
+# 0A0 X2[7]
+# 098 X2[6]
+# 090 X2[5]
+# 088 X2[4]
+# 080 X2[3]
+# 078 X2[2]
+# 070 X2[1]
+# 068 X2[0]
+# 060 X1[12] P[10]
+# 058 X1[11] P[9] Z[8]
+# 050 X1[10] P[8] Z[7]
+# 048 X1[9] P[7] Z[6]
+# 040 X1[8] P[6] Z[5]
+# 038 X1[7] P[5] Z[4]
+# 030 X1[6] P[4] Z[3]
+# 028 X1[5] P[3] Z[2]
+# 020 X1[4] P[2] Z[1]
+# 018 X1[3] P[1] Z[0]
+# 010 X1[2] P[0] Y[2]
+# 008 X1[1] Q[1] Y[1]
+# 000 X1[0] Q[0] Y[0]
+
+my $X1_offset = 0; # 13 qwords
+my $X2_offset = $X1_offset + 13*8; # 11 qwords
+my $Carries_offset = $X2_offset + 11*8; # 1 qword
+my $Q_offset = 0; # 2 qwords
+my $P_offset = $Q_offset + 2*8; # 11 qwords
+my $Y_offset = 0; # 3 qwords
+my $Z_offset = $Y_offset + 3*8; # 9 qwords
+
+my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
+
+#
+# Stack Frame
+#
+#
+# offset value
+# ...
+# ...
+# 280 Garray
+
+# 278 tmp16[15]
+# ... ...
+# 200 tmp16[0]
+
+# 1F8 tmp[7]
+# ... ...
+# 1C0 tmp[0]
+
+# 1B8 GT[7]
+# ... ...
+# 180 GT[0]
+
+# 178 Reduce Data
+# ... ...
+# 0B8 Reduce Data
+# 0B0 reserved
+# 0A8 reserved
+# 0A0 reserved
+# 098 reserved
+# 090 reserved
+# 088 reduce result addr
+# 080 exp[8]
+
+# ...
+# 048 exp[1]
+# 040 exp[0]
+
+# 038 reserved
+# 030 loop_idx
+# 028 pg
+# 020 i
+# 018 pData ; arg 4
+# 010 pG ; arg 2
+# 008 pResult ; arg 1
+# 000 rsp ; stack pointer before subtract
+
+my $rsp_offset = 0;
+my $pResult_offset = 8*1 + $rsp_offset;
+my $pG_offset = 8*1 + $pResult_offset;
+my $pData_offset = 8*1 + $pG_offset;
+my $i_offset = 8*1 + $pData_offset;
+my $pg_offset = 8*1 + $i_offset;
+my $loop_idx_offset = 8*1 + $pg_offset;
+my $reserved1_offset = 8*1 + $loop_idx_offset;
+my $exp_offset = 8*1 + $reserved1_offset;
+my $red_result_addr_offset= 8*9 + $exp_offset;
+my $reserved2_offset = 8*1 + $red_result_addr_offset;
+my $Reduce_Data_offset = 8*5 + $reserved2_offset;
+my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
+my $tmp_offset = 8*8 + $GT_offset;
+my $tmp16_offset = 8*8 + $tmp_offset;
+my $garray_offset = 8*16 + $tmp16_offset;
+my $mem_size = 8*8*32 + $garray_offset;
+
+#
+# Offsets within Reduce Data
+#
+#
+# struct MODF_2FOLD_MONT_512_C1_DATA {
+# UINT64 t[8][8];
+# UINT64 m[8];
+# UINT64 m1[8]; /* 2^768 % m */
+# UINT64 m2[8]; /* 2^640 % m */
+# UINT64 k1[2]; /* (- 1/m) % 2^128 */
+# };
+
+my $T = 0;
+my $M = 512; # = 8 * 8 * 8
+my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
+my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
+my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
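+
+# In other words (a sketch of the intent, inferred from the fields
+# above): m is the modulus, t[][] holds precomputed overflow
+# corrections, and m1, m2, k1 let a 1024-bit value x be reduced by two
+# "folds" plus one 128-bit Montgomery step:
+#
+#   x  = xh*2^768 + xl      # xh: 256 bits
+#   X1 = xh*m1 + xl         # == x (mod m), ~769 bits
+#   X1 = yh*2^640 + yl      # yh: ~129 bits
+#   X2 = yh*m2 + yl         # == x (mod m), ~641 bits
+#   q  = X2*k1 mod 2^128    # so X2 + q*m == 0 (mod 2^128)
+#   r  = (X2 + q*m)/2^128   # == x * 2^-128 (mod m)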
+
+#
+# FUNCTIONS
+#
+
+{{{
+#
+# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
+# and add 512-bits (8 qwords)
+# to get 640 bits (10 qwords)
+# Input: 128-bit mul source: [rdi+8*1], rbp
+# 512-bit mul source: [rsi+8*n]
+# 512-bit add source: r15, r14, ..., r9, r8
+# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
+# Clobbers all regs except: rcx, rsi, rdi
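+#
+# In the value-level terms used for MULSTEP_512 above, this is just two
+# rows back to back (a sketch):
+#
+#   {r9:r8:r15:r14:r13:r12:r11:r10, [rcx+8*1], [rcx+8*0]}
+#       = a(128-bit) * b(512-bit) + x(512-bit)        # 640-bit result
+#
+# with a = { [rdi+8*1], rbp }, b = [rsi] and x = r15...r8 on entry.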
+$code.=<<___;
+.type MULADD_128x512,\@abi-omnipotent
+.align 16
+MULADD_128x512:
+___
+ &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ mov (+8*1)(%rdi), %rbp
+___
+ &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ ret
+.size MULADD_128x512,.-MULADD_128x512
+___
+}}}
+
+{{{
+#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
+#
+# Inputs: pDst: Destination (768 bits, 12 qwords)
+# pA: Multiplicand (1024 bits, 16 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Dst = Ah * B + Al
+# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
+# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
+# Uses registers: arguments, RAX, RDX
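+#
+# Within mont_reduce this implements the first fold: with pA = x (16
+# qwords) and pB = m1 it forms x[15:12]*m1 + x[7:0]; the caller then
+# adds the remaining addend limbs x[11:8], completing
+# X1 = xh*m1 + low768(x).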
+sub MULADD_256x512
+{
+ my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
+$code.=<<___;
+ mov (+8*12)($pA), $OP
+___
+ &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*13)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*14)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*15)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+}
+
+#
+# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
+# UINT64 *m, /* 512 bits, 8 qwords */
+# MODF_2FOLD_MONT_512_C1_DATA *data,
+# UINT64 *r) /* 512 bits, 8 qwords */
+# Input: x (number to be reduced): tmp16 (Implicit)
+# m (modulus): [pM] (Implicit)
+# data (reduce data): [pData] (Implicit)
+# Output: r (result): Address in [red_res_addr]
+# result also in: r9, r8, r15, r14, r13, r12, r11, r10
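+#
+# Reading aid -- the code below follows the fold arithmetic sketched
+# next to the M1/M2/K1 constants above:
+#
+#   X1 = (x >> 768)*m1 + low768(x)      # 1024 -> ~769 bits
+#   X2 = (X1 >> 640)*m2 + low640(X1)    # ~769 -> ~641 bits
+#   Q  = low128(X2)*k1 mod 2^128
+#   R  = (X2 + Q*m) >> 128              # == x * 2^-128 (mod m)
+#   R += t[carry pattern]               # then a carry-masked subtract
+#                                       # of m normalizes the result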
+
+my @X=map("%r$_",(8..15));
+
+$code.=<<___;
+.type mont_reduce,\@abi-omnipotent
+.align 16
+mont_reduce:
+___
+
+my $STACK_DEPTH = 8;
+ #
+ # X1 = Xh * M1 + Xl
+$code.=<<___;
+ lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
+ add \$$M1, %rsi
+ lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
+
+___
+
+ &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
+ # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
+
+$code.=<<___;
+ xor %rax, %rax
+ # X1 += xl
+ add (+8*8)(%rcx), $X[4]
+ adc (+8*9)(%rcx), $X[5]
+ adc (+8*10)(%rcx), $X[6]
+ adc (+8*11)(%rcx), $X[7]
+ adc \$0, %rax
+ # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
+
+ #
+ # check for carry ;; carry stored in rax
+ mov $X[4], (+8*8)(%rdi) # rdi points to X1
+ mov $X[5], (+8*9)(%rdi)
+ mov $X[6], %rbp
+ mov $X[7], (+8*11)(%rdi)
+
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ mov (+8*0)(%rdi), $X[4]
+ mov (+8*1)(%rdi), $X[5]
+ mov (+8*2)(%rdi), $X[6]
+ mov (+8*3)(%rdi), $X[7]
+
+ # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
+ # rdi -> X1
+ # rsi -> M1
+
+ #
+ # X2 = Xh * M2 + Xl
+ # do first part (X2 = Xh * M2)
+ add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
+ # Xh is actually { [rdi+8*1], rbp }
+ add \$`$M2-$M1`, %rsi # rsi -> M2
+ lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
+___
+ unshift(@X,pop(@X)); unshift(@X,pop(@X));
+$code.=<<___;
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
+
+ # X2 += Xl
+ add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl
+ adc (+8*9-8*10)(%rdi), $X[7]
+ mov $X[6], (+8*8)(%rcx)
+ mov $X[7], (+8*9)(%rcx)
+
+ adc %rax, %rax
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
+ add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
+
+ # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
+ # B1:B0 = rsi[1:0] = K1[1:0]
+ # A1:A0 = rcx[1:0] = X2[1:0]
+ # Result = rdi[1],rbp = Q[1],rbp
+ mov (%rsi), %r8 # B0
+ mov (+8*1)(%rsi), %rbx # B1
+
+ mov (%rcx), %rax # A0
+ mul %r8 # B0
+ mov %rax, %rbp
+ mov %rdx, %r9
+
+ mov (+8*1)(%rcx), %rax # A1
+ mul %r8 # B0
+ add %rax, %r9
+
+ mov (%rcx), %rax # A0
+ mul %rbx # B1
+ add %rax, %r9
+
+ mov %r9, (+8*1)(%rdi)
+ # end MUL_128x128t128
+
+ sub \$`$K1-$M`, %rsi
+
+ mov (%rcx), $X[6]
+ mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+
+ # load first half of m to rdx, rdi, rbx, rax
+ # moved this here for efficiency
+ mov (+8*0)(%rsi), %rax
+ mov (+8*1)(%rsi), %rbx
+ mov (+8*2)(%rsi), %rdi
+ mov (+8*3)(%rsi), %rdx
+
+ # continue with reduction
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
+
+ add (+8*8)(%rcx), $X[6]
+ adc (+8*9)(%rcx), $X[7]
+
+ #accumulate the final carry to rbp
+ adc %rbp, %rbp
+
+ # Add in overflow corrections: R = (X2>>128) += T[overflow]
+ # R = {r9, r8, r15, r14, ..., r10}
+ shl \$3, %rbp
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> Data (and points to T)
+ add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
+
+ # rsi will be used to generate a mask after the addition
+ xor %rsi, %rsi
+
+ add (+8*8*0)(%rbp), $X[0]
+ adc (+8*8*1)(%rbp), $X[1]
+ adc (+8*8*2)(%rbp), $X[2]
+ adc (+8*8*3)(%rbp), $X[3]
+ adc (+8*8*4)(%rbp), $X[4]
+ adc (+8*8*5)(%rbp), $X[5]
+ adc (+8*8*6)(%rbp), $X[6]
+ adc (+8*8*7)(%rbp), $X[7]
+
+ # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
+ # if carry is clear: rsi = 0x0000000000000000
+ sbb \$0, %rsi
+
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ mov \$1, %rbp
+ sub %rax, $X[0]
+ sbb %rbx, $X[1]
+ sbb %rdi, $X[2]
+ sbb %rdx, $X[3]
+
+ # if there is a borrow: rbp = 0
+ # if there is no borrow: rbp = 1
+ # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
+ sbb \$0, %rbp
+
+ #load second half of m to rdx, rdi, rbx, rax
+
+ add \$$M, %rcx
+ mov (+8*4)(%rcx), %rax
+ mov (+8*5)(%rcx), %rbx
+ mov (+8*6)(%rcx), %rdi
+ mov (+8*7)(%rcx), %rdx
+
+ # use the rsi mask as before
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ # if rbp = 0, there was a borrow before, it is moved to the carry flag
+ # if rbp = 1, there was not a borrow before, carry flag is cleared
+ sub \$1, %rbp
+
+ sbb %rax, $X[4]
+ sbb %rbx, $X[5]
+ sbb %rdi, $X[6]
+ sbb %rdx, $X[7]
+
+ # write R back to memory
+
+ mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
+ mov $X[0], (+8*0)(%rsi)
+ mov $X[1], (+8*1)(%rsi)
+ mov $X[2], (+8*2)(%rsi)
+ mov $X[3], (+8*3)(%rsi)
+ mov $X[4], (+8*4)(%rsi)
+ mov $X[5], (+8*5)(%rsi)
+ mov $X[6], (+8*6)(%rsi)
+ mov $X[7], (+8*7)(%rsi)
+
+ ret
+.size mont_reduce,.-mont_reduce
+___
+}}}
+
+{{{
+#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
+#
+# Inputs: pDst: Destination (1024 bits, 16 qwords)
+# pA: Multiplicand (512 bits, 8 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Uses registers rax, rdx, args
+# B operand in [pB] and also in x7...x0
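+#
+# Value-level model (hypothetical reference code; plain schoolbook,
+# assumes stdint.h/string.h and unsigned __int128):
+#
+#   void mul_512x512(uint64_t dst[16], const uint64_t a[8],
+#                    const uint64_t b[8])
+#   {
+#       memset(dst, 0, 16*sizeof(uint64_t));
+#       for (int i = 0; i < 8; i++) {
+#           uint64_t c = 0;
+#           for (int j = 0; j < 8; j++) {
+#               unsigned __int128 t =
+#                   (unsigned __int128)a[i]*b[j] + dst[i+j] + c;
+#               dst[i+j] = (uint64_t)t;
+#               c = (uint64_t)(t >> 64);
+#           }
+#           dst[i+8] = c;
+#       }
+#   }
+#
+# b[] arrives pre-loaded in the x7...x0 registers and is consumed by the
+# first row; from then on those registers carry the running sum while
+# b[] is re-read from [pB], one low limb of dst retiring per row.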
+sub MUL_512x512
+{
+ my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+
+$code.=<<___;
+ mov (+8*0)($pA), $OP
+
+ mov $X[0], %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $X[0]
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov $X[$i], %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i-1]
+ adc \$0, %rdx
+ mov %rdx, $X[$i]
+___
+}
+
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov (+8*$i)($pA), $OP
+___
+
+ &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
+ push(@X,shift(@X));
+}
+
+$code.=<<___;
+ mov $X[0], (+$pDst_o+8*8)($pDst)
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+ mov $X[3], (+$pDst_o+8*11)($pDst)
+ mov $X[4], (+$pDst_o+8*12)($pDst)
+ mov $X[5], (+$pDst_o+8*13)($pDst)
+ mov $X[6], (+$pDst_o+8*14)($pDst)
+ mov $X[7], (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
+# Input: src1: Address of source 1: rdi
+# src2: Address of source 2: rsi
+# Output: dst: Address of destination: [red_res_addr]
+# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+# Temp: Clobbers [tmp16], all registers
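+#
+# Given mont_reduce's x * 2^-128 (mod m) semantics, values kept in the
+# Montgomery domain v' = v*2^128 mod m are closed under this routine:
+#
+#   mont_mul_a3b(a', b') = a'*b' * 2^-128 = (a*b) * 2^128 = (a*b)'
+#
+# which is why mod_exp_512 below first computes GT = reduce(g * 2^256)
+# = g' and strips the factor off again at the very end.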
+$code.=<<___;
+.type mont_mul_a3b,\@abi-omnipotent
+.align 16
+mont_mul_a3b:
+ #
+ # multiply tmp = src1 * src2
+ # For multiply: dst = rcx, src1 = rdi, src2 = rsi
+ # stack depth is extra 8 from call
+___
+ &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
+$code.=<<___;
+ #
+ # Dst = tmp % m
+ # Call reduce(tmp, m, data, dst)
+
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size mont_mul_a3b,.-mont_mul_a3b
+___
+}}}
+
+{{{
+#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
+#
+# Input in memory [pA] and also in x7...x0
+# Uses all argument registers plus rax and rdx
+#
+# This version computes all of the off-diagonal terms into memory,
+# and then it adds in the diagonal terms
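+#
+# i.e. with a = sum_i a_i * 2^(64*i):
+#
+#   a^2 = sum_i a_i^2 * 2^(128*i)
+#       + 2 * sum_{i<j} a_i*a_j * 2^(64*(i+j))
+#
+# so only 28 off-diagonal + 8 diagonal = 36 multiplications are needed
+# instead of the 64 of MUL_512x512. A rough value model (hypothetical;
+# add_at() adds a 128-bit value at limb k with carry propagation,
+# shl1() doubles an n-limb number):
+#
+#   void sqr_512(uint64_t d[16], const uint64_t a[8])
+#   {
+#       memset(d, 0, 16*sizeof(uint64_t));
+#       for (int i = 0; i < 8; i++)         /* off-diagonal products */
+#           for (int j = i+1; j < 8; j++)
+#               add_at(d, i+j, (unsigned __int128)a[i]*a[j]);
+#       shl1(d, 16);                        /* double them */
+#       for (int i = 0; i < 8; i++)         /* diagonal squares */
+#           add_at(d, 2*i, (unsigned __int128)a[i]*a[i]);
+#   }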
+
+sub SQR_512
+{
+ my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+$code.=<<___;
+ # ------------------
+ # first pass 01...07
+ # ------------------
+ mov $X[0], $A
+
+ mov $X[1],%rax
+ mul $A
+ mov %rax, (+$pDst_o+8*1)($pDst)
+___
+for(my $i=2;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $X[$i-2]
+ mov $X[$i],%rax
+ mul $A
+ add %rax, $X[$i-2]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $x7
+
+ mov $X[0], (+$pDst_o+8*2)($pDst)
+
+ # ------------------
+ # second pass 12...17
+ # ------------------
+
+ mov (+8*1)($pA), $A
+
+ mov (+8*2)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*3)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*4)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ add $X[0], $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # third pass 23...27
+ # ------------------
+ mov (+8*2)($pA), $A
+
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ mov $X[3], (+$pDst_o+8*5)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+ mov $X[4], (+$pDst_o+8*6)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # fourth pass 34...37
+ # ------------------
+
+ mov (+8*3)($pA), $A
+
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*7)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+ mov $x7, (+$pDst_o+8*8)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+
+ mov %rdx, $X[5]
+
+ # ------------------
+ # fifth pass 45...47
+ # ------------------
+ mov (+8*4)($pA), $A
+
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # sixth pass 56...57
+ # ------------------
+ mov (+8*5)($pA), $A
+
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*11)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*12)($pDst)
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # seventh pass 67
+ # ------------------
+ mov $X[6], $A
+
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*13)($pDst)
+
+ mov %rdx, (+$pDst_o+8*14)($pDst)
+
+ # start finalize (add in squares, and double off-terms)
+ mov (+$pDst_o+8*1)($pDst), $X[0]
+ mov (+$pDst_o+8*2)($pDst), $X[1]
+ mov (+$pDst_o+8*3)($pDst), $X[2]
+ mov (+$pDst_o+8*4)($pDst), $X[3]
+ mov (+$pDst_o+8*5)($pDst), $X[4]
+ mov (+$pDst_o+8*6)($pDst), $X[5]
+
+ mov (+8*3)($pA), %rax
+ mul %rax
+ mov %rax, $x6
+ mov %rdx, $X[6]
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc \$0, $X[6]
+
+ mov (+8*0)($pA), %rax
+ mul %rax
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $A
+
+ mov (+8*1)($pA), %rax
+ mul %rax
+
+ add $A, $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+ mov $X[0], (+$pDst_o+8*1)($pDst)
+ mov $X[1], (+$pDst_o+8*2)($pDst)
+
+ mov (+8*2)($pA), %rax
+ mul %rax
+
+ add $A, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+
+ mov $X[2], (+$pDst_o+8*3)($pDst)
+ mov $X[3], (+$pDst_o+8*4)($pDst)
+
+ xor $tmp, $tmp
+ add $A, $X[4]
+ adc $x6, $X[5]
+ adc \$0, $tmp
+
+ mov $X[4], (+$pDst_o+8*5)($pDst)
+ mov $X[5], (+$pDst_o+8*6)($pDst)
+
+ # %%tmp has 0/1 in column 7
+ # %%A6 has a full value in column 7
+
+ mov (+$pDst_o+8*7)($pDst), $X[0]
+ mov (+$pDst_o+8*8)($pDst), $X[1]
+ mov (+$pDst_o+8*9)($pDst), $X[2]
+ mov (+$pDst_o+8*10)($pDst), $X[3]
+ mov (+$pDst_o+8*11)($pDst), $X[4]
+ mov (+$pDst_o+8*12)($pDst), $X[5]
+ mov (+$pDst_o+8*13)($pDst), $x6
+ mov (+$pDst_o+8*14)($pDst), $x7
+
+ mov $X[7], %rax
+ mul %rax
+ mov %rax, $X[7]
+ mov %rdx, $A
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc $x6, $x6
+ adc $x7, $x7
+ adc \$0, $A
+
+ add $tmp, $X[0]
+
+ mov (+8*4)($pA), %rax
+ mul %rax
+
+ add $X[6], $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[0], (+$pDst_o+8*7)($pDst)
+ mov $X[1], (+$pDst_o+8*8)($pDst)
+
+ mov (+8*5)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[2], (+$pDst_o+8*9)($pDst)
+ mov $X[3], (+$pDst_o+8*10)($pDst)
+
+ mov (+8*6)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[4]
+ adc %rax, $X[5]
+ adc \$0, %rdx
+
+ mov $X[4], (+$pDst_o+8*11)($pDst)
+ mov $X[5], (+$pDst_o+8*12)($pDst)
+
+ add %rdx, $x6
+ adc $X[7], $x7
+ adc \$0, $A
+
+ mov $x6, (+$pDst_o+8*13)($pDst)
+ mov $x7, (+$pDst_o+8*14)($pDst)
+ mov $A, (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
+#
+# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+#
+$code.=<<___;
+.type sqr_reduce,\@abi-omnipotent
+.align 16
+sqr_reduce:
+ mov (+$pResult_offset+8)(%rsp), %rcx
+___
+ &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
+$code.=<<___;
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size sqr_reduce,.-sqr_reduce
+___
+}}}
+
+#
+# MAIN FUNCTION
+#
+
+#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
+# UINT64 *g, /* 512 bits, 8 qwords */
+# UINT64 *exp, /* 512 bits, 8 qwords */
+# struct mod_ctx_512 *data)
+
+# window size = 5
+# table size = 2^5 = 32
+#table_entries equ 32
+#table_size equ table_entries * 8
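+#
+# Overall flow: a left-to-right fixed-window exponentiation (a sketch;
+# mm() stands for mont_mul_a3b and all values live in the Montgomery
+# domain):
+#
+#   t[0] = 1'; for (i = 1; i < 32; i++) t[i] = mm(t[i-1], g');
+#   r = t[exp >> 507];                      # top 5 bits of exp
+#   for (bit = 505; bit >= 0; bit -= 5) {
+#       square r (sqr_reduce) five times;   # only twice on the first
+#                                           # pass, entered at sqr_2
+#       r = mm(r, t[(exp >> bit) & 31]);
+#   }
+#   r = mont_reduce(r);                     # leave Montgomery domain
+#
+# (512 = 5 + 2 + 101*5: the window just below the top one is only two
+# bits wide; the already-consumed top bits are masked out of the stored
+# exponent, so the overlapping 5-bit read is harmless.)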
+$code.=<<___;
+.globl mod_exp_512
+.type mod_exp_512,\@function,4
+mod_exp_512:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # adjust stack down and then align it with cache boundary
+ mov %rsp, %r8
+ sub \$$mem_size, %rsp
+ and \$-64, %rsp
+
+ # store previous stack pointer and arguments
+ mov %r8, (+$rsp_offset)(%rsp)
+ mov %rdi, (+$pResult_offset)(%rsp)
+ mov %rsi, (+$pG_offset)(%rsp)
+ mov %rcx, (+$pData_offset)(%rsp)
+.Lbody:
+ # transform g into montgomery space
+ # GT = reduce(g * C2) = reduce(g * (2^256))
+ # reduce expects to have the input in [tmp16]
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rsi), %xmm0
+ movdqu (+16*1)(%rsi), %xmm1
+ movdqu (+16*2)(%rsi), %xmm2
+ movdqu (+16*3)(%rsi), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
+
+ # load pExp before rdx gets blown away
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+
+ lea (+$GT_offset)(%rsp), %rbx
+ mov %rbx, (+$red_result_addr_offset)(%rsp)
+ call mont_reduce
+
+ # Initialize tmp = C
+ lea (+$tmp_offset)(%rsp), %rcx
+ xor %rax, %rax
+ mov %rax, (+8*0)(%rcx)
+ mov %rax, (+8*1)(%rcx)
+ mov %rax, (+8*3)(%rcx)
+ mov %rax, (+8*4)(%rcx)
+ mov %rax, (+8*5)(%rcx)
+ mov %rax, (+8*6)(%rcx)
+ mov %rax, (+8*7)(%rcx)
+ mov %rax, (+$exp_offset+8*8)(%rsp)
+ movq \$1, (+8*2)(%rcx)
+
+ lea (+$garray_offset)(%rsp), %rbp
+ mov %rcx, %rsi # pTmp
+ mov %rbp, %rdi # Garray[][0]
+___
+
+ &swizzle("%rdi", "%rcx", "%rax", "%rbx");
+
+ # for (rax = 31; rax != 0; rax--) {
+ # tmp = reduce(tmp * G)
+ # swizzle(pg, tmp);
+ # pg += 2; }
+$code.=<<___;
+ mov \$31, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ mov %rbp, (+$pg_offset)(%rsp)
+ # rsi -> pTmp
+ mov %rsi, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rsi), %r10
+ mov (+8*1)(%rsi), %r11
+ mov (+8*2)(%rsi), %r12
+ mov (+8*3)(%rsi), %r13
+ mov (+8*4)(%rsi), %r14
+ mov (+8*5)(%rsi), %r15
+ mov (+8*6)(%rsi), %r8
+ mov (+8*7)(%rsi), %r9
+init_loop:
+ lea (+$GT_offset)(%rsp), %rdi
+ call mont_mul_a3b
+ lea (+$tmp_offset)(%rsp), %rsi
+ mov (+$pg_offset)(%rsp), %rbp
+ add \$2, %rbp
+ mov %rbp, (+$pg_offset)(%rsp)
+ mov %rsi, %rcx # rcx = rsi = addr of tmp
+___
+
+ &swizzle("%rbp", "%rcx", "%rax", "%rbx");
+$code.=<<___;
+ mov (+$i_offset)(%rsp), %rax
+ sub \$1, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ jne init_loop
+
+ #
+ # Copy exponent onto stack
+ movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
+
+
+ #
+ # Do exponentiation
+ # Initialize result to G[exp{511:507}]
+ mov (+$exp_offset+62)(%rsp), %eax
+ mov %rax, %rdx
+ shr \$11, %rax
+ and \$0x07FF, %edx
+ mov %edx, (+$exp_offset+62)(%rsp)
+ lea (+$garray_offset)(%rsp,%rax,2), %rsi
+ mov (+$pResult_offset)(%rsp), %rdx
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+
+ #
+ # Loop variables
+ # rcx = [loop_idx] = index: 510-5 to 0 by 5
+$code.=<<___;
+ movq \$505, (+$loop_idx_offset)(%rsp)
+
+ mov (+$pResult_offset)(%rsp), %rcx
+ mov %rcx, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rcx), %r10
+ mov (+8*1)(%rcx), %r11
+ mov (+8*2)(%rcx), %r12
+ mov (+8*3)(%rcx), %r13
+ mov (+8*4)(%rcx), %r14
+ mov (+8*5)(%rcx), %r15
+ mov (+8*6)(%rcx), %r8
+ mov (+8*7)(%rcx), %r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+ #
+ # Do multiply, first look up proper value in Garray
+ mov (+$loop_idx_offset)(%rsp), %rcx # bit index
+ mov %rcx, %rax
+ shr \$4, %rax # rax is word pointer
+ mov (+$exp_offset)(%rsp,%rax,2), %edx
+ and \$15, %rcx
+ shrq %cl, %rdx
+ and \$0x1F, %rdx
+
+ lea (+$garray_offset)(%rsp,%rdx,2), %rsi
+ lea (+$tmp_offset)(%rsp), %rdx
+ mov %rdx, %rdi
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+ # rdi = tmp = pG
+
+ #
+ # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData)
+ # result result pG M Data
+$code.=<<___;
+ mov (+$pResult_offset)(%rsp), %rsi
+ call mont_mul_a3b
+
+ #
+ # finish loop
+ mov (+$loop_idx_offset)(%rsp), %rcx
+ sub \$5, %rcx
+ mov %rcx, (+$loop_idx_offset)(%rsp)
+ jge main_loop_a3b
+
+ #
+
+end_main_loop_a3b:
+ # transform result out of Montgomery space
+ # result = reduce(result)
+ mov (+$pResult_offset)(%rsp), %rdx
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
+ call mont_reduce
+
+ # If result > m, subtract m
+ # load result into r15:r8
+ mov (+$pResult_offset)(%rsp), %rax
+ mov (+8*0)(%rax), %r8
+ mov (+8*1)(%rax), %r9
+ mov (+8*2)(%rax), %r10
+ mov (+8*3)(%rax), %r11
+ mov (+8*4)(%rax), %r12
+ mov (+8*5)(%rax), %r13
+ mov (+8*6)(%rax), %r14
+ mov (+8*7)(%rax), %r15
+
+ # subtract m
+ mov (+$pData_offset)(%rsp), %rbx
+ add \$$M, %rbx
+
+ sub (+8*0)(%rbx), %r8
+ sbb (+8*1)(%rbx), %r9
+ sbb (+8*2)(%rbx), %r10
+ sbb (+8*3)(%rbx), %r11
+ sbb (+8*4)(%rbx), %r12
+ sbb (+8*5)(%rbx), %r13
+ sbb (+8*6)(%rbx), %r14
+ sbb (+8*7)(%rbx), %r15
+
+ # if Carry is clear, replace result with difference
+ mov (+8*0)(%rax), %rsi
+ mov (+8*1)(%rax), %rdi
+ mov (+8*2)(%rax), %rcx
+ mov (+8*3)(%rax), %rdx
+ cmovnc %r8, %rsi
+ cmovnc %r9, %rdi
+ cmovnc %r10, %rcx
+ cmovnc %r11, %rdx
+ mov %rsi, (+8*0)(%rax)
+ mov %rdi, (+8*1)(%rax)
+ mov %rcx, (+8*2)(%rax)
+ mov %rdx, (+8*3)(%rax)
+
+ mov (+8*4)(%rax), %rsi
+ mov (+8*5)(%rax), %rdi
+ mov (+8*6)(%rax), %rcx
+ mov (+8*7)(%rax), %rdx
+ cmovnc %r12, %rsi
+ cmovnc %r13, %rdi
+ cmovnc %r14, %rcx
+ cmovnc %r15, %rdx
+ mov %rsi, (+8*4)(%rax)
+ mov %rdi, (+8*5)(%rax)
+ mov %rcx, (+8*6)(%rax)
+ mov %rdx, (+8*7)(%rax)
+
+ mov (+$rsp_offset)(%rsp), %rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbx
+ mov 40(%rsi),%rbp
+ lea 48(%rsi),%rsp
+.Lepilogue:
+ ret
+.size mod_exp_512, . - mod_exp_512
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mod_exp_512_se_handler,\@abi-omnipotent
+.align 16
+mod_exp_512_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody(%rip),%r10
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov $rsp_offset(%rax),%rax # pull saved Rsp
+
+ mov 32(%rax),%rbx
+ mov 40(%rax),%rbp
+ mov 24(%rax),%r12
+ mov 16(%rax),%r13
+ mov 8(%rax),%r14
+ mov 0(%rax),%r15
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mod_exp_512_se_handler,.-mod_exp_512_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_mod_exp_512
+ .rva .LSEH_end_mod_exp_512
+ .rva .LSEH_info_mod_exp_512
+
+.section .xdata
+.align 8
+.LSEH_info_mod_exp_512:
+ .byte 9,0,0,0
+ .rva mod_exp_512_se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/(\(\+[^)]+\))/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 0000000..4a766a8
--- /dev/null
+++ b/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,993 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~50-90% better (less for longer
+# keys) than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
+# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplications [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path provides virtually the same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. The latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# a couple of limitations: vector lengths have to be even and vector
+# addresses have to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
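+#
+# To make the half-picking concrete (a sketch): a 64-bit BN_ULONG
+# a = ah*2^32 + al is treated as the two 32-bit digits { al, ah }, so an
+# n-element 64-bit vector becomes a 2n-element 32-bit vector whose
+# products are all formed with xmpyu (32x32=64 bits); the halves of each
+# word are consumed in flipped order to preserve digit significance on
+# the big-endian PA-RISC.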
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $BN_SZ =$SIZE_T;
+} else {
+ $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $BN_SZ =$SIZE_T;
+ if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+ $BN_SZ=8;
+ $LEVEL="2.0";
+ last;
+ }
+ }
+ close CONF;
+ }
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
+ # [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32; # local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22"; # passed through stack in 32-bit
+$num="%r21"; # passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4"; $fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
+$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+bn_mul_mont
+ .PROC
+ .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ ldo -$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldw `-$FRAME_MARKER-4`($fp),$n0
+ ldw `-$FRAME_MARKER-8`($fp),$num
+ nop
+ nop ; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+ comiclr,<= 6,$num,%r0 ; are vectors long enough?
+ b L\$abort
+ ldi 0,%r28 ; signal "unhandled"
+ add,ev %r0,$num,$num ; is $num even?
+ b L\$abort
+ nop
+ or $ap,$np,$ti1
+ extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
+ b L\$abort
+ nop
+ nop ; alignment
+ nop
+
+ fldws 0($n0),${fn0}
+ fldws,ma 4($bp),${fbi} ; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+ comib,> 3,$num,L\$abort ; are vectors long enough?
+ ldi 0,%r28 ; signal "unhandled"
+ addl $num,$num,$num ; I operate on 32-bit values
+
+ fldws 4($n0),${fn0} ; only low part of n0
+ fldws 4($bp),${fbi} ; bp[0] in flipped word order
+___
+$code.=<<___;
+ fldds 0($ap),${fai} ; ap[0,1]
+ fldds 0($np),${fni} ; np[0,1]
+
+ sh2addl $num,%r0,$arrsz
+ ldi 31,$hi0
+ ldo 36($arrsz),$hi1 ; space for tp[num+1]
+ andcm $hi1,$hi0,$hi1 ; align
+ addl $hi1,%sp,%sp
+ $PUSH $fp,-$SIZE_T(%sp)
+
+ ldo `$LOCALS+16`($fp),$xfer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ addl $arrsz,$ap,$ap ; point at the end
+ addl $arrsz,$np,$np
+ subi 0,$arrsz,$idx ; j=0
+ ldo 8($idx),$idx ; j++++
+
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ fstds ${fab1},0($xfer)
+ fstds ${fnm1},8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+ mtctl $hi0,%cr11 ; $hi0 still holds 31
+ extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
+ b L\$parisc11
+ nop
+___
+$code.=<<___; # PA-RISC 2.0 code-path
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $ab0,63,32,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ ldo 8($idx),$idx ; j++++
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ extrd,u $nm0,31,32,$hi1
+
+L\$1st
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ stw $nm1,-4($tp) ; tp[j-1]
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ addl $ab1,$nm1,$nm1
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+ fldws 0($bp),${fbi} ; bp[1] in flipped word order
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ addl $hi1,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2]
+ flddx $idx($np),${fni} ; np[2]
+ ldo 8($idx),$idx ; j++++
+ ldd -16($xfer),$ab0 ; 33-bit value
+ ldd -8($xfer),$nm0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ extrd,u $ab0,31,32,$ti0 ; carry bit
+ extrd,u $ab0,63,32,$ab0
+ fstds ${fab1},0($xfer)
+ addl $ti0,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ extrd,u $nm0,31,32,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+L\$inner
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldw 4($tp),$ti0 ; tp[j]
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ti0,$ti0
+ addl $ti0,$ab0,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $nm1,31,32,$hi1
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldw 4($tp),$ti0 ; tp[j]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ addl $ti0,$ab0,$ab0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,31,32,$hi0
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldi 12,$ti0 ; bp[i] in flipped word order
+ addl,ev %r0,$num,$num
+ ldi -4,$ti0
+ addl $ti0,$bp,$bp
+ fldws 0($bp),${fbi}
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0]
+ addl $hi0,$ab1,$ab1
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ addl $hi1,$hi0,$hi0
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+ addl $hi0,$ab1,$ab1
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ addl $hi1,$hi0,$hi0
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+ ldws,ma 4($tp),$ti0
+ extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
+ b L\$sub_pa11
+ addl $tp,$arrsz,$tp
+L\$sub
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldd,ma 8($tp),$ti0
+L\$sub
+ ldd $idx($np),$hi0
+ shrpd $ti0,$ti0,32,$ti0 ; flip word order
+ std $ti0,-8($tp) ; save flipped value
+ sub,db $ti0,$hi0,$hi1
+ ldd,ma 8($tp),$ti0
+ addib,<> 8,$idx,L\$sub
+ std,ma $hi1,8($rp)
+
+ extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
+ sub,db $ti0,%r0,$hi1
+ ldo -8($tp),$tp
+___
+$code.=<<___;
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy
+ ldd $idx($np),$hi0
+ std,ma %r0,8($tp)
+ addib,<> 8,$idx,.-8 ; L\$copy
+ std,ma $hi0,8($rp)
+___
+
+if ($BN_SZ==4) { # PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 8
+L\$parisc11
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -12($xfer),$ablo
+ ldw -16($xfer),$hi0
+ ldw -4($xfer),$nmlo0
+ ldw -8($xfer),$nmhi0
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+ ldo 8($idx),$idx ; j++++
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ addc %r0,$nmhi0,$hi1
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+ nop
+
+L\$1st_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 12($xfer),$nmlo1
+ addc %r0,$abhi,$hi0
+ ldw 8($xfer),$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fab1},0($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ fstds ${fnm1},8($xfer)
+ add $hi1,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$hi1
+ ldw -16($xfer),$abhi
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ ldw -4($xfer),$nmlo0
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -8($xfer),$nmhi0
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ fstds ${fab0},-16($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ fstds ${fnm0},-8($xfer)
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ ldw 8($xfer),$nmhi1
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ add $hi0,$ablo,$ablo
+ fstds ${fab1},0($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -8($xfer),$nmhi0
+ addc %r0,$nmhi1,$hi1
+ ldw -4($xfer),$nmlo0
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[1]
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+ ldw -16($xfer),$abhi ; carry bit actually
+ ldo 8($idx),$idx ; j++++
+ ldw -12($xfer),$ablo
+ ldw -8($xfer),$nmhi0
+ ldw -4($xfer),$nmlo0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ fstds ${fab1},0($xfer)
+ addl $abhi,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ addc %r0,$nmhi0,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+
+L\$inner_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ ldw 12($xfer),$nmlo1
+ add $ti1,$ablo,$ablo
+ ldw 8($xfer),$nmhi1
+ addc %r0,$abhi,$hi0
+ fstds ${fab1},0($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fnm1},8($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ ldw 8($tp),$ti1 ; tp[j]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -4($xfer),$nmlo0
+ add $hi0,$ablo,$ablo
+ ldw -8($xfer),$nmhi0
+ addc %r0,$abhi,$abhi
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ add $ti0,$ablo,$ablo
+ fstds ${fab0},-16($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm0},-8($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldw 8($xfer),$nmhi1
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ fstds ${fab1},0($xfer)
+ add $ti1,$ablo,$ablo
+ fstds ${fnm1},8($xfer)
+ addc %r0,$abhi,$hi0
+ ldw -16($xfer),$abhi
+ add $ablo,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$nmhi1
+ ldw -8($xfer),$nmhi0
+ add $hi1,$nmlo1,$nmlo1
+ ldw -4($xfer),$nmlo0
+ addc %r0,$nmhi1,$hi1
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$abhi
+ add $ti0,$ablo,$ablo
+ ldw 8($tp),$ti1 ; tp[j]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone_pa11; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[i]
+ flddx $idx($ap),${fai} ; ap[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer_pa11
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32+4`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+ ldw -4($tp),$ti0
+ addl $tp,$arrsz,$tp
+L\$sub_pa11
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub_pa11
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy_pa11
+ ldwx $idx($np),$hi0
+ stws,ma %r0,4($tp)
+ addib,<> 4,$idx,L\$copy_pa11
+ stws,ma $hi0,4($rp)
+
+ nop ; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+ ldi 1,%r28 ; signal "handled"
+ ldo $FRAME($fp),%sp ; destroy tp[num+1]
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode the PA-RISC 2.0 instructions used in this module,
+# so that it can be compiled with .LEVEL 1.0. It should be noted that
+# I wouldn't have to do this if the GNU assembler understood the
+# .ALLOW 2.0 directive...
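+#
+# For example, with $xfer=%r22 and $ab0=%r4, the format-5 branch of
+# the $ldd handler below turns "ldd -16($xfer),$ab0" into
+#
+#	.WORD	0x0ec110c4	; ldd	-16(%r22),%r4
+#
+# i.e. 0x03<<26 | 22<<21 | 1<<12 | 3<<6 | 4, with the signed 5-bit
+# offset -16 folded in by the two masked shifts; the value follows
+# from the expressions below and is shown for illustration only.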
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
+ { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+ $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+ # I only have the ",u" completer; it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $sub = sub {
+ my ($mod,$args) = @_;
+ my $orig = "sub$mod\t$args";
+
+ if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+ my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+ $opcode|=(1<<10); # e1
+ $opcode|=(1<<8); # e2
+ $opcode|=(1<<5); # d
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ # flip word order in 64-bit mode...
+ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+ # assemble 2.0 instructions in 32-bit mode...
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index 7849eae..f9b6992 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -31,7 +31,6 @@
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*16;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
@@ -51,7 +50,6 @@
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*16;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
@@ -69,6 +67,9 @@
$POP= $LD;
} else { die "nonsense $flavour"; }
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@
$nj="r11";
$tj="r12";
# non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
#
$nhi="r0";
@@ -108,42 +109,48 @@
.machine "any"
.text
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
.align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
- addi $ovf,$num,`$FRAME+$RZONE`
+ addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
+ mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
- $PUSH r14,`4*$SIZE_T`($sp)
- $PUSH r15,`5*$SIZE_T`($sp)
- $PUSH r16,`6*$SIZE_T`($sp)
- $PUSH r17,`7*$SIZE_T`($sp)
- $PUSH r18,`8*$SIZE_T`($sp)
- $PUSH r19,`9*$SIZE_T`($sp)
- $PUSH r20,`10*$SIZE_T`($sp)
- $PUSH r21,`11*$SIZE_T`($sp)
- $PUSH r22,`12*$SIZE_T`($sp)
- $PUSH r23,`13*$SIZE_T`($sp)
- $PUSH r24,`14*$SIZE_T`($sp)
- $PUSH r25,`15*$SIZE_T`($sp)
+ $PUSH r20,`-12*$SIZE_T`($tj)
+ $PUSH r21,`-11*$SIZE_T`($tj)
+ $PUSH r22,`-10*$SIZE_T`($tj)
+ $PUSH r23,`-9*$SIZE_T`($tj)
+ $PUSH r24,`-8*$SIZE_T`($tj)
+ $PUSH r25,`-7*$SIZE_T`($tj)
+ $PUSH r26,`-6*$SIZE_T`($tj)
+ $PUSH r27,`-5*$SIZE_T`($tj)
+ $PUSH r28,`-4*$SIZE_T`($tj)
+ $PUSH r29,`-3*$SIZE_T`($tj)
+ $PUSH r30,`-2*$SIZE_T`($tj)
+ $PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
@@ -205,8 +212,8 @@
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
- $LD $tj,$FRAME($sp) ; tp[0]
+ addi $tp,$sp,$LOCALS
+ $LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
@@ -273,7 +280,7 @@
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
mtctr $num
.align 4
@@ -299,23 +306,27 @@
addi $j,$j,$BNSZ
bdnz- Lcopy
- $POP r14,`4*$SIZE_T`($sp)
- $POP r15,`5*$SIZE_T`($sp)
- $POP r16,`6*$SIZE_T`($sp)
- $POP r17,`7*$SIZE_T`($sp)
- $POP r18,`8*$SIZE_T`($sp)
- $POP r19,`9*$SIZE_T`($sp)
- $POP r20,`10*$SIZE_T`($sp)
- $POP r21,`11*$SIZE_T`($sp)
- $POP r22,`12*$SIZE_T`($sp)
- $POP r23,`13*$SIZE_T`($sp)
- $POP r24,`14*$SIZE_T`($sp)
- $POP r25,`15*$SIZE_T`($sp)
- $POP $sp,0($sp)
+ $POP $tj,0($sp)
li r3,1
+ $POP r20,`-12*$SIZE_T`($tj)
+ $POP r21,`-11*$SIZE_T`($tj)
+ $POP r22,`-10*$SIZE_T`($tj)
+ $POP r23,`-9*$SIZE_T`($tj)
+ $POP r24,`-8*$SIZE_T`($tj)
+ $POP r25,`-7*$SIZE_T`($tj)
+ $POP r26,`-6*$SIZE_T`($tj)
+ $POP r27,`-5*$SIZE_T`($tj)
+ $POP r28,`-4*$SIZE_T`($tj)
+ $POP r29,`-3*$SIZE_T`($tj)
+ $POP r30,`-2*$SIZE_T`($tj)
+ $POP r31,`-1*$SIZE_T`($tj)
+ mr $sp,$tj
blr
.long 0
-.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by "
+ .byte 0,12,4,0,0x80,12,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by "
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
index 37c65d3..1249ce2 100644
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@@ -389,7 +389,9 @@
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -814,8 +816,9 @@
blr
-
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -949,7 +952,7 @@
addze r11,r0
#mul_add_c(a[3],b[2],c3,c1,c2);
$LD r6,`3*$BNSZ`(r4)
- $LD r7,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
@@ -966,7 +969,9 @@
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1502,7 +1507,9 @@
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1550,8 +1557,9 @@
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1594,7 +1602,9 @@
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1707,7 +1717,9 @@
Lppcasm_div9:
or r3,r8,r0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1746,8 +1758,9 @@
bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1850,7 +1863,9 @@
Lppcasm_mw_OVER:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1973,7 +1988,9 @@
Lppcasm_maw_adios:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index 3449b35..a14e769 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -45,23 +45,40 @@
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is...
+# December 2009
+
+# Adapted for a 32-bit build, this module delivers a 25-120% (yes,
+# more than *twice* for longer keys) performance improvement over
+# 32-bit ppc-mont.pl on a 1.8GHz PPC970. However! This implementation
+# still relies on 64-bit integer operations, and the trouble is that
+# most PPC operating systems don't preserve the upper halves of
+# general purpose registers upon 32-bit signal delivery. They do
+# preserve them upon context switch, but not upon signalling:-( This
+# means that asynchronous signals have to be blocked upon entry to
+# this subroutine. Signal masking (and of course the complementary
+# unmasking) has quite an impact on performance, naturally larger for
+# shorter keys. It's so severe that 512-bit key performance can be as
+# low as 1/3 of the expected one. This is why on such OSes the routine
+# can be engaged only for longer key operations; see crypto/ppccap.c
+# for further details. MacOS X is an exception and doesn't require
+# signal masking, and that's where the above improvement coefficients
+# were collected. For the others the alternative would be to break the
+# dependence on the upper halves of GPRs by sticking to 32-bit integer
+# operations...
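+#
+# A sketch of the kind of masking wrapper this implies (the real
+# logic, including the OS checks, lives in crypto/ppccap.c; the
+# wrapper name and the BN_ULONG typedef here are illustrative):
+#
+#	#include <signal.h>
+#
+#	typedef unsigned long BN_ULONG;
+#	int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap,
+#	                      const BN_ULONG *bp, const BN_ULONG *np,
+#	                      const BN_ULONG *n0, int num);
+#
+#	static int mont_mul_blocked(BN_ULONG *rp, const BN_ULONG *ap,
+#	                            const BN_ULONG *bp, const BN_ULONG *np,
+#	                            const BN_ULONG *n0, int num)
+#	{
+#	    sigset_t all, saved;
+#	    int ret;
+#
+#	    sigfillset(&all);               /* block asynchronous signals */
+#	    sigprocmask(SIG_SETMASK, &all, &saved);
+#	    ret = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
+#	    sigprocmask(SIG_SETMASK, &saved, NULL);  /* restore */
+#	    return ret;
+#	}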
+
$flavour = shift;
if ($flavour =~ /32/) {
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont_ppc64";
+ $fname= "bn_mul_mont_fpu64";
$STUX= "stwux"; # store indexed and update
$PUSH= "stw";
$POP= "lwz";
- die "not implemented yet";
} elsif ($flavour =~ /64/) {
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont";
+ $fname= "bn_mul_mont_fpu64";
# same as above, but 64-bit mnemonics...
$STUX= "stdux"; # store indexed and update
@@ -76,7 +93,7 @@
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-$FRAME=($FRAME+63)&~63;
+$FRAME=64; # padded frame header
$TRANSFER=16*8;
$carry="r0";
@@ -93,16 +110,16 @@
$j="r11";
$i="r12";
# non-volatile registers
-$nap_d="r14"; # interleaved ap and np in double format
-$a0="r15"; # ap[0]
-$t0="r16"; # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22"; # interleaved ap and np in double format
+$a0="r23"; # ap[0]
+$t0="r24"; # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
# PPC offers enough register bank capacity to unroll inner loops twice
#
@@ -132,28 +149,17 @@
$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
$dota="f8"; $dotb="f9";
$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
-$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
-$T0a="f18"; $T0b="f19";
-$T1a="f20"; $T1b="f21";
-$T2a="f22"; $T2b="f23";
-$T3a="f24"; $T3b="f25";
+$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
+$T0a="f24"; $T0b="f25";
+$T1a="f26"; $T1b="f27";
+$T2a="f28"; $T2b="f29";
+$T3a="f30"; $T3b="f31";
# sp----------->+-------------------------------+
# | saved sp |
# +-------------------------------+
-# | |
-# +-------------------------------+
-# | 10 saved gpr, r14-r23 |
-# . .
-# . .
-# +12*size_t +-------------------------------+
-# | 12 saved fpr, f14-f25 |
# . .
-# . .
-# +12*8 +-------------------------------+
-# | padding to 64 byte boundary |
-# . .
-# +X +-------------------------------+
+# +64 +-------------------------------+
# | 16 gpr<->fpr transfer zone |
# . .
# . .
@@ -173,6 +179,16 @@
# . .
# . .
# +-------------------------------+
+# . .
+# -12*size_t +-------------------------------+
+# | 10 saved gpr, r22-r31 |
+# . .
+# . .
+# -12*8 +-------------------------------+
+# | 12 saved fpr, f20-f31 |
+# . .
+# . .
+# +-------------------------------+
$code=<<___;
.machine "any"
@@ -181,14 +197,14 @@
.globl .$fname
.align 5
.$fname:
- cmpwi $num,4
+ cmpwi $num,`3*8/$SIZE_T`
mr $rp,r3 ; $rp is reassigned
li r3,0 ; possible "not handled" return code
bltlr-
- andi. r0,$num,1 ; $num has to be even
+ andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
bnelr-
- slwi $num,$num,3 ; num*=8
+ slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
li $i,-4096
slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
add $tp,$tp,$num ; place for tp[num+1]
@@ -196,35 +212,50 @@
subf $tp,$tp,$sp ; $sp-$tp
and $tp,$tp,$i ; minimize TLB usage
subf $tp,$sp,$tp ; $tp-$sp
+ mr $i,$sp
$STUX $sp,$sp,$tp ; alloca
- $PUSH r14,`2*$SIZE_T`($sp)
- $PUSH r15,`3*$SIZE_T`($sp)
- $PUSH r16,`4*$SIZE_T`($sp)
- $PUSH r17,`5*$SIZE_T`($sp)
- $PUSH r18,`6*$SIZE_T`($sp)
- $PUSH r19,`7*$SIZE_T`($sp)
- $PUSH r20,`8*$SIZE_T`($sp)
- $PUSH r21,`9*$SIZE_T`($sp)
- $PUSH r22,`10*$SIZE_T`($sp)
- $PUSH r23,`11*$SIZE_T`($sp)
- stfd f14,`12*$SIZE_T+0`($sp)
- stfd f15,`12*$SIZE_T+8`($sp)
- stfd f16,`12*$SIZE_T+16`($sp)
- stfd f17,`12*$SIZE_T+24`($sp)
- stfd f18,`12*$SIZE_T+32`($sp)
- stfd f19,`12*$SIZE_T+40`($sp)
- stfd f20,`12*$SIZE_T+48`($sp)
- stfd f21,`12*$SIZE_T+56`($sp)
- stfd f22,`12*$SIZE_T+64`($sp)
- stfd f23,`12*$SIZE_T+72`($sp)
- stfd f24,`12*$SIZE_T+80`($sp)
- stfd f25,`12*$SIZE_T+88`($sp)
-
+ $PUSH r22,`-12*8-10*$SIZE_T`($i)
+ $PUSH r23,`-12*8-9*$SIZE_T`($i)
+ $PUSH r24,`-12*8-8*$SIZE_T`($i)
+ $PUSH r25,`-12*8-7*$SIZE_T`($i)
+ $PUSH r26,`-12*8-6*$SIZE_T`($i)
+ $PUSH r27,`-12*8-5*$SIZE_T`($i)
+ $PUSH r28,`-12*8-4*$SIZE_T`($i)
+ $PUSH r29,`-12*8-3*$SIZE_T`($i)
+ $PUSH r30,`-12*8-2*$SIZE_T`($i)
+ $PUSH r31,`-12*8-1*$SIZE_T`($i)
+ stfd f20,`-12*8`($i)
+ stfd f21,`-11*8`($i)
+ stfd f22,`-10*8`($i)
+ stfd f23,`-9*8`($i)
+ stfd f24,`-8*8`($i)
+ stfd f25,`-7*8`($i)
+ stfd f26,`-6*8`($i)
+ stfd f27,`-5*8`($i)
+ stfd f28,`-4*8`($i)
+ stfd f29,`-3*8`($i)
+ stfd f30,`-2*8`($i)
+ stfd f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
ld $a0,0($ap) ; pull ap[0] value
ld $n0,0($n0) ; pull n0[0] value
ld $t3,0($bp) ; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+ mr $t1,$n0
+ lwz $a0,0($ap) ; pull ap[0,1] value
+ lwz $t0,4($ap)
+ lwz $n0,0($t1) ; pull n0[0,1] value
+ lwz $t1,4($t1)
+ lwz $t3,0($bp) ; bp[0,1]
+ lwz $t2,4($bp)
+ insrdi $a0,$t0,32,0
+ insrdi $n0,$t1,32,0
+ insrdi $t3,$t2,32,0
+___
+$code.=<<___;
addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
li $i,-64
add $nap_d,$tp,$num
@@ -258,6 +289,8 @@
std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
lfd $ba,`$FRAME+0`($sp)
lfd $bb,`$FRAME+8`($sp)
lfd $bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@
.align 5
L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
std $t0,`$FRAME+64`($sp)
std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@
li $i,8 ; i=1
.align 5
Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
ldx $t3,$bp,$i ; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+ add $t0,$bp,$i
+ lwz $t3,0($t0) ; bp[i,i+1]
+ lwz $t0,4($t0)
+ insrdi $t3,$t0,32,0
+___
+$code.=<<___;
ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
mulld $t7,$a0,$t3 ; ap[0]*bp[i]
@@ -761,6 +830,13 @@
stfd $T0b,`$FRAME+8`($sp)
add $t7,$t7,$carry
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
stfd $T1a,`$FRAME+16`($sp)
stfd $T1b,`$FRAME+24`($sp)
insrdi $t4,$t7,16,0 ; 64..127 bits
@@ -768,6 +844,13 @@
stfd $T2a,`$FRAME+32`($sp)
stfd $T2b,`$FRAME+40`($sp)
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
stfd $T3a,`$FRAME+48`($sp)
stfd $T3b,`$FRAME+56`($sp)
addze $carry,$carry
@@ -816,7 +899,21 @@
ld $t7,`$FRAME+72`($sp)
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
addze $carry,$carry
std $t3,-16($tp) ; tp[j-1]
@@ -835,7 +932,9 @@
subf $nap_d,$t7,$nap_d ; rewind pointer
cmpw $i,$num
blt- Louter
+___
+$code.=<<___ if ($SIZE_T==8);
subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@
stdx $i,$t4,$i
addi $i,$i,16
bdnz- Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ addi $np,$np,-4
+ addi $rp,$rp,-4
+ addi $ap,$sp,`$FRAME+$TRANSFER+4`
+ mtctr $j
+
+.align 4
+Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
+ ldu $t2,16($tp)
+ lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
+ lwz $t5,8($np)
+ lwz $t6,12($np)
+ lwzu $t7,16($np)
+ extrdi $t1,$t0,32,0
+ extrdi $t3,$t2,32,0
+ subfe $t4,$t4,$t0 ; tp[j]-np[j]
+ stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
+ subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
+ stw $t1,8($ap)
+ subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
+ stw $t2,12($ap)
+ subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
+ stwu $t3,16($ap)
+ stw $t4,4($rp)
+ stw $t5,8($rp)
+ stw $t6,12($rp)
+ stwu $t7,16($rp)
+ bdnz- Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ addi $tp,$sp,`$FRAME+$TRANSFER+4`
+ subf $rp,$num,$rp ; rewind rp
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ lwz $t0,4($ap)
+ lwz $t1,8($ap)
+ lwz $t2,12($ap)
+ lwzu $t3,16($ap)
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stw $t0,4($rp)
+ stw $t1,8($rp)
+ stw $t2,12($rp)
+ stwu $t3,16($rp)
+ std $i,8($tp) ; zap tp at once
+ stdu $i,16($tp)
+ bdnz- Lcopy
+___
- $POP r14,`2*$SIZE_T`($sp)
- $POP r15,`3*$SIZE_T`($sp)
- $POP r16,`4*$SIZE_T`($sp)
- $POP r17,`5*$SIZE_T`($sp)
- $POP r18,`6*$SIZE_T`($sp)
- $POP r19,`7*$SIZE_T`($sp)
- $POP r20,`8*$SIZE_T`($sp)
- $POP r21,`9*$SIZE_T`($sp)
- $POP r22,`10*$SIZE_T`($sp)
- $POP r23,`11*$SIZE_T`($sp)
- lfd f14,`12*$SIZE_T+0`($sp)
- lfd f15,`12*$SIZE_T+8`($sp)
- lfd f16,`12*$SIZE_T+16`($sp)
- lfd f17,`12*$SIZE_T+24`($sp)
- lfd f18,`12*$SIZE_T+32`($sp)
- lfd f19,`12*$SIZE_T+40`($sp)
- lfd f20,`12*$SIZE_T+48`($sp)
- lfd f21,`12*$SIZE_T+56`($sp)
- lfd f22,`12*$SIZE_T+64`($sp)
- lfd f23,`12*$SIZE_T+72`($sp)
- lfd f24,`12*$SIZE_T+80`($sp)
- lfd f25,`12*$SIZE_T+88`($sp)
- $POP $sp,0($sp)
+$code.=<<___;
+ $POP $i,0($sp)
li r3,1 ; signal "handled"
+ $POP r22,`-12*8-10*$SIZE_T`($i)
+ $POP r23,`-12*8-9*$SIZE_T`($i)
+ $POP r24,`-12*8-8*$SIZE_T`($i)
+ $POP r25,`-12*8-7*$SIZE_T`($i)
+ $POP r26,`-12*8-6*$SIZE_T`($i)
+ $POP r27,`-12*8-5*$SIZE_T`($i)
+ $POP r28,`-12*8-4*$SIZE_T`($i)
+ $POP r29,`-12*8-3*$SIZE_T`($i)
+ $POP r30,`-12*8-2*$SIZE_T`($i)
+ $POP r31,`-12*8-1*$SIZE_T`($i)
+ lfd f20,`-12*8`($i)
+ lfd f21,`-11*8`($i)
+ lfd f22,`-10*8`($i)
+ lfd f23,`-9*8`($i)
+ lfd f24,`-8*8`($i)
+ lfd f25,`-7*8`($i)
+ lfd f26,`-6*8`($i)
+ lfd f27,`-5*8`($i)
+ lfd f28,`-4*8`($i)
+ lfd f29,`-3*8`($i)
+ lfd f30,`-2*8`($i)
+ lfd f31,`-1*8`($i)
+ mr $sp,$i
blr
.long 0
-.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by "
+ .byte 0,12,4,0,0x8c,10,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by "
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl
new file mode 100644
index 0000000..9d18d40
--- /dev/null
+++ b/crypto/bn/asm/s390x-gf2m.pl
@@ -0,0 +1,221 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
+# the time being... gcc 4.3 appeared to generate poor code, hence
+# the effort. And indeed, the module delivers 55%-90%(*) improvement
+# on the heaviest ECDSA verify and ECDH benchmarks for 163- and
+# 571-bit key lengths on z990, 30%-55%(*) on z10, and 70%-110%(*) on
+# z196. This is for the 64-bit build. In the 32-bit "highgprs" case
+# the improvement is even higher; for example, on z990 it was measured
+# at 80%-150%. ECDSA sign is a modest 9%-12% faster. Keep in mind that
+# these coefficients are not the ones for bn_GF2m_mul_2x2 itself, as
+# not all CPU time is burnt in it...
+#
+# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
+# so that improvement coefficients can vary from one specific
+# setup to another.
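+#
+# The core is _mul_1x1 below: a 64x64->128 carry-less multiply done
+# with a 16-entry table of nibble multiples of one operand (the
+# tab[0..15] stores) plus mask-and-broadcast fixups for the three top
+# bits. A sketch of the same windowing idea, shrunk to 32x32->64 so
+# the top-bit fixups can be omitted (illustration only):
+#
+#	#include <stdint.h>
+#
+#	static uint64_t clmul32(uint32_t a, uint32_t b)
+#	{
+#	    uint64_t tab[16], r = 0;
+#	    int i;
+#
+#	    tab[0] = 0;                     /* tab[i] = GF(2) product a*i */
+#	    for (i = 1; i < 16; i++)
+#	        tab[i] = (i & 1) ? tab[i ^ 1] ^ a : tab[i >> 1] << 1;
+#	    for (i = 0; i < 32; i += 4)     /* eight nibble lookups */
+#	        r ^= tab[(b >> i) & 0xf] << i;
+#	    return r;
+#	}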
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+$rp="%r2";
+$a1="%r3";
+$a0="%r4";
+$b1="%r5";
+$b0="%r6";
+
+$ra="%r14";
+$sp="%r15";
+
+@T=("%r0","%r1");
+@i=("%r12","%r13");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
+($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@function
+.align 16
+_mul_1x1:
+ lgr $a1,$a
+ sllg $a2,$a,1
+ sllg $a4,$a,2
+ sllg $a8,$a,3
+
+ srag $lo,$a1,63 # broadcast 63rd bit
+ nihh $a1,0x1fff
+ srag @i[0],$a2,63 # broadcast 62nd bit
+ nihh $a2,0x3fff
+ srag @i[1],$a4,63 # broadcast 61st bit
+ nihh $a4,0x7fff
+ ngr $lo,$b
+ ngr @i[0],$b
+ ngr @i[1],$b
+
+ lghi @T[0],0
+ lgr $a12,$a1
+ stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
+ xgr $a12,$a2
+ stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
+ lgr $a48,$a4
+ stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
+ xgr $a48,$a8
+ stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
+ xgr $a1,$a4
+
+ stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
+ xgr $a2,$a4
+ stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
+ xgr $a12,$a4
+ stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
+ xgr $a1,$a48
+ stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
+ xgr $a2,$a48
+
+ stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
+ xgr $a12,$a48
+ stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
+ xgr $a1,$a4
+ stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
+ xgr $a2,$a4
+ stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
+
+ xgr $a12,$a4
+ stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
+ srlg $hi,$lo,1
+ stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
+ sllg $lo,$lo,63
+ stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
+ srlg @T[0],@i[0],2
+ stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
+
+ lghi $mask,`0xf<<3`
+ sllg $a1,@i[0],62
+ sllg @i[0],$b,3
+ srlg @T[1],@i[1],3
+ ngr @i[0],$mask
+ sllg $a2,@i[1],61
+ srlg @i[1],$b,4-3
+ xgr $hi,@T[0]
+ ngr @i[1],$mask
+ xgr $lo,$a1
+ xgr $hi,@T[1]
+ xgr $lo,$a2
+
+ xg $lo,$stdframe(@i[0],$sp)
+ srlg @i[0],$b,8-3
+ ngr @i[0],$mask
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ srlg @i[1],$b,`($n+2)*4`-3
+ sllg @T[0],@T[1],`$n*4`
+ ngr @i[1],$mask
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+___
+ push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ sllg @T[0],@T[1],`$n*4`
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+
+ lg @T[0],$stdframe(@i[0],$sp)
+ sllg @T[1],@T[0],`($n+1)*4`
+ srlg @T[0],@T[0],`64-($n+1)*4`
+ xgr $lo,@T[1]
+ xgr $hi,@T[0]
+
+ br $ra
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@function
+.align 16
+bn_GF2m_mul_2x2:
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
+
+ lghi %r1,-$stdframe-128
+ la %r0,0($sp)
+ la $sp,0(%r1,$sp) # alloca
+ st${g} %r0,0($sp) # back chain
+___
+if ($SIZE_T==8) {
+my @r=map("%r$_",(6..9));
+$code.=<<___;
+ bras $ra,_mul_1x1 # a1·b1
+ stmg $lo,$hi,16($rp)
+
+ lg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # a0·b0
+ stmg $lo,$hi,0($rp)
+
+ lg $a,`$stdframe+128+3*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+5*$SIZE_T`($sp)
+ xg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ xg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
+ lmg @r[0],@r[3],0($rp)
+
+ xgr $lo,$hi
+ xgr $hi,@r[1]
+ xgr $lo,@r[0]
+ xgr $hi,@r[2]
+ xgr $lo,@r[3]
+ xgr $hi,@r[3]
+ xgr $lo,$hi
+ stg $hi,16($rp)
+ stg $lo,8($rp)
+___
+} else {
+$code.=<<___;
+ sllg %r3,%r3,32
+ sllg %r5,%r5,32
+ or %r3,%r4
+ or %r5,%r6
+ bras $ra,_mul_1x1
+ rllg $lo,$lo,32
+ rllg $hi,$hi,32
+ stmg $lo,$hi,0($rp)
+___
+}
+$code.=<<___;
+ lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
+ br $ra
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by "
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
index f61246f..9fd64e8 100644
--- a/crypto/bn/asm/s390x-mont.pl
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -32,6 +32,33 @@
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to
+# use 64-bit instructions and achieve "64-bit" performance even in a
+# 31-bit legacy application context. The feature is not specific to
+# any particular processor, as long as it's a "z-CPU". The latter
+# implies that the code remains z/Architecture specific. Compatibility
+# with 32-bit BN_ULONG is achieved by swapping words after 64-bit
+# loads; follow the _dswap-s. On z990 it was measured to perform
+# 2.6-2.2 times better than compiler-generated code, less so for
+# longer keys...
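+#
+# A C model of the word swap (the _dswap macro is expanded to
+# "rllg reg,reg,32" at the bottom of this file for 31-bit builds and
+# dropped for 64-bit ones); sketch only:
+#
+#	#include <stdint.h>
+#
+#	/* two adjacent 32-bit BN_ULONGs picked up by one 64-bit load
+#	   land in the wrong halves; rotating by 32 puts them right */
+#	static inline uint64_t dswap(uint64_t x)
+#	{
+#	    return (x << 32) | (x >> 32);
+#	}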
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
$mn0="%r0";
$num="%r1";
@@ -60,34 +87,44 @@
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
- lgf $num,164($sp) # pull $num
- sla $num,3 # $num to enumerate bytes
+ lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
+ sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
- stg %r2,16($sp)
+ st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ tmll $num,4
+ bnzr %r14 # if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
+___
+$code.=<<___;
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
- stmg %r3,%r15,24($sp)
-
- lghi $rp,-160-8 # leave room for carry bit
+ lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
- stg %r0,0($sp) # back chain
+ st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
+ _dswap $n0
lg $bi,0($bp)
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
@@ -95,6 +132,7 @@
msgr $mn0,$n0
lg $nlo,0($np) #
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -106,12 +144,14 @@
.align 16
.L1st:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -119,22 +159,24 @@
algr $nlo,$alo
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
- alg $alo,160($sp) # +=tp[0]
+ alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
@@ -142,6 +184,7 @@
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -153,14 +196,16 @@
.align 16
.Linner:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
- alg $alo,160($j,$sp)# +=tp[j]
+ alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -168,31 +213,33 @@
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
- alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
+ alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
- clg $bp,160+8+32($j,$sp) # compare to &bp[num]
+ cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
- lg $rp,160+8+16($j,$sp) # reincarnate rp
- la $ap,160($sp)
+ l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
+ la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
- slbg $alo,0($j,$np)
+ lg $nlo,0($j,$np)
+ _dswap $nlo
+ slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
@@ -207,19 +254,24 @@
la $j,0(%r0)
lgr $count,$num
-.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
- stg $j,160($j,$sp) # zap tp
+.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
+ _dswap $alo
+ stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
- la %r1,160+8+48($j,$sp)
- lmg %r6,%r15,0(%r1)
+ la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+ lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by "
___
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+ print $_,"\n";
+}
close STDOUT;
diff --git a/crypto/bn/asm/x86-gf2m.S b/crypto/bn/asm/x86-gf2m.S
new file mode 100644
index 0000000..9403a5a
--- /dev/null
+++ b/crypto/bn/asm/x86-gf2m.S
@@ -0,0 +1,335 @@
+.file "crypto/bn/asm/x86-gf2m.s"
+.text
+.type _mul_1x1_mmx,@function
+.align 16
+_mul_1x1_mmx:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ andl $1073741823,%ecx
+ leal (%edx,%edx,1),%ebp
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movd %eax,%mm2
+ movd %ebx,%mm3
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ pxor %mm5,%mm5
+ pxor %mm4,%mm4
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ pcmpgtd %mm2,%mm5
+ paddd %mm2,%mm2
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ pand %mm3,%mm5
+ pcmpgtd %mm2,%mm4
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ psllq $31,%mm5
+ pand %mm3,%mm4
+ movl %edx,24(%esp)
+ movl $7,%esi
+ movl %ebp,28(%esp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl %ebp,%edi
+ psllq $30,%mm4
+ andl %ebx,%edi
+ shrl $3,%ebx
+ movd (%esp,%esi,4),%mm0
+ movl %ebp,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $3,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $6,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $9,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $12,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $15,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $18,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $21,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $24,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ pxor %mm4,%mm0
+ psllq $27,%mm2
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ pxor %mm5,%mm0
+ psllq $30,%mm1
+ addl $36,%esp
+ pxor %mm1,%mm0
+ ret
+.size _mul_1x1_mmx,.-_mul_1x1_mmx
+.type _mul_1x1_ialu,@function
+.align 16
+_mul_1x1_ialu:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ leal (,%eax,4),%ebp
+ andl $1073741823,%ecx
+ leal (%eax,%eax,1),%edi
+ sarl $31,%eax
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ sarl $31,%edi
+ andl %ebx,%eax
+ movl %edx,24(%esp)
+ andl %ebx,%edi
+ movl %ebp,28(%esp)
+ movl %eax,%edx
+ shll $31,%eax
+ movl %edi,%ecx
+ shrl $1,%edx
+ movl $7,%esi
+ shll $30,%edi
+ andl %ebx,%esi
+ shrl $2,%ecx
+ xorl %edi,%eax
+ shrl $3,%ebx
+ movl $7,%edi
+ andl %ebx,%edi
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ xorl (%esp,%esi,4),%eax
+ movl $7,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $3,%ebp
+ andl %ebx,%edi
+ shrl $29,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $6,%ecx
+ andl %ebx,%esi
+ shrl $26,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $9,%ebp
+ andl %ebx,%edi
+ shrl $23,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $12,%ecx
+ andl %ebx,%esi
+ shrl $20,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $15,%ebp
+ andl %ebx,%edi
+ shrl $17,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $18,%ecx
+ andl %ebx,%esi
+ shrl $14,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $21,%ebp
+ andl %ebx,%edi
+ shrl $11,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $24,%ecx
+ andl %ebx,%esi
+ shrl $8,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl %ebp,%ecx
+ shll $27,%ebp
+ movl (%esp,%esi,4),%edi
+ shrl $5,%ecx
+ movl %edi,%esi
+ xorl %ebp,%eax
+ shll $30,%edi
+ xorl %ecx,%edx
+ shrl $2,%esi
+ xorl %edi,%eax
+ xorl %esi,%edx
+ addl $36,%esp
+ ret
+.size _mul_1x1_ialu,.-_mul_1x1_ialu
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+.L_bn_GF2m_mul_2x2_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl OPENSSL_ia32cap_P@GOT(%edx),%edx
+ movl (%edx),%eax
+ movl 4(%edx),%edx
+ testl $8388608,%eax
+ jz .L001ialu
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm7
+ movl 28(%esp),%eax
+ movl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm6
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ xorl 28(%esp),%eax
+ xorl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ pxor %mm7,%mm0
+ movl 20(%esp),%eax
+ pxor %mm6,%mm0
+ movq %mm0,%mm2
+ psllq $32,%mm0
+ popl %edi
+ psrlq $32,%mm2
+ popl %esi
+ pxor %mm6,%mm0
+ popl %ebx
+ pxor %mm7,%mm2
+ movq %mm0,(%eax)
+ popl %ebp
+ movq %mm2,8(%eax)
+ emms
+ ret
+.align 16
+.L001ialu:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $20,%esp
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,8(%esp)
+ movl %edx,12(%esp)
+ movl 48(%esp),%eax
+ movl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,(%esp)
+ movl %edx,4(%esp)
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ xorl 48(%esp),%eax
+ xorl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl 40(%esp),%ebp
+ movl (%esp),%ebx
+ movl 4(%esp),%ecx
+ movl 8(%esp),%edi
+ movl 12(%esp),%esi
+ xorl %edx,%eax
+ xorl %ecx,%edx
+ xorl %ebx,%eax
+ movl %ebx,(%ebp)
+ xorl %edi,%edx
+ movl %esi,12(%ebp)
+ xorl %esi,%eax
+ addl $20,%esp
+ xorl %esi,%edx
+ popl %edi
+ xorl %edx,%eax
+ popl %esi
+ movl %edx,8(%ebp)
+ popl %ebx
+ movl %eax,4(%ebp)
+ popl %ebp
+ ret
+.size bn_GF2m_mul_2x2,.-.L_bn_GF2m_mul_2x2_begin
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105
+.byte 99,97,116,105,111,110,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.comm OPENSSL_ia32cap_P,8,4
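The two code paths above share one structure: bn_GF2m_mul_2x2 builds the
128-bit carry-less product of (a1:a0) and (b1:b0) from three 1x1 products
combined Karatsuba-style. A minimal C model of that combination follows;
mul_1x1 and gf2m_mul_2x2 are illustrative names, not the OpenSSL API
(the real entry point takes BN_ULONG arguments).

#include <stdint.h>

/* Portable 32x32 -> 64 carry-less multiply, standing in for the
 * assembly _mul_1x1 helpers (which use lookup tables/MMX instead). */
static void mul_1x1(uint32_t a, uint32_t b, uint32_t *hi, uint32_t *lo)
{
    uint64_t r = 0;
    for (int i = 0; i < 32; i++)
        if ((b >> i) & 1)
            r ^= (uint64_t)a << i;
    *hi = (uint32_t)(r >> 32);
    *lo = (uint32_t)r;
}

/* Karatsuba combination performed above: r = (a1:a0) x (b1:b0) over
 * GF(2), with r[3] the most significant word. */
static void gf2m_mul_2x2(uint32_t r[4], uint32_t a1, uint32_t a0,
                         uint32_t b1, uint32_t b0)
{
    uint32_t hh1, hh0, ll1, ll0, m1, m0;

    mul_1x1(a1, b1, &hh1, &hh0);              /* a1.b1 */
    mul_1x1(a0, b0, &ll1, &ll0);              /* a0.b0 */
    mul_1x1(a0 ^ a1, b0 ^ b1, &m1, &m0);      /* (a0+a1).(b0+b1) */

    /* addition in GF(2) is XOR, so the middle term is
     * (a0+a1)(b0+b1) + a1.b1 + a0.b0 */
    m0 ^= hh0 ^ ll0;
    m1 ^= hh1 ^ ll1;

    r[0] = ll0;
    r[1] = ll1 ^ m0;
    r[2] = hh0 ^ m1;
    r[3] = hh1;
}

The tangle of xorl instructions before the final stores in the ialu path
is exactly this folding, interleaved with the register pops.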
diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl
new file mode 100644
index 0000000..b579530
--- /dev/null
+++ b/crypto/bn/asm/x86-gf2m.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging, mechanical port from C for
+# the time being... Except that it has three code paths: pure integer
+# code suitable for any x86 CPU, MMX code suitable for PIII and later
+# and PCLMULQDQ suitable for Westmere and later. Improvement varies
+# from one benchmark and µ-arch to another. Below are interval values
+# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
+# code:
+#
+# PIII 16%-30%
+# P4 12%-12%
+# Opteron 18%-40%
+# Core2 19%-44%
+# Atom 38%-64%
+# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
+# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
+#
+# Note that the improvement coefficients above are not coefficients
+# for bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is
+# the result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster,
+# the benchmark is more and more dominated by other subroutines, most
+# notably by BN_GF2m_mod[_mul]_arr...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+$a="eax";
+$b="ebx";
+($a1,$a2,$a4)=("ecx","edx","ebp");
+
+$R="mm0";
+@T=("mm1","mm2");
+($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
+@i=("esi","edi");
+
+ if (!$x86only) {
+&function_begin_B("_mul_1x1_mmx");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &and ($a1,0x3fffffff);
+ &lea ($a4,&DWP(0,$a2,$a2));
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &movd ($A,$a);
+ &movd ($B,$b);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &pxor ($B31,$B31);
+ &pxor ($B30,$B30);
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &pcmpgtd($B31,$A); # broadcast 31st bit
+ &paddd ($A,$A); # $A<<=1
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &pand ($B31,$B);
+ &pcmpgtd($B30,$A); # broadcast 30th bit
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+ &psllq ($B31,31);
+ &pand ($B30,$B);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &mov (@i[0],0x7);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($a4,@i[0]);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ &mov (@i[1],$a4);
+ &psllq ($B30,30);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &movd ($R,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],$a4);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],$a4);
+ &psllq (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &pxor ($R,@T[1]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &pxor ($R,$B30);
+ &psllq (@T[1],3*$n++);
+ &pxor ($R,@T[1]);
+
+ &movd (@T[0],&DWP(0,"esp",@i[0],4));
+ &pxor ($R,$B31);
+ &psllq (@T[0],3*$n);
+ &add ("esp",32+4);
+ &pxor ($R,@T[0]);
+ &ret ();
+&function_end_B("_mul_1x1_mmx");
+ }
+
+($lo,$hi)=("eax","edx");
+@T=("ecx","ebp");
+
+&function_begin_B("_mul_1x1_ialu");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &lea ($a4,&DWP(0,"",$a,4));
+ &and ($a1,0x3fffffff);
+ &lea (@i[1],&DWP(0,$lo,$lo));
+ &sar ($lo,31); # broadcast 31st bit
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+	&sar	(@i[1],31);		# broadcast 30th bit
+ &and ($lo,$b);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &and (@i[1],$b);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($hi,$lo);
+ &shl ($lo,31);
+ &mov (@T[0],@i[1]);
+ &shr ($hi,1);
+
+ &mov (@i[0],0x7);
+ &shl (@i[1],30);
+ &and (@i[0],$b);
+ &shr (@T[0],2);
+ &xor ($lo,@i[1]);
+
+ &shr ($b,3);
+ &mov (@i[1],0x7); # 5-byte instruction!?
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+ &xor ($lo,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],0x7);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],0x7);
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr (@T[0],32-3*$n);
+ &xor ($lo,@T[1]);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &mov (@i[1],&DWP(0,"esp",@i[0],4));
+ &shr (@T[0],32-3*$n); $n++;
+ &mov (@i[0],@i[1]);
+ &xor ($lo,@T[1]);
+ &shl (@i[1],3*$n);
+ &xor ($hi,@T[0]);
+ &shr (@i[0],32-3*$n);
+ &xor ($lo,@i[1]);
+ &xor ($hi,@i[0]);
+
+ &add ("esp",32+4);
+ &ret ();
+&function_end_B("_mul_1x1_ialu");
+
+# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+&function_begin_B("bn_GF2m_mul_2x2");
+if (!$x86only) {
+ &picmeup("edx","OPENSSL_ia32cap_P");
+ &mov ("eax",&DWP(0,"edx"));
+ &mov ("edx",&DWP(4,"edx"));
+ &test ("eax",1<<23); # check MMX bit
+ &jz (&label("ialu"));
+if ($sse2) {
+ &test ("eax",1<<24); # check FXSR bit
+ &jz (&label("mmx"));
+ &test ("edx",1<<1); # check PCLMULQDQ bit
+ &jz (&label("mmx"));
+
+ &movups ("xmm0",&QWP(8,"esp"));
+ &shufps ("xmm0","xmm0",0b10110001);
+ &pclmulqdq ("xmm0","xmm0",1);
+ &mov ("eax",&DWP(4,"esp"));
+ &movups (&QWP(0,"eax"),"xmm0");
+ &ret ();
+
+&set_label("mmx",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_mmx"); # a1·b1
+ &movq ("mm7",$R);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # a0·b0
+ &movq ("mm6",$R);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
+ &pxor ($R,"mm7");
+ &mov ($a,&wparam(0));
+ &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
+
+ &movq ($A,$R);
+ &psllq ($R,32);
+ &pop ("edi");
+ &psrlq ($A,32);
+ &pop ("esi");
+ &pxor ($R,"mm6");
+ &pop ("ebx");
+ &pxor ($A,"mm7");
+ &movq (&QWP(0,$a),$R);
+ &pop ("ebp");
+ &movq (&QWP(8,$a),$A);
+ &emms ();
+ &ret ();
+&set_label("ialu",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &stack_push(4+1);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_ialu"); # a1·b1
+ &mov (&DWP(8,"esp"),$lo);
+ &mov (&DWP(12,"esp"),$hi);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # a0·b0
+ &mov (&DWP(0,"esp"),$lo);
+ &mov (&DWP(4,"esp"),$hi);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
+
+ &mov ("ebp",&wparam(0));
+ @r=("ebx","ecx","edi","esi");
+ &mov (@r[0],&DWP(0,"esp"));
+ &mov (@r[1],&DWP(4,"esp"));
+ &mov (@r[2],&DWP(8,"esp"));
+ &mov (@r[3],&DWP(12,"esp"));
+
+ &xor ($lo,$hi);
+ &xor ($hi,@r[1]);
+ &xor ($lo,@r[0]);
+ &mov (&DWP(0,"ebp"),@r[0]);
+ &xor ($hi,@r[2]);
+ &mov (&DWP(12,"ebp"),@r[3]);
+ &xor ($lo,@r[3]);
+ &stack_pop(4+1);
+ &xor ($hi,@r[3]);
+ &pop ("edi");
+ &xor ($lo,$hi);
+ &pop ("esi");
+ &mov (&DWP(8,"ebp"),$hi);
+ &pop ("ebx");
+ &mov (&DWP(4,"ebp"),$lo);
+ &pop ("ebp");
+ &ret ();
+&function_end_B("bn_GF2m_mul_2x2");
+
+&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
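The header comment describes the scheme _mul_1x1_ialu implements; in C it
is the same windowed lookup as the reference code in crypto/bn/bn_gf2m.c.
A hedged rendering (illustrative name; the assembly additionally
interleaves the table setup with the first lookups for scheduling):

#include <stdint.h>

/* tab[u] = a*u over GF(2) for every 3-bit u, with a masked to 30 bits;
 * b is consumed three bits at a time, and the two masked-off top bits
 * of a are folded back in at the end (the sar "broadcast" trick). */
static void gf2m_mul_1x1_32(uint32_t *r1, uint32_t *r0,
                            uint32_t a, uint32_t b)
{
    uint32_t top2b = a >> 30;
    uint32_t a1 = a & 0x3FFFFFFF, a2 = a1 << 1, a4 = a2 << 1;
    uint32_t tab[8] = { 0, a1, a2, a1 ^ a2,
                        a4, a1 ^ a4, a2 ^ a4, a1 ^ a2 ^ a4 };
    uint32_t lo = tab[b & 7], hi = 0;
    int n;

    for (n = 3; n <= 30; n += 3) {
        uint32_t s = tab[(b >> n) & 7];
        lo ^= s << n;          /* low half of tab[...] << n */
        hi ^= s >> (32 - n);   /* bits shifted out into the high word */
    }
    if (top2b & 1) { lo ^= b << 30; hi ^= b >> 2; }
    if (top2b & 2) { lo ^= b << 31; hi ^= b >> 1; }
    *r1 = hi;
    *r0 = lo;
}

Masking a to 30 bits keeps a<<2, and hence every table entry, within 32
bits; whatever a window's product sheds above bit 31 is recovered by the
paired right shift.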
diff --git a/crypto/bn/asm/x86-mont.S b/crypto/bn/asm/x86-mont.S
new file mode 100644
index 0000000..2bbb0e3
--- /dev/null
+++ b/crypto/bn/asm/x86-mont.S
@@ -0,0 +1,338 @@
+.file "crypto/bn/asm/x86-mont.s"
+.text
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ movl 40(%esp),%edi
+ cmpl $4,%edi
+ jl .L000just_leave
+ leal 20(%esp),%esi
+ leal 24(%esp),%edx
+ movl %esp,%ebp
+ addl $2,%edi
+ negl %edi
+ leal -32(%esp,%edi,4),%esp
+ negl %edi
+ movl %esp,%eax
+ subl %edx,%eax
+ andl $2047,%eax
+ subl %eax,%esp
+ xorl %esp,%edx
+ andl $2048,%edx
+ xorl $2048,%edx
+ subl %edx,%esp
+ andl $-64,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 16(%esi),%esi
+ movl (%esi),%esi
+ movl %eax,4(%esp)
+ movl %ebx,8(%esp)
+ movl %ecx,12(%esp)
+ movl %edx,16(%esp)
+ movl %esi,20(%esp)
+ leal -3(%edi),%ebx
+ movl %ebp,24(%esp)
+ movl 8(%esp),%esi
+ leal 1(%ebx),%ebp
+ movl 12(%esp),%edi
+ xorl %ecx,%ecx
+ movl %esi,%edx
+ andl $1,%ebp
+ subl %edi,%edx
+ leal 4(%edi,%ebx,4),%eax
+ orl %edx,%ebp
+ movl (%edi),%edi
+ jz .L001bn_sqr_mont
+ movl %eax,28(%esp)
+ movl (%esi),%eax
+ xorl %edx,%edx
+.align 16
+.L002mull:
+ movl %edx,%ebp
+ mull %edi
+ addl %eax,%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ movl (%esi,%ecx,4),%eax
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L002mull
+ movl %edx,%ebp
+ mull %edi
+ movl 20(%esp),%edi
+ addl %ebp,%eax
+ movl 16(%esp),%esi
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ movl %eax,32(%esp,%ebx,4)
+ xorl %ecx,%ecx
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ movl (%esi),%eax
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ incl %ecx
+ jmp .L0032ndmadd
+.align 16
+.L0041stmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L0041stmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ addl %eax,%ebp
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ xorl %ecx,%ecx
+ addl 36(%esp,%ebx,4),%edx
+ movl %ebp,32(%esp,%ebx,4)
+ adcl $0,%ecx
+ movl (%esi),%eax
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ movl $1,%ecx
+.align 16
+.L0032ndmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0032ndmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ xorl %eax,%eax
+ movl 12(%esp),%ecx
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ leal 4(%ecx),%ecx
+ movl %edx,32(%esp,%ebx,4)
+ cmpl 28(%esp),%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl (%ecx),%edi
+ movl 8(%esp),%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ movl (%esi),%eax
+ jmp .L0041stmadd
+.align 16
+.L001bn_sqr_mont:
+ movl %ebx,(%esp)
+ movl %ecx,12(%esp)
+ movl %edi,%eax
+ mull %edi
+ movl %eax,32(%esp)
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+ incl %ecx
+.align 16
+.L006sqr:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ leal (%ebx,%eax,2),%ebp
+ shrl $31,%eax
+ cmpl (%esp),%ecx
+ movl %eax,%ebx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L006sqr
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ leal (%ebx,%eax,2),%ebp
+ imull 32(%esp),%edi
+ shrl $31,%eax
+ movl %ebp,32(%esp,%ecx,4)
+ leal (%eax,%edx,2),%ebp
+ movl (%esi),%eax
+ shrl $31,%edx
+ movl %ebp,36(%esp,%ecx,4)
+ movl %edx,40(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl %ecx,%ebx
+ adcl $0,%edx
+ movl 4(%esi),%eax
+ movl $1,%ecx
+.align 16
+.L0073rdmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl 4(%esi,%ecx,4),%eax
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %edx,%ebp
+ mull %edi
+ addl 36(%esp,%ecx,4),%ebp
+ leal 2(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0073rdmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ movl 12(%esp),%ecx
+ xorl %eax,%eax
+ movl 8(%esp),%esi
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ movl %edx,32(%esp,%ebx,4)
+ cmpl %ebx,%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl 4(%esi,%ecx,4),%edi
+ leal 1(%ecx),%ecx
+ movl %edi,%eax
+ movl %ecx,12(%esp)
+ mull %edi
+ addl 32(%esp,%ecx,4),%eax
+ adcl $0,%edx
+ movl %eax,32(%esp,%ecx,4)
+ xorl %ebp,%ebp
+ cmpl %ebx,%ecx
+ leal 1(%ecx),%ecx
+ je .L008sqrlast
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+.align 16
+.L009sqradd:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal (%eax,%eax,1),%ebp
+ adcl $0,%edx
+ shrl $31,%eax
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%eax
+ addl %ebx,%ebp
+ adcl $0,%eax
+ cmpl (%esp),%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %eax,%ebx
+ jle .L009sqradd
+ movl %edx,%ebp
+ addl %edx,%edx
+ shrl $31,%ebp
+ addl %ebx,%edx
+ adcl $0,%ebp
+.L008sqrlast:
+ movl 20(%esp),%edi
+ movl 16(%esp),%esi
+ imull 32(%esp),%edi
+ addl 32(%esp,%ecx,4),%edx
+ movl (%esi),%eax
+ adcl $0,%ebp
+ movl %edx,32(%esp,%ecx,4)
+ movl %ebp,36(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ leal -1(%ecx),%ebx
+ adcl $0,%edx
+ movl $1,%ecx
+ movl 4(%esi),%eax
+ jmp .L0073rdmadd
+.align 16
+.L005common_tail:
+ movl 16(%esp),%ebp
+ movl 4(%esp),%edi
+ leal 32(%esp),%esi
+ movl (%esi),%eax
+ movl %ebx,%ecx
+ xorl %edx,%edx
+.align 16
+.L010sub:
+ sbbl (%ebp,%edx,4),%eax
+ movl %eax,(%edi,%edx,4)
+ decl %ecx
+ movl 4(%esi,%edx,4),%eax
+ leal 1(%edx),%edx
+ jge .L010sub
+ sbbl $0,%eax
+ andl %eax,%esi
+ notl %eax
+ movl %edi,%ebp
+ andl %eax,%ebp
+ orl %ebp,%esi
+.align 16
+.L011copy:
+ movl (%esi,%ebx,4),%eax
+ movl %eax,(%edi,%ebx,4)
+ movl %ecx,32(%esp,%ebx,4)
+ decl %ebx
+ jge .L011copy
+ movl 24(%esp),%esp
+ movl $1,%eax
+.L000just_leave:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte 111,114,103,62,0
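For orientation, what bn_mul_mont above computes is the standard
word-serial Montgomery product. A textbook CIOS model in C follows; it is
a sketch, not a transcription of the scheduled assembly (which also
carries a dedicated squaring path), and the names are illustrative.

#include <stdint.h>
#include <string.h>

/* r = a * b * 2^(-32*num) mod n, for num-word operands < n.
 * n0 = -n[0]^(-1) mod 2^32; t is a scratch array of num+2 words.
 * The assembly keeps t on an aligned stack allocation instead. */
static void mont_mul(uint32_t *r, const uint32_t *a, const uint32_t *b,
                     const uint32_t *n, uint32_t n0, int num, uint32_t *t)
{
    memset(t, 0, (num + 2) * sizeof(uint32_t));

    for (int i = 0; i < num; i++) {
        uint64_t acc = 0;
        for (int j = 0; j < num; j++) {            /* t += a * b[i] */
            acc = (uint64_t)a[j] * b[i] + t[j] + (uint32_t)acc;
            t[j] = (uint32_t)acc;
            acc >>= 32;
        }
        acc += t[num];
        t[num] = (uint32_t)acc;
        t[num + 1] = (uint32_t)(acc >> 32);

        uint32_t m = t[0] * n0;                    /* zero the low word */
        acc = (uint64_t)m * n[0] + t[0];
        for (int j = 1; j < num; j++) {            /* t = (t + m*n)>>32 */
            acc = (uint64_t)m * n[j] + t[j] + (acc >> 32);
            t[j - 1] = (uint32_t)acc;
        }
        acc = (acc >> 32) + t[num];
        t[num - 1] = (uint32_t)acc;
        t[num] = t[num + 1] + (uint32_t)(acc >> 32);
    }

    /* final reduction: subtract n once if t >= n; the assembly does
     * this with a masked, branch-free select (.L010sub/.L011copy) */
    uint64_t borrow = 0;
    for (int j = 0; j < num; j++) {
        uint64_t d = (uint64_t)t[j] - n[j] - borrow;
        r[j] = (uint32_t)d;
        borrow = (d >> 32) & 1;
    }
    if (borrow > t[num])           /* t < n: undo, keep t */
        memcpy(r, t, num * sizeof(uint32_t));
}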
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 5cd3cd2..e8f6b05 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -527,8 +527,10 @@
&jle (&label("sqradd"));
&mov ($carry,"edx");
- &lea ("edx",&DWP(0,$sbit,"edx",2));
+ &add ("edx","edx");
&shr ($carry,31);
+ &add ("edx",$sbit);
+ &adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
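The hunk above is a carry-propagation fix in the squaring tail: LEA can
compute sbit + 2*edx in one instruction but sets no flags, so a carry
out of that addition was silently dropped (on this path the saved sbit
accumulates earlier adc results and can exceed 1). A small C model of
the corrected sequence, with illustrative names:

#include <stdint.h>

/* Compute 2*w + sbit across a 32-bit word without losing carries. */
static void double_add(uint32_t w, uint32_t sbit,
                       uint32_t *out, uint32_t *carry)
{
    uint32_t c = w >> 31;   /* shr: bit shifted out by the doubling */
    uint32_t d = w << 1;    /* add edx,edx */
    d += sbit;              /* add edx,sbit */
    if (d < sbit)           /* adc carry,0: carry out of the addition */
        c++;
    *out = d;
    *carry = c;
}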
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index acb0b40..329946e 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -66,7 +66,7 @@
#undef sqr
/*
- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
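The constraint comment above refers to the inline-asm multiply macros in
this file. A minimal self-contained sketch of the idea (assuming GCC or
Clang on x86_64; the function name is illustrative):

#include <stdint.h>

/* 64x64 -> 128-bit multiply via MULQ. Passing the multiplicand through
 * an "m" (memory) constraint is what "favor DirectPath µ-code" means:
 * on the AMD cores in question, mul with a memory operand decodes down
 * the cheaper direct path. */
static inline void mul_64x64(uint64_t a, uint64_t b,
                             uint64_t *hi, uint64_t *lo)
{
    uint64_t h, l;
    __asm__ ("mulq %3"              /* RDX:RAX = RAX * operand 3 */
             : "=a"(l), "=d"(h)
             : "a"(a), "m"(b)
             : "cc");
    *hi = h;
    *lo = l;
}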
diff --git a/crypto/bn/asm/x86_64-gf2m.S b/crypto/bn/asm/x86_64-gf2m.S
new file mode 100644
index 0000000..ccd2ed7
--- /dev/null
+++ b/crypto/bn/asm/x86_64-gf2m.S
@@ -0,0 +1,291 @@
+.text
+
+.type _mul_1x1,@function
+.align 16
+_mul_1x1:
+ subq $128+8,%rsp
+ movq $-1,%r9
+ leaq (%rax,%rax,1),%rsi
+ shrq $3,%r9
+ leaq (,%rax,4),%rdi
+ andq %rax,%r9
+ leaq (,%rax,8),%r12
+ sarq $63,%rax
+ leaq (%r9,%r9,1),%r10
+ sarq $63,%rsi
+ leaq (,%r9,4),%r11
+ andq %rbp,%rax
+ sarq $63,%rdi
+ movq %rax,%rdx
+ shlq $63,%rax
+ andq %rbp,%rsi
+ shrq $1,%rdx
+ movq %rsi,%rcx
+ shlq $62,%rsi
+ andq %rbp,%rdi
+ shrq $2,%rcx
+ xorq %rsi,%rax
+ movq %rdi,%rbx
+ shlq $61,%rdi
+ xorq %rcx,%rdx
+ shrq $3,%rbx
+ xorq %rdi,%rax
+ xorq %rbx,%rdx
+
+ movq %r9,%r13
+ movq $0,0(%rsp)
+ xorq %r10,%r13
+ movq %r9,8(%rsp)
+ movq %r11,%r14
+ movq %r10,16(%rsp)
+ xorq %r12,%r14
+ movq %r13,24(%rsp)
+
+ xorq %r11,%r9
+ movq %r11,32(%rsp)
+ xorq %r11,%r10
+ movq %r9,40(%rsp)
+ xorq %r11,%r13
+ movq %r10,48(%rsp)
+ xorq %r14,%r9
+ movq %r13,56(%rsp)
+ xorq %r14,%r10
+
+ movq %r12,64(%rsp)
+ xorq %r14,%r13
+ movq %r9,72(%rsp)
+ xorq %r11,%r9
+ movq %r10,80(%rsp)
+ xorq %r11,%r10
+ movq %r13,88(%rsp)
+
+ xorq %r11,%r13
+ movq %r14,96(%rsp)
+ movq %r8,%rsi
+ movq %r9,104(%rsp)
+ andq %rbp,%rsi
+ movq %r10,112(%rsp)
+ shrq $4,%rbp
+ movq %r13,120(%rsp)
+ movq %r8,%rdi
+ andq %rbp,%rdi
+ shrq $4,%rbp
+
+ movq (%rsp,%rsi,8),%xmm0
+ movq %r8,%rsi
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $4,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $60,%rbx
+ xorq %rcx,%rax
+ pslldq $1,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $12,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $52,%rbx
+ xorq %rcx,%rax
+ pslldq $2,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $20,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $44,%rbx
+ xorq %rcx,%rax
+ pslldq $3,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $28,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $36,%rbx
+ xorq %rcx,%rax
+ pslldq $4,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $36,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $28,%rbx
+ xorq %rcx,%rax
+ pslldq $5,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $44,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $20,%rbx
+ xorq %rcx,%rax
+ pslldq $6,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $52,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $12,%rbx
+ xorq %rcx,%rax
+ pslldq $7,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %rcx,%rbx
+ shlq $60,%rcx
+.byte 102,72,15,126,198
+ shrq $4,%rbx
+ xorq %rcx,%rax
+ psrldq $8,%xmm0
+ xorq %rbx,%rdx
+.byte 102,72,15,126,199
+ xorq %rsi,%rax
+ xorq %rdi,%rdx
+
+ addq $128+8,%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+ movq OPENSSL_ia32cap_P(%rip),%rax
+ btq $33,%rax
+ jnc .Lvanilla_mul_2x2
+
+.byte 102,72,15,110,198
+.byte 102,72,15,110,201
+.byte 102,72,15,110,210
+.byte 102,73,15,110,216
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+.byte 102,15,58,68,193,0
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+.byte 102,15,58,68,211,0
+.byte 102,15,58,68,229,0
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4
+ movdqa %xmm4,%xmm5
+ pslldq $8,%xmm4
+ psrldq $8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ .byte 0xf3,0xc3
+
+.align 16
+.Lvanilla_mul_2x2:
+ leaq -136(%rsp),%rsp
+ movq %r14,80(%rsp)
+ movq %r13,88(%rsp)
+ movq %r12,96(%rsp)
+ movq %rbp,104(%rsp)
+ movq %rbx,112(%rsp)
+.Lbody_mul_2x2:
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+ movq %rdx,48(%rsp)
+ movq %rcx,56(%rsp)
+ movq %r8,64(%rsp)
+
+ movq $15,%r8
+ movq %rsi,%rax
+ movq %rcx,%rbp
+ call _mul_1x1
+ movq %rax,16(%rsp)
+ movq %rdx,24(%rsp)
+
+ movq 48(%rsp),%rax
+ movq 64(%rsp),%rbp
+ call _mul_1x1
+ movq %rax,0(%rsp)
+ movq %rdx,8(%rsp)
+
+ movq 40(%rsp),%rax
+ movq 56(%rsp),%rbp
+ xorq 48(%rsp),%rax
+ xorq 64(%rsp),%rbp
+ call _mul_1x1
+ movq 0(%rsp),%rbx
+ movq 8(%rsp),%rcx
+ movq 16(%rsp),%rdi
+ movq 24(%rsp),%rsi
+ movq 32(%rsp),%rbp
+
+ xorq %rdx,%rax
+ xorq %rcx,%rdx
+ xorq %rbx,%rax
+ movq %rbx,0(%rbp)
+ xorq %rdi,%rdx
+ movq %rsi,24(%rbp)
+ xorq %rsi,%rax
+ xorq %rsi,%rdx
+ xorq %rdx,%rax
+ movq %rdx,16(%rbp)
+ movq %rax,8(%rbp)
+
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbp
+ movq 112(%rsp),%rbx
+ leaq 136(%rsp),%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
new file mode 100644
index 0000000..cf9f48e
--- /dev/null
+++ b/crypto/bn/asm/x86_64-gf2m.pl
@@ -0,0 +1,389 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging, mechanical port from C for
+# the time being... Except that it has two code paths: code suitable
+# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# Vanilla code path is at most 20% faster than compiler-generated code
+# [not very impressive], while the PCLMULQDQ one is a whole 85%-160%
+# better on 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in
+# mind that these coefficients are not those for bn_GF2m_mul_2x2
+# itself, as not all CPU time is burnt in it...
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" $xlate $flavour $output";
+
+($lo,$hi)=("%rax","%rdx"); $a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@abi-omnipotent
+.align 16
+_mul_1x1:
+ sub \$128+8,%rsp
+ mov \$-1,$a1
+ lea ($a,$a),$i0
+ shr \$3,$a1
+ lea (,$a,4),$i1
+ and $a,$a1 # a1=a&0x1fffffffffffffff
+ lea (,$a,8),$a8
+ sar \$63,$a # broadcast 63rd bit
+ lea ($a1,$a1),$a2
+ sar \$63,$i0 # broadcast 62nd bit
+ lea (,$a1,4),$a4
+ and $b,$a
+	sar	\$63,$i1		# broadcast 61st bit
+ mov $a,$hi # $a is $lo
+ shl \$63,$lo
+ and $b,$i0
+ shr \$1,$hi
+ mov $i0,$t1
+ shl \$62,$i0
+ and $b,$i1
+ shr \$2,$t1
+ xor $i0,$lo
+ mov $i1,$t0
+ shl \$61,$i1
+ xor $t1,$hi
+ shr \$3,$t0
+ xor $i1,$lo
+ xor $t0,$hi
+
+ mov $a1,$a12
+ movq \$0,0(%rsp) # tab[0]=0
+ xor $a2,$a12 # a1^a2
+ mov $a1,8(%rsp) # tab[1]=a1
+ mov $a4,$a48
+ mov $a2,16(%rsp) # tab[2]=a2
+ xor $a8,$a48 # a4^a8
+ mov $a12,24(%rsp) # tab[3]=a1^a2
+
+ xor $a4,$a1
+ mov $a4,32(%rsp) # tab[4]=a4
+ xor $a4,$a2
+ mov $a1,40(%rsp) # tab[5]=a1^a4
+ xor $a4,$a12
+ mov $a2,48(%rsp) # tab[6]=a2^a4
+ xor $a48,$a1 # a1^a4^a4^a8=a1^a8
+ mov $a12,56(%rsp) # tab[7]=a1^a2^a4
+	xor	$a48,$a2		# a2^a4^a4^a8=a2^a8
+
+ mov $a8,64(%rsp) # tab[8]=a8
+ xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
+ mov $a1,72(%rsp) # tab[9]=a1^a8
+ xor $a4,$a1 # a1^a8^a4
+ mov $a2,80(%rsp) # tab[10]=a2^a8
+ xor $a4,$a2 # a2^a8^a4
+ mov $a12,88(%rsp) # tab[11]=a1^a2^a8
+
+ xor $a4,$a12 # a1^a2^a8^a4
+ mov $a48,96(%rsp) # tab[12]=a4^a8
+ mov $mask,$i0
+ mov $a1,104(%rsp) # tab[13]=a1^a4^a8
+ and $b,$i0
+ mov $a2,112(%rsp) # tab[14]=a2^a4^a8
+ shr \$4,$b
+ mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
+ mov $mask,$i1
+ and $b,$i1
+ shr \$4,$b
+
+ movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
+ mov $mask,$i0
+ and $b,$i0
+ shr \$4,$b
+___
+ for ($n=1;$n<8;$n++) {
+ $code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $mask,$i1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ and $b,$i1
+ movq (%rsp,$i0,8),$Tx
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ pslldq \$$n,$Tx
+ mov $mask,$i0
+ shr \$4,$b
+ xor $t0,$hi
+ and $b,$i0
+ shr \$4,$b
+ pxor $Tx,$R
+___
+ }
+$code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ movq $R,$i0
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ psrldq \$8,$R
+ xor $t0,$hi
+ movq $R,$i1
+ xor $i0,$lo
+ xor $i1,$hi
+
+ add \$128+8,%rsp
+ ret
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+___
+
+($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@abi-omnipotent
+.align 16
+bn_GF2m_mul_2x2:
+ mov OPENSSL_ia32cap_P(%rip),%rax
+ bt \$33,%rax
+ jnc .Lvanilla_mul_2x2
+
+ movq $a1,%xmm0
+ movq $b1,%xmm1
+ movq $a0,%xmm2
+___
+$code.=<<___ if ($win64);
+ movq 40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+ movq $b0,%xmm3
+___
+$code.=<<___;
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+ pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+ pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
+ pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ movdqa %xmm4,%xmm5
+ pslldq \$8,%xmm4
+ psrldq \$8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0($rp)
+ movdqu %xmm0,16($rp)
+ ret
+
+.align 16
+.Lvanilla_mul_2x2:
+ lea -8*17(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ mov `8*17+40`(%rsp),$b0
+ mov %rdi,8*15(%rsp)
+ mov %rsi,8*16(%rsp)
+___
+$code.=<<___;
+ mov %r14,8*10(%rsp)
+ mov %r13,8*11(%rsp)
+ mov %r12,8*12(%rsp)
+ mov %rbp,8*13(%rsp)
+ mov %rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+ mov $rp,32(%rsp) # save the arguments
+ mov $a1,40(%rsp)
+ mov $a0,48(%rsp)
+ mov $b1,56(%rsp)
+ mov $b0,64(%rsp)
+
+ mov \$0xf,$mask
+ mov $a1,$a
+ mov $b1,$b
+ call _mul_1x1 # a1·b1
+ mov $lo,16(%rsp)
+ mov $hi,24(%rsp)
+
+ mov 48(%rsp),$a
+ mov 64(%rsp),$b
+ call _mul_1x1 # a0·b0
+ mov $lo,0(%rsp)
+ mov $hi,8(%rsp)
+
+ mov 40(%rsp),$a
+ mov 56(%rsp),$b
+ xor 48(%rsp),$a
+ xor 64(%rsp),$b
+ call _mul_1x1 # (a0+a1)·(b0+b1)
+___
+ @r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+ mov 0(%rsp),@r[0]
+ mov 8(%rsp),@r[1]
+ mov 16(%rsp),@r[2]
+ mov 24(%rsp),@r[3]
+ mov 32(%rsp),%rbp
+
+ xor $hi,$lo
+ xor @r[1],$hi
+ xor @r[0],$lo
+ mov @r[0],0(%rbp)
+ xor @r[2],$hi
+ mov @r[3],24(%rbp)
+ xor @r[3],$lo
+ xor @r[3],$hi
+ xor $hi,$lo
+ mov $hi,16(%rbp)
+ mov $lo,8(%rbp)
+
+ mov 8*10(%rsp),%r14
+ mov 8*11(%rsp),%r13
+ mov 8*12(%rsp),%r12
+ mov 8*13(%rsp),%rbp
+ mov 8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+ mov 8*15(%rsp),%rdi
+ mov 8*16(%rsp),%rsi
+___
+$code.=<<___;
+ lea 8*17(%rsp),%rsp
+ ret
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody_mul_2x2(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lin_prologue
+
+ mov 8*10(%rax),%r14 # mimic epilogue
+ mov 8*11(%rax),%r13
+ mov 8*12(%rax),%r12
+ mov 8*13(%rax),%rbp
+ mov 8*14(%rax),%rbx
+ mov 8*15(%rax),%rdi
+ mov 8*16(%rax),%rsi
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+.Lin_prologue:
+ lea 8*17(%rax),%rax
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva _mul_1x1
+ .rva .Lend_mul_1x1
+ .rva .LSEH_info_1x1
+
+ .rva .Lvanilla_mul_2x2
+ .rva .Lend_mul_2x2
+ .rva .LSEH_info_2x2
+.section .xdata
+.align 8
+.LSEH_info_1x1:
+ .byte 0x01,0x07,0x02,0x00
+ .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
+.LSEH_info_2x2:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
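The _mul_1x1 table path generated above is the 64-bit analogue of the
32-bit scheme: sixteen precomputed multiples of a (masked to 61 bits),
b consumed four bits at a time, and the three masked-off top bits of a
folded in via the sign-broadcast masks. A hedged C model (illustrative
name; the asm also mirrors half of the window work into SSE2 registers,
which is omitted here):

#include <stdint.h>

static void gf2m_mul_1x1_64(uint64_t *r1, uint64_t *r0,
                            uint64_t a, uint64_t b)
{
    uint64_t top3b = a >> 61;
    uint64_t a1 = a & 0x1FFFFFFFFFFFFFFFULL;   /* mask top three bits */
    uint64_t a2 = a1 << 1, a4 = a2 << 1, a8 = a4 << 1;
    uint64_t tab[16], lo, hi = 0;
    int n;

    for (n = 0; n < 16; n++)    /* loop form of the asm's XOR chain */
        tab[n] = ((n & 1) ? a1 : 0) ^ ((n & 2) ? a2 : 0)
               ^ ((n & 4) ? a4 : 0) ^ ((n & 8) ? a8 : 0);

    lo = tab[b & 0xf];
    for (n = 4; n <= 60; n += 4) {
        uint64_t s = tab[(b >> n) & 0xf];
        lo ^= s << n;
        hi ^= s >> (64 - n);
    }
    if (top3b & 1) { lo ^= b << 61; hi ^= b >> 3; }
    if (top3b & 2) { lo ^= b << 62; hi ^= b >> 2; }
    if (top3b & 4) { lo ^= b << 63; hi ^= b >> 1; }
    *r1 = hi;
    *r0 = lo;
}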
diff --git a/crypto/bn/asm/x86_64-mont.S b/crypto/bn/asm/x86_64-mont.S
new file mode 100644
index 0000000..95e2905
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont.S
@@ -0,0 +1,1374 @@
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.type bn_sqr4x_mont,@function
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ movq %rsp,%r11
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,2),%rsp
+ andq $-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdi,32(%rsp)
+ movq %rcx,40(%rsp)
+ movq %r8,48(%rsp)
+ movq %r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,16(%rdi,%rcx,1)
+
+
+ movq 24(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 32(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ leaq 16(%rbp),%rbp
+ movq %r12,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ movq -24(%rdi,%rbp,1),%r10
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ addq -16(%rdi,%rbp,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+ xorq %r12,%r12
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ leaq 16(%rcx),%rcx
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi)
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi)
+
+ movq -8(%rsi),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ movq %rdx,%r13
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi)
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 64(%rsp,%r9,2),%rdi
+ xorq %r10,%r10
+ movq -24(%rdi,%rbp,2),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,-40(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+ movq 40(%rsp),%rsi
+ movq 48(%rsp),%r8
+ xorq %rcx,%rcx
+ movq %r9,0(%rsp)
+ subq %r9,%rcx
+ movq 64(%rsp),%r10
+ movq %r8,%r14
+ leaq 64(%rsp,%r9,2),%rax
+ leaq 64(%rsp,%r9,1),%rdi
+ movq %rax,8(%rsp)
+ leaq (%rsi,%r9,1),%rsi
+ xorq %rbp,%rbp
+
+ movq 0(%rsi,%rcx,1),%rax
+ movq 8(%rsi,%rcx,1),%r9
+ imulq %r10,%r14
+ movq %rax,%rbx
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r8,%r15
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+ imulq %r11,%r15
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq (%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 8(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ cmpq $0,%rcx
+ jne .Lsqr4x_mont_inner
+
+ subq 0(%rsp),%rcx
+ movq %r8,%r14
+
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi)
+
+ xorq %r11,%r11
+ addq (%rdi),%r10
+ adcq $0,%r11
+ movq 0(%rsi,%rcx,1),%rbx
+ addq %rbp,%r10
+ adcq $0,%r11
+
+ imulq 16(%rdi,%rcx,1),%r14
+ xorq %r12,%r12
+ movq 8(%rsi,%rcx,1),%r9
+ addq %r10,%r13
+ movq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi)
+
+ xorq %rbp,%rbp
+ addq 8(%rdi),%r12
+ adcq %rbp,%rbp
+ addq %r11,%r12
+ leaq 16(%rdi),%rdi
+ adcq $0,%rbp
+ movq %r12,-8(%rdi)
+ cmpq 8(%rsp),%rdi
+ jb .Lsqr4x_mont_outer
+
+ movq 0(%rsp),%r9
+ movq %rbp,(%rdi)
+ movq 64(%rsp,%r9,1),%rax
+ leaq 64(%rsp,%r9,1),%rbx
+ movq 40(%rsp),%rsi
+ shrq $5,%r9
+ movq 8(%rbx),%rdx
+ xorq %rbp,%rbp
+
+ movq 32(%rsp),%rdi
+ subq 0(%rsi),%rax
+ movq 16(%rbx),%r10
+ movq 24(%rbx),%r11
+ sbbq 8(%rsi),%rdx
+ leaq -1(%r9),%rcx
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ movq %rax,0(%rdi,%rbp,8)
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq 32(%rbx,%rbp,8),%rax
+ movq 40(%rbx,%rbp,8),%rdx
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+ movq %r11,24(%rdi,%rbp,8)
+ sbbq 32(%rsi,%rbp,8),%rax
+ movq 48(%rbx,%rbp,8),%r10
+ movq 56(%rbx,%rbp,8),%r11
+ sbbq 40(%rsi,%rbp,8),%rdx
+ leaq 4(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %rax,0(%rdi,%rbp,8)
+ movq 32(%rbx,%rbp,8),%rax
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+
+ sbbq $0,%rax
+ movq %r11,24(%rdi,%rbp,8)
+ xorq %rbp,%rbp
+ andq %rax,%rbx
+ notq %rax
+ movq %rdi,%rsi
+ andq %rax,%rsi
+ leaq -1(%r9),%rcx
+ orq %rsi,%rbx
+
+ pxor %xmm0,%xmm0
+ leaq 64(%rsp,%r9,8),%rsi
+ movdqu (%rbx),%xmm1
+ leaq (%rsi,%r9,8),%rsi
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,(%rsi)
+ movdqu %xmm1,(%rdi)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqu 32(%rbx,%rbp,1),%xmm1
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,96(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqa %xmm0,32(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movdqu %xmm1,32(%rdi,%rbp,1)
+ leaq 32(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_copy
+
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movq 56(%rsp),%rsi
+ movq $1,%rax
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
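One detail worth calling out in the .Lsub/.Lcopy tails above (present in
all three Montgomery variants in this file): the final "subtract n if
t >= n" is resolved with an all-ones/all-zeros mask rather than a
branch, so the choice never becomes a data-dependent jump. A C model of
that select, under the same pointer-punning assumption the assembly
makes (names illustrative):

#include <stddef.h>
#include <stdint.h>

/* After the trial subtraction wrote t - n into rp, pick the source to
 * copy from (tp if the subtraction borrowed, rp otherwise) without
 * branching, and zero the temporary vector in the same pass. */
static void masked_copy(uint64_t *rp, uint64_t *tp,
                        uint64_t borrow, size_t num)
{
    uintptr_t mask = (uintptr_t)0 - (uintptr_t)(borrow & 1); /* 0 or ~0 */
    const uint64_t *src =
        (const uint64_t *)(((uintptr_t)tp & mask) |
                           ((uintptr_t)rp & ~mask));

    for (size_t i = 0; i < num; i++) {
        rp[i] = src[i];   /* rp[i] = borrow ? tp[i] : rp[i] */
        tp[i] = 0;        /* zap the scratch words */
    }
}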
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 3b7a6f2..17fb94c 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -15,6 +15,20 @@
# respectful 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but on average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such a manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -26,7 +40,8 @@
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
@@ -37,7 +52,6 @@
$num="%r9"; # int num);
$lo0="%r10";
$hi0="%r11";
-$bp="%r12"; # reassign $bp
$hi1="%r13";
$i="%r14";
$j="%r15";
@@ -51,6 +65,16 @@
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ cmp $ap,$bp
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
push %rbx
push %rbp
push %r12
@@ -66,48 +90,66 @@
and \$-1024,%rsp # minimize TLB usage
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lprologue:
- mov %rdx,$bp # $bp reassigned, remember?
-
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
- mov ($bp),$m0 # m0=bp[0]
- mov ($ap),%rax
+ mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
- mov %rdx,$hi0
+ mov ($np),%rax
- imulq $n0,%rax # "tp[0]"*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
- mulq ($np) # np[0]*m1
- add $lo0,%rax # discarded
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
.L1st:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[0]
- add $hi0,%rax
adc \$0,%rdx
- mov %rax,$lo0
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
mov ($np,$j,8),%rax
- mov %rdx,$hi0
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
+ cmp $num,$j
+ jne .L1st
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
- cmp $num,$j
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .L1st
+ mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
@@ -116,50 +158,64 @@
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
-.align 4
+ jmp .Louter
+.align 16
.Louter:
- xor $j,$j # j=0
-
mov ($bp,$i,8),$m0 # m0=bp[i]
- mov ($ap),%rax # ap[0]
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
- add (%rsp),%rax # ap[0]*bp[i]+tp[0]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
adc \$0,%rdx
- mov %rax,$lo0
- mov %rdx,$hi0
- imulq $n0,%rax # tp[0]*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
- mulq ($np,$j,8) # np[0]*m1
- add $lo0,%rax # discarded
- mov 8(%rsp),$lo0 # tp[1]
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
-.align 4
+ jmp .Linner_enter
+
+.align 16
.Linner:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[i]
- add $hi0,%rax
adc \$0,%rdx
- add %rax,$lo0 # ap[j]*bp[i]+tp[j]
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
- adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
+ cmp $num,$j
+ jne .Linner
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
- cmp $num,$j
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .Linner
xor %rdx,%rdx
add $hi0,$hi1
@@ -173,35 +229,449 @@
cmp $num,$i
jl .Louter
- lea (%rsp),$ap # borrow ap for tp
- lea -1($num),$j # j=num-1
-
- mov ($ap),%rax # tp[0]
xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
- dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- jge .Lsub
+ dec $j # doesn't affect CF!
+ jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
- lea -1($num),$j
+ mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+___
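
The .Lsub/.Lcopy tail above subtracts the modulus and then picks the copy source without branching on secret data: the and/not/or sequence builds an all-ones or all-zeros mask from the final borrow. The same select as a sketch (assuming borrow is 0 or 1):

    MASK64 = (1 << 64) - 1

    def select_ptr(tp, rp, borrow):
        mask = (-borrow) & MASK64                    # all-ones iff subtraction borrowed
        return (tp & mask) | (rp & ~mask & MASK64)   # ap = borrow ? tp : rp
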
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 4($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
mov ($ap,$j,8),%rax
- mov %rax,($rp,$j,8) # rp[i]=tp[i]
- mov $i,(%rsp,$j,8) # zap temporary vector
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=4
+ adc \$0,%rdx
+ mov $N[1],(%rsp) # tp[j-1]
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
dec $j
- jge .Lcopy
+ jnz .Lcopy4x
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
@@ -211,9 +681,823 @@
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
ret
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+{{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi"; # const BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # not used
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 4 and
+ # not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
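
The squaring code generated below follows the classic split spelled out in its comments: accumulate all cross products a[i]*a[j] with i<j, double that partial result, then fold in the a[i]*a[i] diagonal, which cuts the multiplication count roughly in half. The underlying identity as a sketch (whole-number Python; Montgomery reduction comes later):

    def square_words(a):
        # a is a little-endian list of 64-bit words
        cross = 0
        for i in range(len(a)):
            for j in range(i + 1, len(a)):
                cross += (a[i] * a[j]) << (64 * (i + j))
        diag = sum((w * w) << (128 * i) for i, w in enumerate(a))
        return (cross << 1) + diag    # == (the full integer a) squared
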
+$code.=<<___;
+.type bn_sqr4x_mont,\@function,6
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ shl \$3,${num}d # convert $num to bytes
+ xor %r10,%r10
+ mov %rsp,%r11 # put aside %rsp
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
+ and \$-1024,%rsp # minimize TLB usage
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved $rptr
+ # +40 saved $nptr
+ # +48 saved *n0
+ # +56 saved %rsp
+ # +64 t[2*$num]
+ #
+ mov $rptr,32(%rsp) # save $rptr
+ mov $nptr,40(%rsp)
+ mov $n0, 48(%rsp)
+ mov %r11, 56(%rsp) # save original %rsp
+.Lsqr4x_body:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[5]
+
+ mov 16($aptr,$j),$ai # a[6]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1] # a[5]*a[3]+t[6]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[6]*a[2]
+ add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],16($tptr,$j) # t[6]
+
+
+ mov 24($aptr,$j),$ai # a[7]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[6]*a[5]
+ add %rax,$A1[0] # a[6]*a[5]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 32($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[7]*a[4]
+ add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[7]*a[5]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[8]
+ lea 16($i),$i
+ mov $A1[0],8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mov -24($tptr,$i),$A0[0] # t[1]
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+ xor $A1[0],$A1[0]
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[3]
+
+ lea 16($j),$j
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j # j++
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr) # t[1]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr) # t[2]
+
+ mov -8($aptr),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[4]
+ mov $A1[0],8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 64(%rsp,$num,2),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov -24($tptr,$i,2),$A0[1] # t[1]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+ lea 16($i),$i
+ mov $S[3],-40($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ mov $S[3],-8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],0($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr,$i,2)
+ adc %rdx,$S[3]
+ mov $S[3],24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ add \$32,$i
+ jnz .Lsqr4x_shift_n_add
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ adc %rax,$S[2]
+ adc %rdx,$S[3]
+ mov $S[2],-16($tptr)
+ mov $S[3],-8($tptr)
+___
+}
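
The shift_n_add pass above performs the doubling word by word: each 64-bit word shifts left once, the bit shifted out (t[2*i+1]>>63 in the comments) becomes the incoming bit of the next word, and the a[i]*a[i] products are folded in with the saved carry. Just the word-wise doubling, as a sketch:

    def shl1_words(words):
        out, shift = [], 0
        for w in words:                           # little-endian 64-bit words
            out.append(((w << 1) | shift) & ((1 << 64) - 1))
            shift = w >> 63                       # carried into the next word
        return out, shift                         # final bit shifted out
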
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
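
Word-by-word reduction of the 2*num-word square proceeds exactly as in the multiplication path, only with nothing left to multiply in. As a sketch, with n0 again assumed to be -np^-1 mod 2^64:

    def mont_reduce(t, n, n0, num):
        W = 1 << 64
        for _ in range(num):
            m0 = ((t & (W - 1)) * n0) & (W - 1)   # m0 = t[0]*n0
            t = (t + m0 * n) >> 64                # low word cancels
        return t - n if t >= n else t             # == t_in * R^-1 mod n
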
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+ mov 40(%rsp),$nptr # restore $nptr
+ mov 48(%rsp),$n0 # restore *n0
+ xor $j,$j
+ mov $num,0(%rsp) # save $num
+ sub $num,$j # $j=-$num
+ mov 64(%rsp),$A0[0] # t[0] # modsched #
+ mov $n0,$m0 # # modsched #
+ lea 64(%rsp,$num,2),%rax # end of t[] buffer
+ lea 64(%rsp,$num),$tptr # end of t[] window
+ mov %rax,8(%rsp) # save end of t[] buffer
+ lea ($nptr,$num),$nptr # end of n[] buffer
+ xor $topbit,$topbit # $topbit=0
+
+ mov 0($nptr,$j),%rax # n[0] # modsched #
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
+ mov %rax,$Ni[0] # # modsched #
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xor $A0[1],$A0[1]
+ mul $m0 # n[0]*m0
+ add %rax,$A0[0] # n[0]*m0+t[0]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+ mov $n0,$m1
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[1]*m0
+ add %rax,$A0[1] # n[1]*m0+t[1]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+ imulq $A0[1],$m1
+
+ mov 16($nptr,$j),$Ni[0] # n[2]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[0]*m1
+ add %rax,$A1[0] # n[0]*m1+"t[1]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[1]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[2]*m0
+ add %rax,$A0[0] # n[2]*m0+t[2]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[3]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[1]*m1
+ add %rax,$A1[1] # n[1]*m1+"t[2]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[2]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[3]*m0
+ add %rax,$A0[1] # n[3]*m0+t[3]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ mov ($nptr,$j),$Ni[0] # n[4]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[2]*m1
+ add %rax,$A1[0] # n[2]*m1+"t[3]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr,$j) # "t[3]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[4]*m0
+ add %rax,$A0[0] # n[4]*m0+t[4]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 8($nptr,$j),$Ni[1] # n[5]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[3]*m1
+ add %rax,$A1[1] # n[3]*m1+"t[4]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr,$j) # "t[4]"
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[5]*m0
+ add %rax,$A0[1] # n[5]*m0+t[5]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+
+ mov 16($nptr,$j),$Ni[0] # n[6]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[4]*m1
+ add %rax,$A1[0] # n[4]*m1+"t[5]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[5]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[6]*m0
+ add %rax,$A0[0] # n[6]*m0+t[6]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[7]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[5]*m1
+ add %rax,$A1[1] # n[5]*m1+"t[6]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[6]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[7]*m0
+ add %rax,$A0[1] # n[7]*m0+t[7]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ cmp \$0,$j
+ jne .Lsqr4x_mont_inner
+
+ sub 0(%rsp),$j # $j=-$num # modsched #
+ mov $n0,$m0 # # modsched #
+
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[6]*m1
+ add %rax,$A1[0] # n[6]*m1+"t[7]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr) # "t[7]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr),$A0[0] # +t[8]
+ adc \$0,$A0[1]
+ mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
+ add $topbit,$A0[0]
+ adc \$0,$A0[1]
+
+ imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
+ xor $A1[0],$A1[0]
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ add $A0[0],$A1[1]
+ mov 16($tptr,$j),$A0[0] # t[0] # modsched #
+ adc \$0,$A1[0]
+ mul $m1 # n[7]*m1
+ add %rax,$A1[1] # n[7]*m1+"t[8]"
+ mov $Ni[0],%rax # # modsched #
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr) # "t[8]"
+
+ xor $topbit,$topbit
+ add 8($tptr),$A1[0] # +t[9]
+ adc $topbit,$topbit
+ add $A0[1],$A1[0]
+ lea 16($tptr),$tptr # "t[$num]>>128"
+ adc \$0,$topbit
+ mov $A1[0],-8($tptr) # "t[9]"
+ cmp 8(%rsp),$tptr # are we done?
+ jb .Lsqr4x_mont_outer
+
+ mov 0(%rsp),$num # restore $num
+ mov $topbit,($tptr) # save $topbit
+___
+}
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+ mov 64(%rsp,$num),@ri[0] # tp[0]
+ lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
+ mov 40(%rsp),$nptr # restore $nptr
+ shr \$5,$num # num/4
+ mov 8($tptr),@ri[1] # t[1]
+ xor $i,$i # i=0 and clear CF!
+
+ mov 32(%rsp),$rptr # restore $rptr
+ sub 0($nptr),@ri[0]
+ mov 16($tptr),@ri[2] # t[2]
+ mov 24($tptr),@ri[3] # t[3]
+ sbb 8($nptr),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($nptr,$i,8),@ri[2]
+ mov 32($tptr,$i,8),@ri[0] # tp[i+1]
+ mov 40($tptr,$i,8),@ri[1]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($nptr,$i,8),@ri[0]
+ mov 48($tptr,$i,8),@ri[2]
+ mov 56($tptr,$i,8),@ri[3]
+ sbb 40($nptr,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsqr4x_sub
+
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($tptr,$i,8),@ri[0] # load overflow bit
+ sbb 16($nptr,$i,8),@ri[2]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$tptr
+ not @ri[0]
+ mov $rptr,$nptr
+ and @ri[0],$nptr
+ lea -1($num),$j
+ or $nptr,$tptr # tp=borrow?tp:rp
+
+ pxor %xmm0,%xmm0
+ lea 64(%rsp,$num,8),$nptr
+ movdqu ($tptr),%xmm1
+ lea ($nptr,$num,8),$nptr
+ movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
+ movdqa %xmm0,($nptr) # zap upper half of temporary vector
+ movdqu %xmm1,($rptr)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy: # copy or in-place refresh
+ movdqu 16($tptr,$i),%xmm2
+ movdqu 32($tptr,$i),%xmm1
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+ movdqu %xmm1,32($rptr,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lsqr4x_copy
+
+ movdqu 16($tptr,$i),%xmm2
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+ mov 56(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ ret
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by "
.align 16
___
@@ -228,9 +1512,9 @@
$code.=<<___;
.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
+.type mul_handler,\@abi-omnipotent
.align 16
-se_handler:
+mul_handler:
push %rsi
push %rdi
push %rbx
@@ -245,15 +1529,20 @@
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lprologue(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lin_prologue
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
 mov 152($context),%rax # pull context->Rsp
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lin_prologue
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
@@ -272,7 +1561,53 @@
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
-.Lin_prologue:
+ jmp .Lcommon_seh_tail
+.size mul_handler,.-mul_handler
+
+.type sqr_handler,\@abi-omnipotent
+.align 16
+sqr_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lsqr4x_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lsqr_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lsqr4x_epilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
+ jae .Lcommon_seh_tail
+
+ mov 56(%rax),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
@@ -310,7 +1645,7 @@
pop %rdi
pop %rsi
ret
-.size se_handler,.-se_handler
+.size sqr_handler,.-sqr_handler
.section .pdata
.align 4
@@ -318,11 +1653,27 @@
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
+ .rva .LSEH_begin_bn_mul4x_mont
+ .rva .LSEH_end_bn_mul4x_mont
+ .rva .LSEH_info_bn_mul4x_mont
+
+ .rva .LSEH_begin_bn_sqr4x_mont
+ .rva .LSEH_end_bn_sqr4x_mont
+ .rva .LSEH_info_bn_sqr4x_mont
+
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.byte 9,0,0,0
- .rva se_handler
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
___
}
diff --git a/crypto/bn/asm/x86_64-mont5.S b/crypto/bn/asm/x86_64-mont5.S
new file mode 100644
index 0000000..49ec6ac
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont5.S
@@ -0,0 +1,784 @@
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 2(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type bn_mul4x_mont_gather5,@function
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 4(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ xorq %r15,%r15
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-40(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+ movq %rdi,-16(%rsp,%r15,8)
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+ cmpq $0,%rsi
+ jz .Lscatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subq $1,%rsi
+ jnz .Lscatter
+.Lscatter_epilogue:
+ .byte 0xf3,0xc3
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,@function
+.align 16
+bn_gather5:
+ movq %rcx,%r11
+ shrq $3,%rcx
+ andq $7,%r11
+ notq %rcx
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%rcx
+ leaq 96(%rdx,%r11,8),%rdx
+ movq 0(%rax,%rcx,8),%xmm4
+ movq 8(%rax,%rcx,8),%xmm5
+ movq 16(%rax,%rcx,8),%xmm6
+ movq 24(%rax,%rcx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq -96(%rdx),%xmm0
+ movq -32(%rdx),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%rdx),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%rdx),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%rdx),%rdx
+ por %xmm3,%xmm0
+
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subq $1,%rsi
+ jnz .Lgather
+ .byte 0xf3,0xc3
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+.align 64
+.Lmagic_masks:
+.long 0,0, 0,0, 0,0, -1,-1
+.long 0,0, 0,0, 0,0, 0,0
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
new file mode 100755
index 0000000..8f8dc5a
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,1071 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to powers table computed in BN_mod_exp_mont_consttime.
+# In addition subroutine that scatters elements of the powers table
+# is implemented, so that scatter-/gathering can be tuned without
+# bn_exp.c modifications.
+
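
The effect of the countermeasure, as a functional sketch (Python itself gives no timing guarantees; the point is that every table slot is read and the selection is done with masks, never with a secret-dependent load address):

    MASK64 = (1 << 64) - 1

    def gather(table, idx):
        r = 0
        for i, word in enumerate(table):
            m = (-int(i == idx)) & MASK64   # all-ones only at the wanted slot
            r |= word & m
        return r
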
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# int bn_mul_mont_gather5(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num,
+ # int idx); # 0 to 2^5-1, "index" in $bp holding
+ # pre-computed powers of a', interlaced
+ # in such manner that b[0] is $bp[idx],
+ # b[1] is [2^5+idx], etc.
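
So each power's words are stored strided through the table, one word every 2^5 slots, which is the layout bn_scatter5's 256-byte steps produce. As a sketch:

    def scatter5(b, table, idx):
        # table has 32*len(b) word slots; idx selects the column
        for k, word in enumerate(b):
            table[idx + 32 * k] = word    # b[k] lives at [2^5*k + idx]
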
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 2($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ movq %xmm0,$m0 # bp[1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+.align 16
+.Louter:
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ movq %xmm0,$m0 # bp[i+1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jl .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ mov $num,$j # j=num
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont_gather5,\@function,6
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 4($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ xor $j,$j # j=0
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=2
+ adc \$0,%rdx
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-40(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[i+1]
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lcopy4x
+
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl bn_scatter5
+.type bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+ cmp \$0, $num
+ jz .Lscatter_epilogue
+ lea ($tbl,$idx,8),$tbl
+.Lscatter:
+ mov ($inp),%rax
+ lea 8($inp),$inp
+ mov %rax,($tbl)
+ lea 32*8($tbl),$tbl
+ sub \$1,$num
+ jnz .Lscatter
+.Lscatter_epilogue:
+ ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,\@abi-omnipotent
+.align 16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov $idx,%r11
+ shr \$`log($N/8)/log(2)`,$idx
+ and \$`$N/8-1`,%r11
+ not $idx
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
+ lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
+ movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,$idx,8),%xmm5 # cache line contains element
+ movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,$idx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq `0*$STRIDE/4-96`($tbl),%xmm0
+ movq `1*$STRIDE/4-96`($tbl),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($tbl),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($tbl),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($tbl),$tbl
+ por %xmm3,%xmm0
+
+ movq %xmm0,($out) # m0=bp[0]
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ lea 0x28(%rsp),%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+ .long 0,0, 0,0, 0,0, -1,-1
+ .long 0,0, 0,0, 0,0, 0,0
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by "
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ lea `40+48`(%rax),%rax
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # end of alloca label
+ cmp %r10,%rbx # context->Rip<end of alloca label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
+ movaps (%rax),%xmm0
+ movaps 16(%rax),%xmm1
+ lea `40+48`(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+ movups %xmm0,512($context) # restore context->Xmm6
+ movups %xmm1,528($context) # restore context->Xmm7
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont_gather5
+ .rva .LSEH_end_bn_mul_mont_gather5
+ .rva .LSEH_info_bn_mul_mont_gather5
+
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0d,0x05,0x00
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+.align 8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
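The bn_gather5 loop generated above never branches on the secret index: it reads one quadword from each of the four cache-line-interleaved slots on every iteration and uses pand/por with the precomputed .Lmagic_masks so that only the selected entry survives. Below is a minimal C sketch of the same constant-time selection; ct_gather5 is a hypothetical helper for illustration, not part of this patch or of the OpenSSL API.

    #include <stdint.h>
    #include <stddef.h>

    /* Constant-time table read: touch every entry, keep only table[idx].
     * Mirrors the pand/por masking bn_gather5 performs with xmm
     * registers; the memory access pattern is identical for every idx,
     * so nothing leaks through the data cache. */
    static uint64_t ct_gather5(const uint64_t *table, size_t n, size_t idx)
        {
        uint64_t r = 0;
        size_t i;

        for (i = 0; i < n; i++)
            {
            uint64_t d = (uint64_t)(i ^ idx);
            /* mask is all-ones iff d == 0, i.e. iff i == idx */
            uint64_t mask = ((d | (0 - d)) >> 63) - 1;
            r |= table[i] & mask;
            }
        return r;
        }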
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index a0bc478..f34248e 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -558,6 +558,17 @@ int BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb);
int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
int do_trial_division, BN_GENCB *cb);
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
+ const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ BIGNUM *Xp1, BIGNUM *Xp2,
+ const BIGNUM *Xp,
+ const BIGNUM *e, BN_CTX *ctx,
+ BN_GENCB *cb);
+
BN_MONT_CTX *BN_MONT_CTX_new(void );
void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
@@ -612,6 +623,8 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
BN_RECP_CTX *recp, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
+
/* Functions for arithmetic over binary polynomials represented by BIGNUMs.
*
* The BIGNUM::neg property of BIGNUMs representing binary polynomials is
@@ -663,6 +676,8 @@ int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
+#endif
+
/* faster mod functions for the 'NIST primes'
* 0 <= a < p^2 */
int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
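With the new OPENSSL_NO_EC2M guard, the whole GF(2^m) API disappears from bn.h when binary-field support is configured out, so callers have to compile their binary-field paths conditionally as well. A hedged sketch of the expected call-site pattern follows; the wrapper name app_gf2m_mod is invented for illustration.

    #include <openssl/bn.h>

    /* Reduce a modulo the binary polynomial p when GF(2^m) support is
     * compiled in; fail cleanly when it was configured out. */
    int app_gf2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
        {
    #ifndef OPENSSL_NO_EC2M
        return BN_GF2m_mod(r, a, p);
    #else
        (void)r; (void)a; (void)p;
        return 0; /* binary polynomial arithmetic not available */
    #endif
        }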
diff --git a/crypto/bn/bn_blind.c b/crypto/bn/bn_blind.c
index e060592..9ed8bc2 100644
--- a/crypto/bn/bn_blind.c
+++ b/crypto/bn/bn_blind.c
@@ -126,7 +126,7 @@ struct bn_blinding_st
* used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
#endif
CRYPTO_THREADID tid;
- unsigned int counter;
+ int counter;
unsigned long flags;
BN_MONT_CTX *m_ctx;
int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
@@ -160,7 +160,10 @@ BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
- ret->counter = BN_BLINDING_COUNTER;
+ /* Set the counter to the special value -1
+ * to indicate that this is never-used fresh blinding
+ * that does not need updating before first use. */
+ ret->counter = -1;
CRYPTO_THREADID_current(&ret->tid);
return(ret);
err:
@@ -190,7 +193,10 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
goto err;
}
- if (--(b->counter) == 0 && b->e != NULL &&
+ if (b->counter == -1)
+ b->counter = 0;
+
+ if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
!(b->flags & BN_BLINDING_NO_RECREATE))
{
/* re-create blinding parameters */
@@ -205,8 +211,8 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
ret=1;
err:
- if (b->counter == 0)
- b->counter = BN_BLINDING_COUNTER;
+ if (b->counter == BN_BLINDING_COUNTER)
+ b->counter = 0;
return(ret);
}
@@ -227,6 +233,12 @@ int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
return(0);
}
+ if (b->counter == -1)
+ /* Fresh blinding, doesn't need updating. */
+ b->counter = 0;
+ else if (!BN_BLINDING_update(b,ctx))
+ return(0);
+
if (r != NULL)
{
if (!BN_copy(r, b->Ai)) ret=0;
@@ -247,22 +259,19 @@ int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ct
int ret;
bn_check_top(n);
- if ((b->A == NULL) || (b->Ai == NULL))
- {
- BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
- return(0);
- }
if (r != NULL)
ret = BN_mod_mul(n, n, r, b->mod, ctx);
else
- ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
-
- if (ret >= 0)
{
- if (!BN_BLINDING_update(b,ctx))
+ if (b->Ai == NULL)
+ {
+ BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
return(0);
+ }
+ ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
}
+
bn_check_top(n);
return(ret);
}
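The net effect of the bn_blind.c changes is a small state machine: a fresh blinding starts at the sentinel -1, is consumed once without an update, and is then updated before every subsequent use until the counter reaches BN_BLINDING_COUNTER, at which point the blinding pair is recreated. A compact sketch of that counter logic, assuming the library's BN_BLINDING_COUNTER value of 32; this is illustration, not library code.

    #define BN_BLINDING_COUNTER 32

    /* Returns nonzero when the blinding factors must be re-created on
     * this update; mirrors the counter handling in BN_BLINDING_update
     * above. */
    static int blinding_needs_recreate(int *counter)
        {
        if (*counter == -1) /* fresh, never-used blinding */
            *counter = 0;
        if (++*counter == BN_BLINDING_COUNTER)
            {
            *counter = 0;
            return 1;
            }
        return 0;
        }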
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 802a43d..7b24031 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -141,6 +141,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
*
*
*/
+#undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divl %4" \
@@ -155,6 +156,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
* Same story here, but it's 128-bit by 64-bit division. Wow!
*
*/
+# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divq %4" \
@@ -169,15 +171,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
#endif /* OPENSSL_NO_ASM */
-/* BN_div[_no_branch] computes dv := num / divisor, rounding towards
+/* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx);
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
@@ -186,6 +186,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_ULONG *resp,*wnump;
BN_ULONG d0,d1;
int num_n,div_n;
+ int no_branch=0;
/* Invalid zero-padding would have particularly bad consequences
* in the case of 'num', so don't just rely on bn_check_top() for this one
@@ -200,7 +201,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
{
- return BN_div_no_branch(dv, rm, num, divisor, ctx);
+ no_branch=1;
}
bn_check_top(dv);
@@ -214,7 +215,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
return(0);
}
- if (BN_ucmp(num,divisor) < 0)
+ if (!no_branch && BN_ucmp(num,divisor) < 0)
{
if (rm != NULL)
{ if (BN_copy(rm,num) == NULL) return(0); }
@@ -239,242 +240,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
norm_shift+=BN_BITS2;
if (!(BN_lshift(snum,num,norm_shift))) goto err;
snum->neg=0;
- div_n=sdiv->top;
- num_n=snum->top;
- loop=num_n-div_n;
- /* Lets setup a 'window' into snum
- * This is the part that corresponds to the current
- * 'area' being divided */
- wnum.neg = 0;
- wnum.d = &(snum->d[loop]);
- wnum.top = div_n;
- /* only needed when BN_ucmp messes up the values between top and max */
- wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
-
- /* Get the top 2 words of sdiv */
- /* div_n=sdiv->top; */
- d0=sdiv->d[div_n-1];
- d1=(div_n == 1)?0:sdiv->d[div_n-2];
-
- /* pointer to the 'top' of snum */
- wnump= &(snum->d[num_n-1]);
-
- /* Setup to 'res' */
- res->neg= (num->neg^divisor->neg);
- if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop;
- resp= &(res->d[loop-1]);
-
- /* space for temp */
- if (!bn_wexpand(tmp,(div_n+1))) goto err;
- if (BN_ucmp(&wnum,sdiv) >= 0)
+ if (no_branch)
{
- /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
- * bn_pollute) the const bignum arguments =>
- * clean the values between top and max again */
- bn_clear_top2max(&wnum);
- bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
- *resp=1;
- }
- else
- res->top--;
- /* if res->top == 0 then clear the neg value otherwise decrease
- * the resp pointer */
- if (res->top == 0)
- res->neg = 0;
- else
- resp--;
-
- for (i=0; i<loop-1; i++, wnump--, resp--)
- {
- BN_ULONG q,l0;
- /* the first part of the loop uses the top two words of
- * snum and sdiv to calculate a BN_ULONG q such that
- * | wnum - sdiv * q | < sdiv */
-#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
- BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
- q=bn_div_3_words(wnump,d1,d0);
-#else /* !BN_DIV3W */
- BN_ULONG n0,n1,rem=0;
-
- n0=wnump[0];
- n1=wnump[-1];
- if (n0 == d0)
- q=BN_MASK2;
- else /* n0 < d0 */
- {
-#ifdef BN_LLONG
- BN_ULLONG t2;
-
-#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
- q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
-#else
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- /*
- * rem doesn't have to be BN_ULLONG. The least we
- * know it's less that d0, isn't it?
- */
- rem=(n1-q*d0)&BN_MASK2;
-#endif
- t2=(BN_ULLONG)d1*q;
-
- for (;;)
- {
- if (t2 <= ((((BN_ULLONG)rem)< 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- rem=(n1-q*d0)&BN_MASK2;
-#endif
-
-#if defined(BN_UMULT_LOHI)
- BN_UMULT_LOHI(t2l,t2h,d1,q);
-#elif defined(BN_UMULT_HIGH)
- t2l = d1 * q;
- t2h = BN_UMULT_HIGH(d1,q);
-#else
+ /* Since we don't know whether snum is larger than sdiv,
+ * we pad snum with enough zeroes without changing its
+ * value.
+ */
+ if (snum->top <= sdiv->top+1)
{
- BN_ULONG ql, qh;
- t2l=LBITS(d1); t2h=HBITS(d1);
- ql =LBITS(q); qh =HBITS(q);
- mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+ if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+ for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+ snum->top = sdiv->top + 2;
}
-#endif
-
- for (;;)
- {
- if ((t2h < rem) ||
- ((t2h == rem) && (t2l <= wnump[-2])))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- if (t2l < d1) t2h--; t2l -= d1;
- }
-#endif /* !BN_LLONG */
- }
-#endif /* !BN_DIV3W */
-
- l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
- tmp->d[div_n]=l0;
- wnum.d--;
- /* ingore top values of the bignums just sub the two
- * BN_ULONG arrays with bn_sub_words */
- if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+ else
{
- /* Note: As we have considered only the leading
- * two BN_ULONGs in the calculation of q, sdiv * q
- * might be greater than wnum (but then (q-1) * sdiv
- * is less or equal than wnum)
- */
- q--;
- if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
- /* we can't have an overflow here (assuming
- * that q != 0, but if q == 0 then tmp is
- * zero anyway) */
- (*wnump)++;
+ if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+ snum->d[snum->top] = 0;
+ snum->top ++;
}
- /* store part of the result */
- *resp = q;
- }
- bn_correct_top(snum);
- if (rm != NULL)
- {
- /* Keep a copy of the neg flag in num because if rm==num
- * BN_rshift() will overwrite it.
- */
- int neg = num->neg;
- BN_rshift(rm,snum,norm_shift);
- if (!BN_is_zero(rm))
- rm->neg = neg;
- bn_check_top(rm);
- }
- BN_CTX_end(ctx);
- return(1);
-err:
- bn_check_top(rm);
- BN_CTX_end(ctx);
- return(0);
- }
-
-
-/* BN_div_no_branch is a special version of BN_div. It does not contain
- * branches that may leak sensitive information.
- */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx)
- {
- int norm_shift,i,loop;
- BIGNUM *tmp,wnum,*snum,*sdiv,*res;
- BN_ULONG *resp,*wnump;
- BN_ULONG d0,d1;
- int num_n,div_n;
-
- bn_check_top(dv);
- bn_check_top(rm);
- /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
- bn_check_top(divisor);
-
- if (BN_is_zero(divisor))
- {
- BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
- return(0);
- }
-
- BN_CTX_start(ctx);
- tmp=BN_CTX_get(ctx);
- snum=BN_CTX_get(ctx);
- sdiv=BN_CTX_get(ctx);
- if (dv == NULL)
- res=BN_CTX_get(ctx);
- else res=dv;
- if (sdiv == NULL || res == NULL) goto err;
-
- /* First we normalise the numbers */
- norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
- if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
- sdiv->neg=0;
- norm_shift+=BN_BITS2;
- if (!(BN_lshift(snum,num,norm_shift))) goto err;
- snum->neg=0;
-
- /* Since we don't know whether snum is larger than sdiv,
- * we pad snum with enough zeroes without changing its
- * value.
- */
- if (snum->top <= sdiv->top+1)
- {
- if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
- for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
- snum->top = sdiv->top + 2;
- }
- else
- {
- if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
- snum->d[snum->top] = 0;
- snum->top ++;
}
div_n=sdiv->top;
@@ -500,12 +284,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
/* Setup to 'res' */
res->neg= (num->neg^divisor->neg);
if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop-1;
+ res->top=loop-no_branch;
resp= &(res->d[loop-1]);
/* space for temp */
if (!bn_wexpand(tmp,(div_n+1))) goto err;
+ if (!no_branch)
+ {
+ if (BN_ucmp(&wnum,sdiv) >= 0)
+ {
+ /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
+ * bn_pollute) the const bignum arguments =>
+ * clean the values between top and max again */
+ bn_clear_top2max(&wnum);
+ bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+ *resp=1;
+ }
+ else
+ res->top--;
+ }
+
/* if res->top == 0 then clear the neg value otherwise decrease
* the resp pointer */
if (res->top == 0)
@@ -638,7 +437,7 @@ X) -> 0x%08X\n",
rm->neg = neg;
bn_check_top(rm);
}
- bn_correct_top(res);
+ if (no_branch) bn_correct_top(res);
BN_CTX_end(ctx);
return(1);
err:
@@ -646,5 +445,4 @@ X) -> 0x%08X\n",
BN_CTX_end(ctx);
return(0);
}
-
#endif
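Merging BN_div_no_branch into BN_div leaves a single long-division loop; in constant-time mode the code only skips the early BN_ucmp shortcut and pads snum so that the loop count and memory layout depend on operand widths alone, not on values. The padding rule in isolation, as a sketch over plain word arrays; ct_pad_numerator is an invented name.

    #include <string.h>

    /* Zero-pad the numerator to the width the constant-time path
     * expects: at least div_n+2 words, otherwise one extra zero word.
     * Returns the new word count, or 0 if the buffer (cap words) is
     * too small. */
    static int ct_pad_numerator(unsigned long *num, int num_n, int div_n, int cap)
        {
        int want = (num_n <= div_n + 1) ? div_n + 2 : num_n + 1;

        if (want > cap)
            return 0;
        memset(num + num_n, 0, (size_t)(want - num_n) * sizeof(*num));
        return want;
        }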
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index d9b6c73..2abf6fd 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -113,6 +113,18 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+# define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+# define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE 32
@@ -522,23 +534,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
* as cache lines are concerned. The following functions are used to transfer a BIGNUM
* from/to that table. */
-static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
{
size_t i, j;
- if (bn_wexpand(b, top) == NULL)
- return 0;
- while (b->top < top)
- {
- b->d[b->top++] = 0;
- }
-
+ if (top > b->top)
+ top = b->top; /* this works because 'buf' is explicitly zeroed */
for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
{
buf[j] = ((unsigned char*)b->d)[i];
}
- bn_correct_top(b);
return 1;
}
@@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
/* Given a pointer value, compute the next address that is a cache line multiple. */
#define MOD_EXP_CTIME_ALIGN(x_) \
- ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
/* This variant of BN_mod_exp_mont() uses fixed windows and the special
* precomputation memory layout to limit data-dependency to a minimum
@@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
- int i,bits,ret=0,idx,window,wvalue;
+ int i,bits,ret=0,window,wvalue;
int top;
- BIGNUM *r;
- const BIGNUM *aa;
BN_MONT_CTX *mont=NULL;
int numPowers;
unsigned char *powerbufFree=NULL;
int powerbufLen = 0;
unsigned char *powerbuf=NULL;
- BIGNUM *computeTemp=NULL, *am=NULL;
+ BIGNUM tmp, am;
bn_check_top(a);
bn_check_top(p);
@@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
return ret;
}
- /* Initialize BIGNUM context and allocate intermediate result */
BN_CTX_start(ctx);
- r = BN_CTX_get(ctx);
- if (r == NULL) goto err;
/* Allocate a montgomery context if it was not supplied by the caller.
* If this is not done, things will break in the montgomery part.
@@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+ if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
+#endif
/* Allocate a buffer large enough to hold all of the pre-computed
- * powers of a.
+ * powers of am, am itself and tmp.
*/
numPowers = 1 << window;
- powerbufLen = sizeof(m->d[0])*top*numPowers;
+ powerbufLen = sizeof(m->d[0])*(top*numPowers +
+ ((2*top)>numPowers?(2*top):numPowers));
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+ else
+#endif
if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
goto err;
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
memset(powerbuf, 0, powerbufLen);
- /* Initialize the intermediate result. Do this early to save double conversion,
- * once each for a^0 and intermediate result.
- */
- if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = NULL;
+#endif
- /* Initialize computeTemp as a^1 with montgomery precalcs */
- computeTemp = BN_CTX_get(ctx);
- am = BN_CTX_get(ctx);
- if (computeTemp==NULL || am==NULL) goto err;
+ /* lay down tmp and am right after powers table */
+ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+ am.d = tmp.d + top;
+ tmp.top = am.top = 0;
+ tmp.dmax = am.dmax = top;
+ tmp.neg = am.neg = 0;
+ tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+ /* prepare a^0 in Montgomery domain */
+#if 1
+ if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
+#else
+ tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
+ for (i=1;i<top;i++) tmp.d[i] = (~m->d[i])&BN_MASK2;
+ tmp.top = top;
+#endif
+ /* prepare a^1 in Montgomery domain */
if (a->neg || BN_ucmp(a,m) >= 0)
{
- if (!BN_mod(am,a,m,ctx))
- goto err;
- aa= am;
+ if (!BN_mod(&am,a,m,ctx)) goto err;
+ if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
}
- else
- aa=a;
- if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
- if (!BN_copy(computeTemp, am)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+ else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+ /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+ * specifically optimization of cache-timing attack countermeasures
+ * and pre-computation optimization. */
+
+ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+ * 512-bit RSA is hardly relevant, we omit it to spare size... */
+ if (window==5)
+ {
+ void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+ const void *table,const BN_ULONG *np,
+ const BN_ULONG *n0,int num,int power);
+ void bn_scatter5(const BN_ULONG *inp,size_t num,
+ void *table,size_t power);
+ void bn_gather5(BN_ULONG *out,size_t num,
+ void *table,size_t power);
+
+ BN_ULONG *np=mont->N.d, *n0=mont->n0;
+
+ /* BN_to_montgomery can contaminate words above .top
+ * [in BN_DEBUG[_DEBUG] build]... */
+ for (i=am.top; i<top; i++) am.d[i]=0;
+ for (i=tmp.top; i<top; i++) tmp.d[i]=0;
+
+ bn_scatter5(tmp.d,top,powerbuf,0);
+ bn_scatter5(am.d,am.top,powerbuf,1);
+ bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2);
+
+#if 0
+ for (i=3; i<32; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#else
+ /* same thing, but twice as fast */
+ for (i=4; i<32; i*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+ for (i=3; i<8; i+=2)
+ {
+ int j;
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ for (j=2*i; j<32; j*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,j);
+ }
+ }
+ for (; i<16; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2*i);
+ }
+ for (; i<32; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#endif
+
+ bits--;
+ for (wvalue=0, i=bits%5; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ bn_gather5(tmp.d,top,powerbuf,wvalue);
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
+ {
+ for (wvalue=0, i=0; i<5; i++,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
+ }
+
+ tmp.top=top;
+ bn_correct_top(&tmp);
+ }
+ else
+#endif
+ {
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err;
/* If the window size is greater than 1, then calculate
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
@@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
*/
if (window > 1)
{
- for (i=2; i<numPowers; i++)
- {
- /* Calculate a^i = a^(i-1) * a */
- if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
- goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+ for (i=2; i<numPowers; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
+ goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
 }
 }

- /* Adjust the number of bits up to a multiple of the window size. */
- bits = ((bits+window-1)/window)*window;
- idx=bits-1; /* The top bit of the window */
-
- while (idx >= 0)
+ bits--;
+ for (wvalue=0, i=bits%window; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
{
wvalue=0; /* The 'value' of the window */
/* Scan the window, squaring the result as we go */
- for (i=0; i> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
#endif
+#if !defined(OPENSSL_BN_ASM_GF2m)
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
* result is a polynomial r with degree < 2 * BN_BITS - 1
* The caller MUST ensure that the variables have the right amount
@@ -216,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c
r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */
r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */
}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif
/* Add polynomials a and b and store result in r; r could be a or b, a and b
* could be equal; r is the bitwise XOR of a and b.
@@ -360,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
{
int ret = 0;
- const int max = BN_num_bits(p) + 1;
- int *arr=NULL;
+ int arr[6];
bn_check_top(a);
bn_check_top(p);
- if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
- ret = BN_GF2m_poly2arr(p, arr, max);
- if (!ret || ret > max)
+ ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+ if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
{
BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
- goto err;
+ return 0;
}
ret = BN_GF2m_mod_arr(r, a, arr);
bn_check_top(r);
-err:
- if (arr) OPENSSL_free(arr);
return ret;
}
@@ -521,7 +522,7 @@ int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
*/
int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
- BIGNUM *b, *c, *u, *v, *tmp;
+ BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
int ret = 0;
bn_check_top(a);
@@ -529,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
BN_CTX_start(ctx);
- b = BN_CTX_get(ctx);
- c = BN_CTX_get(ctx);
- u = BN_CTX_get(ctx);
- v = BN_CTX_get(ctx);
- if (v == NULL) goto err;
+ if ((b = BN_CTX_get(ctx))==NULL) goto err;
+ if ((c = BN_CTX_get(ctx))==NULL) goto err;
+ if ((u = BN_CTX_get(ctx))==NULL) goto err;
+ if ((v = BN_CTX_get(ctx))==NULL) goto err;
- if (!BN_one(b)) goto err;
if (!BN_GF2m_mod(u, a, p)) goto err;
- if (!BN_copy(v, p)) goto err;
-
if (BN_is_zero(u)) goto err;
+ if (!BN_copy(v, p)) goto err;
+#if 0
+ if (!BN_one(b)) goto err;
+
while (1)
{
while (!BN_is_odd(u))
@@ -565,13 +566,89 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
if (!BN_GF2m_add(u, u, v)) goto err;
if (!BN_GF2m_add(b, b, c)) goto err;
}
+#else
+ {
+ int i, ubits = BN_num_bits(u),
+ vbits = BN_num_bits(v), /* v is copy of p */
+ top = p->top;
+ BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+ bn_wexpand(u,top); udp = u->d;
+ for (i=u->top;itop = top;
+ bn_wexpand(b,top); bdp = b->d;
+ bdp[0] = 1;
+ for (i=1;i<top;i++) bdp[i] = 0;
+ b->top = top;
+ bn_wexpand(c,top); cdp = c->d;
+ for (i=0;i<top;i++) cdp[i] = 0;
+ c->top = top;
+ vdp = v->d; /* It pays off to "cache" *->d pointers, because
+ * it allows optimizer to be more aggressive.
+ * But we don't have to "cache" p->d, because *p
+ * is declared 'const'... */
+ while (1)
+ {
+ while (ubits && !(udp[0]&1))
+ {
+ BN_ULONG u0,u1,b0,b1,mask;
+
+ u0 = udp[0];
+ b0 = bdp[0];
+ mask = (BN_ULONG)0-(b0&1);
+ b0 ^= p->d[0]&mask;
+ for (i=0;i>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+ u0 = u1;
+ b1 = bdp[i+1]^(p->d[i+1]&mask);
+ bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+ b0 = b1;
+ }
+ udp[i] = u0>>1;
+ bdp[i] = b0>>1;
+ ubits--;
+ }
+ if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+ if (ubits<vbits)
+ {
+ int t;
+ t = ubits; ubits = vbits; vbits = t;
+ tmp = u; u = v; v = tmp;
+ tmp = b; b = c; c = tmp;
+ udp = vdp; vdp = u->d;
+ bdp = cdp; cdp = c->d;
+ }
+ for(i=0;i<top;i++)
+ {
+ udp[i] ^= vdp[i];
+ bdp[i] ^= cdp[i];
+ }
+ if (ubits==vbits)
+ {
+ BN_ULONG ul;
+ int utop = (ubits-1)/BN_BITS2;
+
+ while ((ul=udp[utop])==0 && utop) utop--;
+ ubits = utop*BN_BITS2 + BN_num_bits_word(ul);
+ }
+ }
+ bn_correct_top(b);
+ if (!BN_copy(r, b)) goto err;
+ }
+#endif

 ret = 1;

diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
 # if defined(__DECC)
 # include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-# elif defined(__GNUC__)
+# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
@@ -247,7 +247,7 @@ extern "C" {
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
@@ -257,7 +257,7 @@ extern "C" {
# endif /* compiler */
# elif (defined(__x86_64) || defined(__x86_64__)) && \
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
@@ -280,6 +280,26 @@ extern "C" {
# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
# endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+# if defined(__GNUC__) && __GNUC__>=2
+# if __GNUC__>=4 && __GNUC_MINOR__>=4 /* "h" constraint is no more since 4.4 */
+# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
+# define BN_UMULT_LOHI(low,high,a,b) ({ \
+ __uint128_t ret=(__uint128_t)(a)*(b); \
+ (high)=ret>>64; (low)=ret; })
+# else
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ asm ("dmultu %1,%2" \
+ : "=h"(ret) \
+ : "r"(a), "r"(b) : "l"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b)\
+ asm ("dmultu %2,%3" \
+ : "=l"(low),"=h"(high) \
+ : "r"(a), "r"(b));
+# endif
+# endif
# endif /* cpu */
#endif /* OPENSSL_NO_ASM */
@@ -459,6 +479,10 @@ extern "C" {
}
#endif /* !BN_LLONG */
+#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
+#undef bn_div_words
+#endif
+
void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
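The new MIPS branch above shows the two ways bn_lcl.h can obtain the high half of a 64x64 multiply: inline asm with the old "h" constraint, or a 128-bit integer type on GCC >= 4.4. The latter form, free-standing; a sketch of the same idea as the macros, assuming a compiler that provides __uint128_t.

    #include <stdint.h>

    /* High 64 bits of a*b, as BN_UMULT_HIGH computes it. */
    static uint64_t umult_high(uint64_t a, uint64_t b)
        {
        return (uint64_t)(((__uint128_t)a * b) >> 64);
        }

    /* Both halves at once, matching BN_UMULT_LOHI. */
    static void umult_lohi(uint64_t *lo, uint64_t *hi, uint64_t a, uint64_t b)
        {
        __uint128_t r = (__uint128_t)a * b;

        *lo = (uint64_t)r;
        *hi = (uint64_t)(r >> 64);
        }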
diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index 5470fbe..7a5676d 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c
@@ -139,25 +139,6 @@ const BIGNUM *BN_value_one(void)
return(&const_one);
}
-char *BN_options(void)
- {
- static int init=0;
- static char data[16];
-
- if (!init)
- {
- init++;
-#ifdef BN_LLONG
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
-#else
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
-#endif
- }
- return(data);
- }
-
int BN_num_bits_word(BN_ULONG l)
{
static const unsigned char bits[256]={
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 1a86688..427b5cf 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -177,31 +177,26 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
- BN_ULONG *ap,*np,*rp,n0,v,*nrp;
- int al,nl,max,i,x,ri;
+ BN_ULONG *ap,*np,*rp,n0,v,carry;
+ int nl,max,i;
n= &(mont->N);
- /* mont->ri is the size of mont->N in bits (rounded up
- to the word size) */
- al=ri=mont->ri/BN_BITS2;
-
nl=n->top;
- if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+ if (nl == 0) { ret->top=0; return(1); }
- max=(nl+al+1); /* allow for overflow (no?) XXX */
+ max=(2*nl); /* carry is stored separately */
if (bn_wexpand(r,max) == NULL) return(0);
r->neg^=n->neg;
np=n->d;
rp=r->d;
- nrp= &(r->d[nl]);
/* clear the top words of T */
#if 1
for (i=r->top; id[i]=0;
+ rp[i]=0;
#else
- memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
+ memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
#endif
r->top=max;
@@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#ifdef BN_COUNT
fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
#endif
- for (i=0; i<nl; i++)
+ for (carry=0, i=0; i<nl; i++, rp++)
 {
 v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
- nrp++;
- rp++;
- if (((nrp[-1]+=v)&BN_MASK2) >= v)
- continue;
- else
- {
- if (((++nrp[0])&BN_MASK2) != 0) continue;
- if (((++nrp[1])&BN_MASK2) != 0) continue;
- for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
- }
- }
- bn_correct_top(r);
-
- /* mont->ri will be a multiple of the word size and below code
- * is kind of BN_rshift(ret,r,mont->ri) equivalent */
- if (r->top <= ri)
- {
- ret->top=0;
- return(1);
+ v = (v+carry+rp[nl])&BN_MASK2;
+ carry |= (v != rp[nl]);
+ carry &= (v <= rp[nl]);
+ rp[nl]=v;
}
- al=r->top-ri;
-#define BRANCH_FREE 1
-#if BRANCH_FREE
- if (bn_wexpand(ret,ri) == NULL) return(0);
- x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
- ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */
+ if (bn_wexpand(ret,nl) == NULL) return(0);
+ ret->top=nl;
ret->neg=r->neg;
rp=ret->d;
- ap=&(r->d[ri]);
+ ap=&(r->d[nl]);
+#define BRANCH_FREE 1
+#if BRANCH_FREE
{
- size_t m1,m2;
+ BN_ULONG *nrp;
+ size_t m;
-
- v=bn_sub_words(rp,ap,np,ri);
- /* this ----------------^^ works even in al<ri case
- * thanks to zealous zeroing of top of the vector in the
- * beginning. */
-
- /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
- /* in other words if subtraction result is real, then
+ v=bn_sub_words(rp,ap,np,nl)-carry;
+ /* if subtraction result is real, then
* trick unconditional memcpy below to perform in-place
* "refresh" instead of actual copy. */
- m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
- m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
- m1|=m2; /* (al!=ri) */
- m1|=(0-(size_t)v); /* (al!=ri || v) */
- m1&=~m2; /* (al!=ri || v) && !al>ri */
- nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1));
- }
+ m=(0-(size_t)v);
+ nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
- /* 'i<ri' is chosen to eliminate dependency on input data, even
- * though it results in redundant copy in al<ri case. */
- for (i=0,ap=nrp; i<ri; i++,ap++)
- rp[i]=*ap;
-#else
- if (bn_wexpand(ret,al) == NULL) return(0);
- ret->top=al;
- ret->neg=r->neg;
-
- rp=ret->d;
- ap=&(r->d[ri]);
- al-=4;
- for (i=0; i<al; i+=4)
- {
- BN_ULONG t1,t2,t3,t4;
-
- t1=ap[i+0];
- t2=ap[i+1];
- t3=ap[i+2];
- t4=ap[i+3];
- rp[i+0]=t1;
- rp[i+1]=t2;
- rp[i+2]=t3;
- rp[i+3]=t4;
- }
- al+=4;
- for (; i<al; i++)
- rp[i]=ap[i];
+ for (i=0,nl-=4; i<nl; i+=4)
+ {
+ BN_ULONG t1,t2,t3,t4;
+
+ t1=nrp[i+0];
+ t2=nrp[i+1];
+ t3=nrp[i+2]; ap[i+0]=0;
+ t4=nrp[i+3]; ap[i+1]=0;
+ rp[i+0]=t1; ap[i+2]=0;
+ rp[i+1]=t2; ap[i+3]=0;
+ rp[i+2]=t3;
+ rp[i+3]=t4;
+ }
+ for (nl+=4; i<nl; i++)
+ rp[i]=nrp[i], ap[i]=0;
+ }
+#else
- if (BN_ucmp(ret, &(mont->N)) >= 0)
- {
- if (!BN_usub(ret,ret,&(mont->N))) return(0);
- }
+ if (bn_sub_words (rp,ap,np,nl)-carry)
+ memcpy(rp,ap,nl*sizeof(BN_ULONG));
#endif
+ bn_correct_top(r);
+ bn_correct_top(ret);
bn_check_top(ret);
return(1);
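The rewritten BN_from_montgomery_word keeps the loop carry in a separate variable instead of rippling it through the vector with ++nrp[x]: after each bn_mul_add_words pass it folds v, the incoming carry and the top word together, then re-derives the carry from two comparisons, all without branching. The carry recovery in isolation; a sketch assuming 64-bit words.

    #include <stdint.h>

    /* Add v plus the incoming carry into *word and return the outgoing
     * carry, branch-free, as in BN_from_montgomery_word above. If the
     * sum lands below the old word it wrapped (carry 1); above it did
     * not (carry 0); if it lands exactly on the old word, v+carry was
     * 0 or 2^64 and the old carry is already the right answer. */
    static uint64_t add_carry_word(uint64_t *word, uint64_t v, uint64_t carry)
        {
        uint64_t old = *word;

        v = v + carry + old; /* may wrap mod 2^64 */
        carry |= (v != old);
        carry &= (v <= old);
        *word = v;
        return carry;
        }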
diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c
index c6de032..43caee4 100644
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c
@@ -319,6 +319,13 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
:(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
#define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
#define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
+# if defined(L_ENDIAN)
+# if defined(__arch64__)
+# define NIST_INT64 long
+# else
+# define NIST_INT64 long long
+# endif
+# endif
#else
#define bn_cp_64(to, n, from, m) \
{ \
@@ -330,13 +337,15 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
bn_32_set_0(to, (n)*2); \
bn_32_set_0(to, (n)*2+1); \
}
-#if BN_BITS2 == 32
#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
#define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0;
-#endif
+# if defined(_WIN32) && !defined(__GNUC__)
+# define NIST_INT64 __int64
+# elif defined(BN_LLONG)
+# define NIST_INT64 long long
+# endif
#endif /* BN_BITS2 != 64 */
-
#define nist_set_192(to, from, a1, a2, a3) \
{ \
bn_cp_64(to, 0, from, (a3) - 3) \
@@ -350,9 +359,11 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_192_TOP],
- buf[BN_NIST_192_TOP],
- c_d[BN_NIST_192_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_192_TOP];
+ unsigned int ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_192_TOP],
*res;
PTR_SIZE_INT mask;
static const BIGNUM _bignum_nist_p_192_sqr = {
@@ -385,15 +396,48 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[3*2-6];
+ acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[3*2-5];
+ acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32;
- nist_set_192(t_d, buf, 0, 3, 3);
+ acc += rp[2]; acc += bp[3*2-6];
+ acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[3*2-5];
+ acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[5] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_192_TOP];
+
+ nist_set_192(t_d, buf.bn, 0, 3, 3);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 4, 4, 0);
+ nist_set_192(t_d, buf.bn, 4, 4, 0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 5, 5, 5)
+ nist_set_192(t_d, buf.bn, 5, 5, 5)
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-
+ }
+#endif
if (carry > 0)
carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
else
@@ -435,8 +479,7 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_224_TOP],
- buf[BN_NIST_224_TOP],
+ BN_ULONG buf[BN_NIST_224_TOP],
c_d[BN_NIST_224_TOP],
*res;
PTR_SIZE_INT mask;
@@ -474,14 +517,54 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
#if BN_BITS2==64
/* copy upper 256 bits of 448 bit number ... */
- nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+ nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
/* ... and right shift by 32 to obtain upper 224 bits */
- nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+ nist_set_224(buf, c_d, 14, 13, 12, 11, 10, 9, 8);
/* truncate lower part to 224 bits too */
r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
#else
nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
#endif
+
+#if defined(NIST_INT64) && BN_BITS2!=64
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf;
+
+ acc = rp[0]; acc -= bp[7-7];
+ acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc -= bp[8-7];
+ acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc -= bp[9-7];
+ acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[7-7];
+ acc += bp[11-7];
+ acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32;
+
+ acc += rp[4]; acc += bp[8-7];
+ acc += bp[12-7];
+ acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[9-7];
+ acc += bp[13-7];
+ acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[10-7];
+ acc -= bp[13-7]; rp[6] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+# if BN_BITS2==64
+ rp[7] = carry;
+# endif
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_224_TOP];
+
nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
@@ -493,6 +576,8 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
#if BN_BITS2==64
carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
+#endif
+ }
#endif
u.f = bn_sub_words;
if (carry > 0)
@@ -548,9 +633,11 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *a_d = a->d, *r_d;
- BN_ULONG t_d[BN_NIST_256_TOP],
- buf[BN_NIST_256_TOP],
- c_d[BN_NIST_256_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_256_TOP];
+ unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_256_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -584,12 +671,87 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[8-8];
+ acc += bp[9-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[9-8];
+ acc += bp[10-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[10-8];
+ acc += bp[11-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[11-8];
+ acc += bp[11-8];
+ acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc -= bp[15-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc -= bp[9-8];
+ acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc -= bp[10-8];
+ acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[14-8];
+ acc += bp[13-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[8 -8];
+ acc -= bp[10-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8]; rp[7] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_256_TOP];
/*S1*/
- nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
+ nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
/*S2*/
- nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+ nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
/* left shift */
{
@@ -607,24 +769,26 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
}
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S3*/
- nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+ nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S4*/
- nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+ nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D1*/
- nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+ nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D2*/
- nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+ nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D3*/
- nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+ nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D4*/
- nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+ nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
@@ -672,9 +836,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_384_TOP],
- buf[BN_NIST_384_TOP],
- c_d[BN_NIST_384_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_384_TOP];
+ unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_384_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -709,10 +875,100 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[12-12];
+ acc += bp[21-12];
+ acc += bp[20-12];
+ acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[13-12];
+ acc += bp[22-12];
+ acc += bp[23-12];
+ acc -= bp[12-12];
+ acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[14-12];
+ acc += bp[23-12];
+ acc -= bp[13-12];
+ acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[15-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[21-12];
+ acc -= bp[14-12];
+ acc -= bp[22-12];
+ acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[21-12];
+ acc += bp[21-12];
+ acc += bp[16-12];
+ acc += bp[13-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[22-12];
+ acc -= bp[15-12];
+ acc -= bp[23-12];
+ acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[22-12];
+ acc += bp[22-12];
+ acc += bp[17-12];
+ acc += bp[14-12];
+ acc += bp[13-12];
+ acc += bp[21-12];
+ acc += bp[23-12];
+ acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[23-12];
+ acc += bp[23-12];
+ acc += bp[18-12];
+ acc += bp[15-12];
+ acc += bp[14-12];
+ acc += bp[22-12];
+ acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[19-12];
+ acc += bp[16-12];
+ acc += bp[15-12];
+ acc += bp[23-12];
+ acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[8]; acc += bp[20-12];
+ acc += bp[17-12];
+ acc += bp[16-12];
+ acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[9]; acc += bp[21-12];
+ acc += bp[18-12];
+ acc += bp[17-12];
+ acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[10]; acc += bp[22-12];
+ acc += bp[19-12];
+ acc += bp[18-12];
+ acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[11]; acc += bp[23-12];
+ acc += bp[20-12];
+ acc += bp[19-12];
+ acc -= bp[22-12]; rp[11] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_384_TOP];
/*S1*/
- nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
+ nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
/* left shift */
{
register BN_ULONG *ap,t,c;
@@ -729,29 +985,31 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
t_d, BN_NIST_256_TOP);
/*S2 */
- carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
+ carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
/*S3*/
- nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+ nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S4*/
- nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+ nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S5*/
- nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+ nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S6*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D1*/
- nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+ nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D2*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D3*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
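The NIST_INT64 branch above builds each result word by adding and subtracting a fixed selection of 32-bit input words inside a signed 64-bit accumulator; the low 32 bits are written back and the (possibly negative) remainder is carried into the next word, with the final carry fixed up afterwards just as in BN_nist_mod_224. A minimal standalone sketch of that carry-save pattern follows; add_sub_words and all other names are illustrative, not OpenSSL API.

#include <stdint.h>
#include <stdio.h>

static void add_sub_words(uint32_t *r, const uint32_t *add,
                          const uint32_t *sub, int n, int *carry)
{
    int64_t acc = 0;             /* plays the role of "NIST_INT64 acc" */
    int i;
    for (i = 0; i < n; i++) {
        acc += r[i];
        acc += add[i];           /* words the reduction folds in */
        acc -= sub[i];           /* words the reduction subtracts */
        r[i] = (uint32_t)acc;    /* keep the low 32 bits */
        acc >>= 32;              /* arithmetic shift carries a borrow too */
    }
    *carry = (int)acc;           /* final carry, fixed up by the caller */
}

int main(void)
{
    uint32_t r[2]   = { 0xffffffffu, 0 };
    uint32_t add[2] = { 1, 0 };
    uint32_t sub[2] = { 0, 0 };
    int carry;
    add_sub_words(r, add, sub, 2, &carry);
    printf("%08x %08x carry=%d\n", r[1], r[0], carry); /* 00000001 00000000 carry=0 */
    return 0;
}

The signed accumulator is what lets the subtractions ride along with the additions in a single pass.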
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index bebb466..1743b6a 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -357,3 +357,22 @@ int BN_print(BIO *bp, const BIGNUM *a)
return(ret);
}
#endif
+
+char *BN_options(void)
+ {
+ static int init=0;
+ static char data[16];
+
+ if (!init)
+ {
+ init++;
+#ifdef BN_LLONG
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
+#else
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
+#endif
+ }
+ return(data);
+ }
diff --git a/crypto/bn/bn_shift.c b/crypto/bn/bn_shift.c
index c4d301a..a6fca2c 100644
--- a/crypto/bn/bn_shift.c
+++ b/crypto/bn/bn_shift.c
@@ -99,7 +99,7 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a)
int BN_rshift1(BIGNUM *r, const BIGNUM *a)
{
BN_ULONG *ap,*rp,t,c;
- int i;
+ int i,j;
bn_check_top(r);
bn_check_top(a);
@@ -109,22 +109,25 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a)
BN_zero(r);
return(1);
}
+ i = a->top;
+ ap= a->d;
+ j = i-(ap[i-1]==1);
if (a != r)
{
- if (bn_wexpand(r,a->top) == NULL) return(0);
- r->top=a->top;
+ if (bn_wexpand(r,j) == NULL) return(0);
r->neg=a->neg;
}
- ap=a->d;
rp=r->d;
- c=0;
- for (i=a->top-1; i>=0; i--)
+ t=ap[--i];
+ c=(t&1)?BN_TBIT:0;
+ if (t>>=1) rp[i]=t;
+ while (i>0)
{
- t=ap[i];
+ t=ap[--i];
rp[i]=((t>>1)&BN_MASK2)|c;
c=(t&1)?BN_TBIT:0;
}
- bn_correct_top(r);
+ r->top=j;
bn_check_top(r);
return(1);
}
@@ -182,10 +185,11 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
BN_zero(r);
return(1);
}
+ i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2;
if (r != a)
{
r->neg=a->neg;
- if (bn_wexpand(r,a->top-nw+1) == NULL) return(0);
+ if (bn_wexpand(r,i) == NULL) return(0);
}
else
{
@@ -196,7 +200,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
f= &(a->d[nw]);
t=r->d;
j=a->top-nw;
- r->top=j;
+ r->top=i;
if (rb == 0)
{
@@ -212,9 +216,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
l= *(f++);
*(t++) =(tmp|(l<<lb))&BN_MASK2;
- tmp=(l>>rb)&BN_MASK2;
+ if ((l = (l>>rb)&BN_MASK2)) *(t) = l;
}
- bn_correct_top(r);
bn_check_top(r);
return(1);
}
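Both rewritten shift routines above compute the result's word count up front and pass bits between adjacent words through a carry variable, rather than calling bn_correct_top afterwards. A self-contained sketch of the BN_rshift1-style dataflow on plain 32-bit words (names are illustrative, not OpenSSL API):

#include <stdint.h>
#include <stdio.h>

static void rshift1(uint32_t *a, int n)
{
    uint32_t c = 0, t;
    int i;
    for (i = n - 1; i >= 0; i--) {
        t = a[i];
        a[i] = (t >> 1) | c;            /* bring in the word above's low bit */
        c = (t & 1) ? 0x80000000u : 0;  /* BN_TBIT's analogue */
    }
}

int main(void)
{
    uint32_t a[2] = { 0x00000001u, 0x00000001u };  /* 0x100000001 */
    rshift1(a, 2);
    printf("%08x %08x\n", a[1], a[0]);             /* 00000000 80000000 */
    return 0;
}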
diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c
index ee7b87c..de83a15 100644
--- a/crypto/bn/bn_word.c
+++ b/crypto/bn/bn_word.c
@@ -144,26 +144,17 @@ int BN_add_word(BIGNUM *a, BN_ULONG w)
a->neg=!(a->neg);
return(i);
}
- /* Only expand (and risk failing) if it's possibly necessary */
- if (((BN_ULONG)(a->d[a->top - 1] + 1) == 0) &&
- (bn_wexpand(a,a->top+1) == NULL))
- return(0);
- i=0;
- for (;;)
+ for (i=0;w!=0 && i<a->top;i++)
{
- if (i >= a->top)
- l=w;
- else
- l=(a->d[i]+w)&BN_MASK2;
- a->d[i]=l;
- if (w > l)
- w=1;
- else
- break;
- i++;
+ a->d[i] = l = (a->d[i]+w)&BN_MASK2;
+ w = (w>l)?1:0;
}
- if (i >= a->top)
+ if (w && i==a->top)
+ {
+ if (bn_wexpand(a,a->top+1) == NULL) return 0;
a->top++;
+ a->d[i]=w;
+ }
bn_check_top(a);
return(1);
}
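The new BN_add_word loop only ripples the carry while it is non-zero and only grows the array when the carry spills past the current top, instead of pre-expanding. A sketch of the same logic on a plain word array (add_word here is a hypothetical helper, not the OpenSSL function):

#include <stdint.h>
#include <stdio.h>

static int add_word(uint32_t *d, int top, uint32_t w)
{
    uint32_t l;
    int i;
    for (i = 0; w != 0 && i < top; i++) {
        d[i] = l = d[i] + w;
        w = (w > l) ? 1 : 0;     /* carry iff the addition wrapped */
    }
    if (w && i == top)
        d[top++] = w;            /* where BN_add_word calls bn_wexpand */
    return top;
}

int main(void)
{
    uint32_t d[3] = { 0xffffffffu, 0xffffffffu, 0 };
    int top = add_word(d, 2, 1);
    printf("top=%d %08x %08x %08x\n", top, d[2], d[1], d[0]);
    return 0;                    /* top=3 00000001 00000000 00000000 */
}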
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 0cd99c5..06f5954 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -262,7 +262,7 @@ int main(int argc, char *argv[])
message(out,"BN_mod_sqrt");
if (!test_sqrt(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#ifndef OPENSSL_NO_EC2M
message(out,"BN_GF2m_add");
if (!test_gf2m_add(out)) goto err;
(void)BIO_flush(out);
@@ -298,7 +298,7 @@ int main(int argc, char *argv[])
message(out,"BN_GF2m_mod_solve_quad");
if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#endif
BN_CTX_free(ctx);
BIO_free(out);
@@ -1061,7 +1061,7 @@ int test_exp(BIO *bp, BN_CTX *ctx)
BN_free(one);
return(1);
}
-
+#ifndef OPENSSL_NO_EC2M
int test_gf2m_add(BIO *bp)
{
BIGNUM a,b,c;
@@ -1636,7 +1636,7 @@ int test_gf2m_mod_solve_quad(BIO *bp,BN_CTX *ctx)
BN_free(e);
return ret;
}
-
+#endif
static int genprime_cb(int p, int n, BN_GENCB *arg)
{
char c='*';
diff --git a/crypto/buffer/buf_str.c b/crypto/buffer/buf_str.c
new file mode 100644
index 0000000..151f5ea
--- /dev/null
+++ b/crypto/buffer/buf_str.c
@@ -0,0 +1,119 @@
+/* crypto/buffer/buffer.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/buffer.h>
+
+char *BUF_strdup(const char *str)
+ {
+ if (str == NULL) return(NULL);
+ return BUF_strndup(str, strlen(str));
+ }
+
+char *BUF_strndup(const char *str, size_t siz)
+ {
+ char *ret;
+
+ if (str == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz+1);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ BUF_strlcpy(ret,str,siz+1);
+ return(ret);
+ }
+
+void *BUF_memdup(const void *data, size_t siz)
+ {
+ void *ret;
+
+ if (data == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ return memcpy(ret, data, siz);
+ }
+
+size_t BUF_strlcpy(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 1 && *src; size--)
+ {
+ *dst++ = *src++;
+ l++;
+ }
+ if (size)
+ *dst = '\0';
+ return l + strlen(src);
+ }
+
+size_t BUF_strlcat(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 0 && *dst; size--, dst++)
+ l++;
+ return l + BUF_strlcpy(dst, src, size);
+ }
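These follow the OpenBSD strlcpy/strlcat contract: the destination is always NUL-terminated when size is non-zero, and the return value is the length the complete string would have had, so a return value greater than or equal to the buffer size signals truncation. A small usage sketch, assuming <openssl/buffer.h> declares BUF_strlcpy and BUF_strlcat as in this release:

#include <stdio.h>
#include <openssl/buffer.h>

int main(void)
{
    char buf[8];
    size_t n = BUF_strlcpy(buf, "hello, world", sizeof(buf));
    if (n >= sizeof(buf))
        printf("truncated: kept \"%s\", needed %lu bytes\n",
               buf, (unsigned long)(n + 1));
    BUF_strlcat(buf, "!", sizeof(buf));  /* still bounded, still terminated */
    printf("buf = \"%s\"\n", buf);
    return 0;
}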
diff --git a/crypto/buffer/buffer.c b/crypto/buffer/buffer.c
index 620ea8d..d7aa79a 100644
--- a/crypto/buffer/buffer.c
+++ b/crypto/buffer/buffer.c
@@ -60,6 +60,11 @@
#include "cryptlib.h"
#include <openssl/buffer.h>
+/* LIMIT_BEFORE_EXPANSION is the maximum n such that (n+3)/3*4 < 2**31. That
+ * function is applied in several functions in this file and this limit ensures
+ * that the result fits in an int. */
+#define LIMIT_BEFORE_EXPANSION 0x5ffffffc
+
BUF_MEM *BUF_MEM_new(void)
{
BUF_MEM *ret;
@@ -105,6 +110,12 @@ int BUF_MEM_grow(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -142,6 +153,12 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW_CLEAN,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -162,64 +179,6 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
return(len);
}
-char *BUF_strdup(const char *str)
- {
- if (str == NULL) return(NULL);
- return BUF_strndup(str, strlen(str));
- }
-
-char *BUF_strndup(const char *str, size_t siz)
- {
- char *ret;
-
- if (str == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz+1);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- BUF_strlcpy(ret,str,siz+1);
- return(ret);
- }
-
-void *BUF_memdup(const void *data, size_t siz)
- {
- void *ret;
-
- if (data == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- return memcpy(ret, data, siz);
- }
-
-size_t BUF_strlcpy(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 1 && *src; size--)
- {
- *dst++ = *src++;
- l++;
- }
- if (size)
- *dst = '\0';
- return l + strlen(src);
- }
-
-size_t BUF_strlcat(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 0 && *dst; size--, dst++)
- l++;
- return l + BUF_strlcpy(dst, src, size);
- }
-
void BUF_reverse(unsigned char *out, unsigned char *in, size_t size)
{
size_t i;
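The constant checks out: 0x5ffffffc is the largest len for which (len+3)/3*4 stays below 2**31, and the very next value already reaches 2**31, which would overflow the int that OPENSSL_malloc takes. A standalone two-line verification (not part of the patch):

#include <stdio.h>

int main(void)
{
    unsigned long long n = 0x5ffffffcULL;
    printf("%llx -> %llx\n", n, (n + 3) / 3 * 4);          /* 7ffffffc, fits */
    printf("%llx -> %llx\n", n + 1, (n + 1 + 3) / 3 * 4);  /* 80000000, overflows int */
    return 0;
}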
diff --git a/crypto/cmac/cm_ameth.c b/crypto/cmac/cm_ameth.c
new file mode 100644
index 0000000..0b8e567
--- /dev/null
+++ b/crypto/cmac/cm_ameth.c
@@ -0,0 +1,97 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "asn1_locl.h"
+
+/* CMAC "ASN1" method. This is just here to indicate the
+ * maximum CMAC output length and to free up a CMAC
+ * key.
+ */
+
+static int cmac_size(const EVP_PKEY *pkey)
+ {
+ return EVP_MAX_BLOCK_LENGTH;
+ }
+
+static void cmac_key_free(EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr;
+ if (cmctx)
+ CMAC_CTX_free(cmctx);
+ }
+
+const EVP_PKEY_ASN1_METHOD cmac_asn1_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_CMAC,
+ 0,
+
+ "CMAC",
+ "OpenSSL CMAC method",
+
+ 0,0,0,0,
+
+ 0,0,0,
+
+ cmac_size,
+ 0,
+ 0,0,0,0,0,0,0,
+
+ cmac_key_free,
+ 0,
+ 0,0
+ };
+
diff --git a/crypto/cmac/cm_pmeth.c b/crypto/cmac/cm_pmeth.c
new file mode 100644
index 0000000..072228e
--- /dev/null
+++ b/crypto/cmac/cm_pmeth.c
@@ -0,0 +1,224 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "evp_locl.h"
+
+/* The context structure and "key" is simply a CMAC_CTX */
+
+static int pkey_cmac_init(EVP_PKEY_CTX *ctx)
+ {
+ ctx->data = CMAC_CTX_new();
+ if (!ctx->data)
+ return 0;
+ ctx->keygen_info_count = 0;
+ return 1;
+ }
+
+static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
+ {
+ if (!pkey_cmac_init(dst))
+ return 0;
+ if (!CMAC_CTX_copy(dst->data, src->data))
+ return 0;
+ return 1;
+ }
+
+static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
+ {
+ CMAC_CTX_free(ctx->data);
+ }
+
+static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmkey = CMAC_CTX_new();
+ CMAC_CTX *cmctx = ctx->data;
+ if (!cmkey)
+ return 0;
+ if (!CMAC_CTX_copy(cmkey, cmctx))
+ {
+ CMAC_CTX_free(cmkey);
+ return 0;
+ }
+ EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey);
+
+ return 1;
+ }
+
+static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
+ {
+ if (!CMAC_Update(ctx->pctx->data, data, count))
+ return 0;
+ return 1;
+ }
+
+static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx)
+ {
+ EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT);
+ mctx->update = int_update;
+ return 1;
+ }
+
+static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+ EVP_MD_CTX *mctx)
+ {
+ return CMAC_Final(ctx->data, sig, siglen);
+ }
+
+static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
+ {
+ CMAC_CTX *cmctx = ctx->data;
+ switch (type)
+ {
+
+ case EVP_PKEY_CTRL_SET_MAC_KEY:
+ if (!p2 || p1 < 0)
+ return 0;
+ if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_CIPHER:
+ if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_MD:
+ if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
+ (CMAC_CTX *)ctx->pkey->pkey.ptr))
+ return 0;
+ if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
+ return 0;
+ break;
+
+ default:
+ return -2;
+
+ }
+ return 1;
+ }
+
+static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx,
+ const char *type, const char *value)
+ {
+ if (!value)
+ {
+ return 0;
+ }
+ if (!strcmp(type, "key"))
+ {
+ void *p = (void *)value;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY,
+ strlen(p), p);
+ }
+ if (!strcmp(type, "cipher"))
+ {
+ const EVP_CIPHER *c;
+ c = EVP_get_cipherbyname(value);
+ if (!c)
+ return 0;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c);
+ }
+ if (!strcmp(type, "hexkey"))
+ {
+ unsigned char *key;
+ int r;
+ long keylen;
+ key = string_to_hex(value, &keylen);
+ if (!key)
+ return 0;
+ r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key);
+ OPENSSL_free(key);
+ return r;
+ }
+ return -2;
+ }
+
+const EVP_PKEY_METHOD cmac_pkey_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_FLAG_SIGCTX_CUSTOM,
+ pkey_cmac_init,
+ pkey_cmac_copy,
+ pkey_cmac_cleanup,
+
+ 0, 0,
+
+ 0,
+ pkey_cmac_keygen,
+
+ 0, 0,
+
+ 0, 0,
+
+ 0,0,
+
+ cmac_signctx_init,
+ cmac_signctx,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ pkey_cmac_ctrl,
+ pkey_cmac_ctrl_str
+
+ };
diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c
new file mode 100644
index 0000000..8b72b09
--- /dev/null
+++ b/crypto/cmac/cmac.c
@@ -0,0 +1,308 @@
+/* crypto/cmac/cmac.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cryptlib.h"
+#include <openssl/cmac.h>
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+struct CMAC_CTX_st
+ {
+ /* Cipher context to use */
+ EVP_CIPHER_CTX cctx;
+ /* Keys k1 and k2 */
+ unsigned char k1[EVP_MAX_BLOCK_LENGTH];
+ unsigned char k2[EVP_MAX_BLOCK_LENGTH];
+ /* Temporary block */
+ unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
+ /* Last (possibly partial) block */
+ unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
+ /* Number of bytes in last block: -1 means context not initialised */
+ int nlast_block;
+ };
+
+
+/* Make temporary keys K1 and K2 */
+
+static void make_kn(unsigned char *k1, unsigned char *l, int bl)
+ {
+ int i;
+ /* Shift block to left, including carry */
+ for (i = 0; i < bl; i++)
+ {
+ k1[i] = l[i] << 1;
+ if (i < bl - 1 && l[i + 1] & 0x80)
+ k1[i] |= 1;
+ }
+ /* If MSB set fixup with R */
+ if (l[0] & 0x80)
+ k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b;
+ }
+
+CMAC_CTX *CMAC_CTX_new(void)
+ {
+ CMAC_CTX *ctx;
+ ctx = OPENSSL_malloc(sizeof(CMAC_CTX));
+ if (!ctx)
+ return NULL;
+ EVP_CIPHER_CTX_init(&ctx->cctx);
+ ctx->nlast_block = -1;
+ return ctx;
+ }
+
+void CMAC_CTX_cleanup(CMAC_CTX *ctx)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ {
+ FIPS_cmac_ctx_cleanup(ctx);
+ return;
+ }
+#endif
+ EVP_CIPHER_CTX_cleanup(&ctx->cctx);
+ OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH);
+ ctx->nlast_block = -1;
+ }
+
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
+ {
+ return &ctx->cctx;
+ }
+
+void CMAC_CTX_free(CMAC_CTX *ctx)
+ {
+ CMAC_CTX_cleanup(ctx);
+ OPENSSL_free(ctx);
+ }
+
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
+ {
+ int bl;
+ if (in->nlast_block == -1)
+ return 0;
+ if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&in->cctx);
+ memcpy(out->k1, in->k1, bl);
+ memcpy(out->k2, in->k2, bl);
+ memcpy(out->tbl, in->tbl, bl);
+ memcpy(out->last_block, in->last_block, bl);
+ out->nlast_block = in->nlast_block;
+ return 1;
+ }
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl)
+ {
+ static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ /* If we have an ENGINE need to allow non FIPS */
+ if ((impl || ctx->cctx.engine)
+ && !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+
+ {
+ EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
+ return 0;
+ }
+ /* Other algorithm blocking will be done in FIPS_cmac_init,
+ * via FIPS_cipherinit().
+ */
+ if (!impl && !ctx->cctx.engine)
+ return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
+ }
+#endif
+ /* All zeros means restart */
+ if (!key && !cipher && !impl && keylen == 0)
+ {
+ /* Not initialised */
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx));
+ ctx->nlast_block = 0;
+ return 1;
+ }
+ /* Initialise context */
+ if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
+ return 0;
+ /* Non-NULL key means initialisation complete */
+ if (key)
+ {
+ int bl;
+ if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
+ return 0;
+ if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
+ return 0;
+ make_kn(ctx->k1, ctx->tbl, bl);
+ make_kn(ctx->k2, ctx->k1, bl);
+ OPENSSL_cleanse(ctx->tbl, bl);
+ /* Reset context again ready for first data block */
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ /* Zero tbl so resume works */
+ memset(ctx->tbl, 0, bl);
+ ctx->nlast_block = 0;
+ }
+ return 1;
+ }
+
+int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
+ {
+ const unsigned char *data = in;
+ size_t bl;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_update(ctx, in, dlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (dlen == 0)
+ return 1;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ /* Copy into partial block if we need to */
+ if (ctx->nlast_block > 0)
+ {
+ size_t nleft;
+ nleft = bl - ctx->nlast_block;
+ if (dlen < nleft)
+ nleft = dlen;
+ memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
+ dlen -= nleft;
+ ctx->nlast_block += nleft;
+ /* If no more to process return */
+ if (dlen == 0)
+ return 1;
+ data += nleft;
+ /* Else not final block so encrypt it */
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
+ return 0;
+ }
+ /* Encrypt all but one of the complete blocks left */
+ while(dlen > bl)
+ {
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
+ return 0;
+ dlen -= bl;
+ data += bl;
+ }
+ /* Copy any data left to last block buffer */
+ memcpy(ctx->last_block, data, dlen);
+ ctx->nlast_block = dlen;
+ return 1;
+
+ }
+
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
+ {
+ int i, bl, lb;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_final(ctx, out, poutlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ *poutlen = (size_t)bl;
+ if (!out)
+ return 1;
+ lb = ctx->nlast_block;
+ /* Is last block complete? */
+ if (lb == bl)
+ {
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k1[i];
+ }
+ else
+ {
+ ctx->last_block[lb] = 0x80;
+ if (bl - lb > 1)
+ memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k2[i];
+ }
+ if (!EVP_Cipher(&ctx->cctx, out, out, bl))
+ {
+ OPENSSL_cleanse(out, bl);
+ return 0;
+ }
+ return 1;
+ }
+
+int CMAC_resume(CMAC_CTX *ctx)
+ {
+ if (ctx->nlast_block == -1)
+ return 0;
+ /* The buffer "tbl" containes the last fully encrypted block
+ * which is the last IV (or all zeroes if no last encrypted block).
+ * The last block has not been modified since CMAC_final().
+ * So reinitliasing using the last decrypted block will allow
+ * CMAC to continue after calling CMAC_Final().
+ */
+ return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
+ }
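make_kn above is the GF(2^128) doubling step from NIST SP 800-38B: shift the block left one bit and, if the top bit fell off, XOR the low byte with the reduction constant 0x87 (0x1b for 64-bit blocks). A standalone sketch with a toy input; dbl is an illustrative name, not OpenSSL API:

#include <stdio.h>

static void dbl(unsigned char *out, const unsigned char *in, int bl)
{
    int i;
    for (i = 0; i < bl; i++) {
        out[i] = in[i] << 1;
        if (i < bl - 1 && (in[i + 1] & 0x80))
            out[i] |= 1;                         /* carry from the byte below */
    }
    if (in[0] & 0x80)
        out[bl - 1] ^= (bl == 16) ? 0x87 : 0x1b; /* reduce by the field polynomial */
}

int main(void)
{
    unsigned char l[16] = { 0x80 };  /* top bit set forces the 0x87 fixup */
    unsigned char k1[16];
    int i;
    dbl(k1, l, 16);
    for (i = 0; i < 16; i++)
        printf("%02x", k1[i]);       /* prints thirty zeros, then 87 */
    printf("\n");
    return 0;
}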
diff --git a/crypto/cmac/cmac.h b/crypto/cmac/cmac.h
new file mode 100644
index 0000000..712e92d
--- /dev/null
+++ b/crypto/cmac/cmac.h
@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMAC_H
+#define HEADER_CMAC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/evp.h>
+
+/* Opaque */
+typedef struct CMAC_CTX_st CMAC_CTX;
+
+CMAC_CTX *CMAC_CTX_new(void);
+void CMAC_CTX_cleanup(CMAC_CTX *ctx);
+void CMAC_CTX_free(CMAC_CTX *ctx);
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl);
+int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
+int CMAC_resume(CMAC_CTX *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
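For reference, the intended calling sequence of this API, computing an AES-128-CMAC tag; the key and message below are arbitrary placeholders, and the cipher handed to CMAC_Init is a block cipher in CBC mode per the CMAC convention:

#include <stdio.h>
#include <openssl/cmac.h>
#include <openssl/evp.h>

int main(void)
{
    static const unsigned char key[16] = "0123456789abcdef"; /* placeholder key */
    static const unsigned char msg[]   = "message to authenticate";
    unsigned char mac[EVP_MAX_BLOCK_LENGTH];
    size_t maclen, i;
    CMAC_CTX *ctx = CMAC_CTX_new();

    if (ctx == NULL)
        return 1;
    if (!CMAC_Init(ctx, key, sizeof(key), EVP_aes_128_cbc(), NULL)
        || !CMAC_Update(ctx, msg, sizeof(msg) - 1)
        || !CMAC_Final(ctx, mac, &maclen)) {
        CMAC_CTX_free(ctx);
        return 1;
    }
    for (i = 0; i < maclen; i++)
        printf("%02x", mac[i]);
    printf("\n");
    CMAC_CTX_free(ctx);
    return 0;
}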
diff --git a/crypto/comp/c_rle.c b/crypto/comp/c_rle.c
index 18bceae..47dfb67 100644
--- a/crypto/comp/c_rle.c
+++ b/crypto/comp/c_rle.c
@@ -30,7 +30,7 @@ static int rle_compress_block(COMP_CTX *ctx, unsigned char *out,
{
/* int i; */
- if (olen < (ilen+1))
+ if (ilen == 0 || olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
@@ -46,7 +46,7 @@ static int rle_expand_block(COMP_CTX *ctx, unsigned char *out,
{
int i;
- if (ilen == 0 || olen < (ilen-1))
+ if (olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
diff --git a/crypto/conf/conf_mall.c b/crypto/conf/conf_mall.c
index c6f4cb2..213890e 100644
--- a/crypto/conf/conf_mall.c
+++ b/crypto/conf/conf_mall.c
@@ -76,5 +76,6 @@ void OPENSSL_load_builtin_modules(void)
#ifndef OPENSSL_NO_ENGINE
ENGINE_add_conf_module();
#endif
+ EVP_add_alg_module();
}
diff --git a/crypto/cpt_err.c b/crypto/cpt_err.c
index 139b928..289005f 100644
--- a/crypto/cpt_err.c
+++ b/crypto/cpt_err.c
@@ -1,6 +1,6 @@
/* crypto/cpt_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
{ERR_FUNC(CRYPTO_F_CRYPTO_SET_EX_DATA), "CRYPTO_set_ex_data"},
{ERR_FUNC(CRYPTO_F_DEF_ADD_INDEX), "DEF_ADD_INDEX"},
{ERR_FUNC(CRYPTO_F_DEF_GET_CLASS), "DEF_GET_CLASS"},
+{ERR_FUNC(CRYPTO_F_FIPS_MODE_SET), "FIPS_mode_set"},
{ERR_FUNC(CRYPTO_F_INT_DUP_EX_DATA), "INT_DUP_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_FREE_EX_DATA), "INT_FREE_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_NEW_EX_DATA), "INT_NEW_EX_DATA"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
static ERR_STRING_DATA CRYPTO_str_reasons[]=
{
+{ERR_REASON(CRYPTO_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
{ERR_REASON(CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK),"no dynlock create callback"},
{0,NULL}
};
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index 24fe123..304c6b7 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -409,6 +409,10 @@ int (*CRYPTO_get_add_lock_callback(void))(int *num,int mount,int type,
void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
const char *file,int line))
{
+ /* Calling this here ensures initialisation before any threads
+ * are started.
+ */
+ OPENSSL_init();
locking_callback=func;
}
@@ -500,7 +504,7 @@ void CRYPTO_THREADID_current(CRYPTO_THREADID *id)
CRYPTO_THREADID_set_numeric(id, (unsigned long)find_thread(NULL));
#else
/* For everything else, default to using the address of 'errno' */
- CRYPTO_THREADID_set_pointer(id, &errno);
+ CRYPTO_THREADID_set_pointer(id, (void*)&errno);
#endif
}
@@ -661,28 +665,53 @@ const char *CRYPTO_get_lock_name(int type)
defined(__INTEL__) || \
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
-unsigned long OPENSSL_ia32cap_P=0;
-unsigned long *OPENSSL_ia32cap_loc(void) { return &OPENSSL_ia32cap_P; }
+unsigned int OPENSSL_ia32cap_P[2];
+unsigned long *OPENSSL_ia32cap_loc(void)
+{ if (sizeof(long)==4)
+ /*
+ * If 32-bit application pulls address of OPENSSL_ia32cap_P[0]
+ * clear second element to maintain the illusion that vector
+ * is 32-bit.
+ */
+ OPENSSL_ia32cap_P[1]=0;
+ return (unsigned long *)OPENSSL_ia32cap_P;
+}
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY)
#define OPENSSL_CPUID_SETUP
+#if defined(_WIN32)
+typedef unsigned __int64 IA32CAP;
+#else
+typedef unsigned long long IA32CAP;
+#endif
void OPENSSL_cpuid_setup(void)
{ static int trigger=0;
- unsigned long OPENSSL_ia32_cpuid(void);
+ IA32CAP OPENSSL_ia32_cpuid(void);
+ IA32CAP vec;
char *env;
if (trigger) return;
trigger=1;
- if ((env=getenv("OPENSSL_ia32cap")))
- OPENSSL_ia32cap_P = strtoul(env,NULL,0)|(1<<10);
+ if ((env=getenv("OPENSSL_ia32cap"))) {
+ int off = (env[0]=='~')?1:0;
+#if defined(_WIN32)
+ if (!sscanf(env+off,"%I64i",&vec)) vec = strtoul(env+off,NULL,0);
+#else
+ if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0);
+#endif
+ if (off) vec = OPENSSL_ia32_cpuid()&~vec;
+ }
else
- OPENSSL_ia32cap_P = OPENSSL_ia32_cpuid()|(1<<10);
+ vec = OPENSSL_ia32_cpuid();
+
/*
* |(1<<10) sets a reserved bit to signal that variable
* was initialized already... This is to avoid interference
* with cpuid snippets in ELF .init segment.
*/
+ OPENSSL_ia32cap_P[0] = (unsigned int)vec|(1<<10);
+ OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32);
}
#endif
@@ -896,3 +925,16 @@ void OpenSSLDie(const char *file,int line,const char *assertion)
}
void *OPENSSL_stderr(void) { return stderr; }
+
+int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+ {
+ size_t i;
+ const unsigned char *a = in_a;
+ const unsigned char *b = in_b;
+ unsigned char x = 0;
+
+ for (i = 0; i < len; i++)
+ x |= a[i] ^ b[i];
+
+ return x;
+ }
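The loop above ORs together the XOR of every byte pair, so its running time depends only on len, never on where the first mismatch is. A sketch of the intended use when comparing secrets such as MAC tags; tag_matches is a hypothetical helper:

#include <stddef.h>
#include <openssl/crypto.h>

/* memcmp may return at the first differing byte, leaking (via timing)
 * how long the matching prefix was; CRYPTO_memcmp does not. */
int tag_matches(const unsigned char *expected,
                const unsigned char *received, size_t len)
{
    return CRYPTO_memcmp(expected, received, len) == 0;
}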
diff --git a/crypto/cryptlib.h b/crypto/cryptlib.h
index fc249c5..d26f963 100644
--- a/crypto/cryptlib.h
+++ b/crypto/cryptlib.h
@@ -99,8 +99,8 @@ extern "C" {
#define HEX_SIZE(type) (sizeof(type)*2)
void OPENSSL_cpuid_setup(void);
-extern unsigned long OPENSSL_ia32cap_P;
-void OPENSSL_showfatal(const char *,...);
+extern unsigned int OPENSSL_ia32cap_P[];
+void OPENSSL_showfatal(const char *fmta,...);
void *OPENSSL_stderr(void);
extern int OPENSSL_NONPIC_relocated;
diff --git a/crypto/crypto.h b/crypto/crypto.h
index b0360ce..f92fc51 100644
--- a/crypto/crypto.h
+++ b/crypto/crypto.h
@@ -488,10 +488,10 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
long (**go)(void));
void *CRYPTO_malloc_locked(int num, const char *file, int line);
-void CRYPTO_free_locked(void *);
+void CRYPTO_free_locked(void *ptr);
void *CRYPTO_malloc(int num, const char *file, int line);
char *CRYPTO_strdup(const char *str, const char *file, int line);
-void CRYPTO_free(void *);
+void CRYPTO_free(void *ptr);
void *CRYPTO_realloc(void *addr,int num, const char *file, int line);
void *CRYPTO_realloc_clean(void *addr,int old_num,int num,const char *file,
int line);
@@ -547,6 +547,40 @@ unsigned long *OPENSSL_ia32cap_loc(void);
#define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
int OPENSSL_isservice(void);
+int FIPS_mode(void);
+int FIPS_mode_set(int r);
+
+void OPENSSL_init(void);
+
+#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
+
+#ifdef OPENSSL_FIPS
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c) \
+ { \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to digest " #alg " forbidden in FIPS mode!"); \
+ return private_##alg##_Init(c); \
+ } \
+ int private_##alg##_Init(cx##_CTX *c)
+
+#define fips_cipher_abort(alg) \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to cipher " #alg " forbidden in FIPS mode!")
+
+#else
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c)
+#define fips_cipher_abort(alg) while(0)
+#endif
+
+/* CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It
+ * takes an amount of time dependent on |len|, but independent of the contents
+ * of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a
+ * defined order as the return value when a != b is undefined, other than to be
+ * non-zero. */
+int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -562,11 +596,13 @@ void ERR_load_CRYPTO_strings(void);
#define CRYPTO_F_CRYPTO_SET_EX_DATA 102
#define CRYPTO_F_DEF_ADD_INDEX 104
#define CRYPTO_F_DEF_GET_CLASS 105
+#define CRYPTO_F_FIPS_MODE_SET 109
#define CRYPTO_F_INT_DUP_EX_DATA 106
#define CRYPTO_F_INT_FREE_EX_DATA 107
#define CRYPTO_F_INT_NEW_EX_DATA 108
/* Reason codes. */
+#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED 101
#define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK 100
#ifdef __cplusplus
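To make the fips_md_init_ctx macro concrete: a digest source file writes its init function as, e.g., fips_md_init_ctx(SHA1, SHA) { ...body... } (SHA1's context type being SHA_CTX), and with OPENSSL_FIPS defined that expands to roughly the following; shown for illustration only, with private_SHA1_Init being the renamed original init:

#include <openssl/crypto.h>
#include <openssl/sha.h>

int private_SHA1_Init(SHA_CTX *c);   /* the renamed original init */

int SHA1_Init(SHA_CTX *c)
{
    if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__,
        "Low level API call to digest SHA1 forbidden in FIPS mode!");
    return private_SHA1_Init(c);
}

Without OPENSSL_FIPS the macro collapses to a plain definition of SHA1_Init, so callers are unaffected either way.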
diff --git a/crypto/des/asm/crypt586.S b/crypto/des/asm/crypt586.S
new file mode 100644
index 0000000..fb321ba
--- /dev/null
+++ b/crypto/des/asm/crypt586.S
@@ -0,0 +1,879 @@
+.file "crypt586.s"
+.text
+.globl fcrypt_body
+.type fcrypt_body,@function
+.align 16
+fcrypt_body:
+.L_fcrypt_body_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ xorl %edi,%edi
+ xorl %esi,%esi
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl DES_SPtrans@GOT(%edx),%edx
+ pushl %edx
+ movl 28(%esp),%ebp
+ pushl $25
+.L001start:
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl (%ebp),%ebx
+ xorl %ebx,%eax
+ movl 4(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 8(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 12(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 16(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 20(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 24(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 28(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 32(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 36(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 40(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 44(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 48(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 52(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 56(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 60(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 64(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 68(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 72(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 76(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 80(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 84(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 88(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 92(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 96(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 100(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 104(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 108(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 112(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 116(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 120(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 124(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+ movl (%esp),%ebx
+ movl %edi,%eax
+ decl %ebx
+ movl %esi,%edi
+ movl %eax,%esi
+ movl %ebx,(%esp)
+ jnz .L001start
+
+
+ movl 28(%esp),%edx
+ rorl $1,%edi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0xaaaaaaaa,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $23,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $10,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x33333333,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $18,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ roll $12,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xf0f0f0f0,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %edi,4(%edx)
+ addl $8,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size fcrypt_body,.-.L_fcrypt_body_begin
diff --git a/crypto/des/asm/des-586.S b/crypto/des/asm/des-586.S
new file mode 100644
index 0000000..2fbd340
--- /dev/null
+++ b/crypto/des/asm/des-586.S
@@ -0,0 +1,1837 @@
+.file "des-586.s"
+.text
+.globl DES_SPtrans
+.type _x86_DES_encrypt,@function
+.align 16
+_x86_DES_encrypt:
+ pushl %ecx
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_encrypt,.-_x86_DES_encrypt
+.type _x86_DES_decrypt,@function
+.align 16
+_x86_DES_decrypt:
+ pushl %ecx
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_decrypt,.-_x86_DES_decrypt
+.globl DES_encrypt1
+.type DES_encrypt1,@function
+.align 16
+DES_encrypt1:
+.L_DES_encrypt1_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%esi
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%esi),%eax
+ movl 28(%esp),%ebx
+ movl 4(%esi),%edi
+
+
+ roll $4,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xf0f0f0f0,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $20,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $14,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x33333333,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $22,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x03fc03fc,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $9,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xaaaaaaaa,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $1,%edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L000pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L001decrypt
+ call _x86_DES_encrypt
+ jmp .L002done
+.L001decrypt:
+ call _x86_DES_decrypt
+.L002done:
+
+
+ movl 20(%esp),%edx
+ rorl $1,%esi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %esi,4(%edx)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt1,.-.L_DES_encrypt1_begin
+.globl DES_encrypt2
+.type DES_encrypt2,@function
+.align 16
+DES_encrypt2:
+.L_DES_encrypt2_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%eax
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%eax),%esi
+ movl 28(%esp),%ebx
+ roll $3,%esi
+ movl 4(%eax),%edi
+ roll $3,%edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L003pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L004decrypt
+ call _x86_DES_encrypt
+ jmp .L005done
+.L004decrypt:
+ call _x86_DES_decrypt
+.L005done:
+
+
+ rorl $3,%edi
+ movl 20(%esp),%eax
+ rorl $3,%esi
+ movl %edi,(%eax)
+ movl %esi,4(%eax)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt2,.-.L_DES_encrypt2_begin
+.globl DES_encrypt3
+.type DES_encrypt3,@function
+.align 16
+DES_encrypt3:
+.L_DES_encrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%eax
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%esi
+ movl $1,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_encrypt3,.-.L_DES_encrypt3_begin
+.globl DES_decrypt3
+.type DES_decrypt3,@function
+.align 16
+DES_decrypt3:
+.L_DES_decrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%esi
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%eax
+ movl $0,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_decrypt3,.-.L_DES_decrypt3_begin
+.globl DES_ncbc_encrypt
+.type DES_ncbc_encrypt,@function
+.align 16
+DES_ncbc_encrypt:
+.L_DES_ncbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ pushl %ecx
+
+ movl 52(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L006decrypt
+ andl $4294967288,%ebp
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ jz .L007encrypt_finish
+.L008encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L008encrypt_loop
+.L007encrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ call .L010PIC_point
+.L010PIC_point:
+ popl %edx
+ leal .L011cbc_enc_jmp_table-.L010PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L012ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L013ej6:
+ movb 5(%esi),%dh
+.L014ej5:
+ movb 4(%esi),%dl
+.L015ej4:
+ movl (%esi),%ecx
+ jmp .L016ejend
+.L017ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L018ej2:
+ movb 1(%esi),%ch
+.L019ej1:
+ movb (%esi),%cl
+.L016ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L009finish
+.L006decrypt:
+ andl $4294967288,%ebp
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ jz .L020decrypt_finish
+.L021decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,20(%esp)
+ movl %ebx,24(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L021decrypt_loop
+.L020decrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L022dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L023dj6:
+ movb %dh,5(%edi)
+.L024dj5:
+ movb %dl,4(%edi)
+.L025dj4:
+ movl %ecx,(%edi)
+ jmp .L026djend
+.L027dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L028dj2:
+	movb	%ch,1(%edi)
+.L029dj1:
+	movb	%cl,(%edi)
+.L026djend:
+ jmp .L009finish
+.L009finish:
+ movl 64(%esp),%ecx
+ addl $28,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L011cbc_enc_jmp_table:
+.long 0
+.long .L019ej1-.L010PIC_point
+.long .L018ej2-.L010PIC_point
+.long .L017ej3-.L010PIC_point
+.long .L015ej4-.L010PIC_point
+.long .L014ej5-.L010PIC_point
+.long .L013ej6-.L010PIC_point
+.long .L012ej7-.L010PIC_point
+.align 64
+.size DES_ncbc_encrypt,.-.L_DES_ncbc_encrypt_begin
+.globl DES_ede3_cbc_encrypt
+.type DES_ede3_cbc_encrypt,@function
+.align 16
+DES_ede3_cbc_encrypt:
+.L_DES_ede3_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 44(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 64(%esp),%ecx
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L030decrypt
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L031encrypt_finish
+.L032encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L032encrypt_loop
+.L031encrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ call .L034PIC_point
+.L034PIC_point:
+ popl %edx
+ leal .L035cbc_enc_jmp_table-.L034PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L036ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L037ej6:
+ movb 5(%esi),%dh
+.L038ej5:
+ movb 4(%esi),%dl
+.L039ej4:
+ movl (%esi),%ecx
+ jmp .L040ejend
+.L041ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L042ej2:
+ movb 1(%esi),%ch
+.L043ej1:
+ movb (%esi),%cl
+.L040ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L033finish
+.L030decrypt:
+ andl $4294967288,%ebp
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ jz .L044decrypt_finish
+.L045decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,24(%esp)
+ movl %ebx,28(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L045decrypt_loop
+.L044decrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L046dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L047dj6:
+ movb %dh,5(%edi)
+.L048dj5:
+ movb %dl,4(%edi)
+.L049dj4:
+ movl %ecx,(%edi)
+ jmp .L050djend
+.L051dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L052dj2:
+	movb	%ch,1(%edi)
+.L053dj1:
+	movb	%cl,(%edi)
+.L050djend:
+ jmp .L033finish
+.L033finish:
+ movl 76(%esp),%ecx
+ addl $32,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L035cbc_enc_jmp_table:
+.long 0
+.long .L043ej1-.L034PIC_point
+.long .L042ej2-.L034PIC_point
+.long .L041ej3-.L034PIC_point
+.long .L039ej4-.L034PIC_point
+.long .L038ej5-.L034PIC_point
+.long .L037ej6-.L034PIC_point
+.long .L036ej7-.L034PIC_point
+.align 64
+.size DES_ede3_cbc_encrypt,.-.L_DES_ede3_cbc_encrypt_begin
+.align 64
+DES_SPtrans:
+.long 34080768,524288,33554434,34080770
+.long 33554432,526338,524290,33554434
+.long 526338,34080768,34078720,2050
+.long 33556482,33554432,0,524290
+.long 524288,2,33556480,526336
+.long 34080770,34078720,2050,33556480
+.long 2,2048,526336,34078722
+.long 2048,33556482,34078722,0
+.long 0,34080770,33556480,524290
+.long 34080768,524288,2050,33556480
+.long 34078722,2048,526336,33554434
+.long 526338,2,33554434,34078720
+.long 34080770,526336,34078720,33556482
+.long 33554432,2050,524290,0
+.long 524288,33554432,33556482,34080768
+.long 2,34078722,2048,526338
+.long 1074823184,0,1081344,1074790400
+.long 1073741840,32784,1073774592,1081344
+.long 32768,1074790416,16,1073774592
+.long 1048592,1074823168,1074790400,16
+.long 1048576,1073774608,1074790416,32768
+.long 1081360,1073741824,0,1048592
+.long 1073774608,1081360,1074823168,1073741840
+.long 1073741824,1048576,32784,1074823184
+.long 1048592,1074823168,1073774592,1081360
+.long 1074823184,1048592,1073741840,0
+.long 1073741824,32784,1048576,1074790416
+.long 32768,1073741824,1081360,1073774608
+.long 1074823168,32768,0,1073741840
+.long 16,1074823184,1081344,1074790400
+.long 1074790416,1048576,32784,1073774592
+.long 1073774608,16,1074790400,1081344
+.long 67108865,67371264,256,67109121
+.long 262145,67108864,67109121,262400
+.long 67109120,262144,67371008,1
+.long 67371265,257,1,67371009
+.long 0,262145,67371264,256
+.long 257,67371265,262144,67108865
+.long 67371009,67109120,262401,67371008
+.long 262400,0,67108864,262401
+.long 67371264,256,1,262144
+.long 257,262145,67371008,67109121
+.long 0,67371264,262400,67371009
+.long 262145,67108864,67371265,1
+.long 262401,67108865,67108864,67371265
+.long 262144,67109120,67109121,262400
+.long 67109120,0,67371009,257
+.long 67108865,262401,256,67371008
+.long 4198408,268439552,8,272633864
+.long 0,272629760,268439560,4194312
+.long 272633856,268435464,268435456,4104
+.long 268435464,4198408,4194304,268435456
+.long 272629768,4198400,4096,8
+.long 4198400,268439560,272629760,4096
+.long 4104,0,4194312,272633856
+.long 268439552,272629768,272633864,4194304
+.long 272629768,4104,4194304,268435464
+.long 4198400,268439552,8,272629760
+.long 268439560,0,4096,4194312
+.long 0,272629768,272633856,4096
+.long 268435456,272633864,4198408,4194304
+.long 272633864,8,268439552,4198408
+.long 4194312,4198400,272629760,268439560
+.long 4104,268435456,268435464,272633856
+.long 134217728,65536,1024,134284320
+.long 134283296,134218752,66592,134283264
+.long 65536,32,134217760,66560
+.long 134218784,134283296,134284288,0
+.long 66560,134217728,65568,1056
+.long 134218752,66592,0,134217760
+.long 32,134218784,134284320,65568
+.long 134283264,1024,1056,134284288
+.long 134284288,134218784,65568,134283264
+.long 65536,32,134217760,134218752
+.long 134217728,66560,134284320,0
+.long 66592,134217728,1024,65568
+.long 134218784,1024,0,134284320
+.long 134283296,134284288,1056,65536
+.long 66560,134283296,134218752,1056
+.long 32,66592,134283264,134217760
+.long 2147483712,2097216,0,2149588992
+.long 2097216,8192,2147491904,2097152
+.long 8256,2149589056,2105344,2147483648
+.long 2147491840,2147483712,2149580800,2105408
+.long 2097152,2147491904,2149580864,0
+.long 8192,64,2149588992,2149580864
+.long 2149589056,2149580800,2147483648,8256
+.long 64,2105344,2105408,2147491840
+.long 8256,2147483648,2147491840,2105408
+.long 2149588992,2097216,0,2147491840
+.long 2147483648,8192,2149580864,2097152
+.long 2097216,2149589056,2105344,64
+.long 2149589056,2105344,2097152,2147491904
+.long 2147483712,2149580800,2105408,0
+.long 8192,2147483712,2147491904,2149588992
+.long 2149580800,8256,64,2149580864
+.long 16384,512,16777728,16777220
+.long 16794116,16388,16896,0
+.long 16777216,16777732,516,16793600
+.long 4,16794112,16793600,516
+.long 16777732,16384,16388,16794116
+.long 0,16777728,16777220,16896
+.long 16793604,16900,16794112,4
+.long 16900,16793604,512,16777216
+.long 16900,16793600,16793604,516
+.long 16384,512,16777216,16793604
+.long 16777732,16900,16896,0
+.long 512,16777220,4,16777728
+.long 0,16777732,16777728,16896
+.long 516,16384,16794116,16777216
+.long 16794112,4,16388,16794116
+.long 16777220,16794112,16793600,16388
+.long 545259648,545390592,131200,0
+.long 537001984,8388736,545259520,545390720
+.long 128,536870912,8519680,131200
+.long 8519808,537002112,536871040,545259520
+.long 131072,8519808,8388736,537001984
+.long 545390720,536871040,0,8519680
+.long 536870912,8388608,537002112,545259648
+.long 8388608,131072,545390592,128
+.long 8388608,131072,536871040,545390720
+.long 131200,536870912,0,8519680
+.long 545259648,537002112,537001984,8388736
+.long 545390592,128,8388736,537001984
+.long 545390720,8388608,545259520,536871040
+.long 8519680,131200,537002112,545259520
+.long 128,545390592,8519808,0
+.long 536870912,545259648,131072,8519808
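For orientation: the tables above are DES_SPtrans, and with them des-586.S is complete; the file implements the public entry points declared in crypto/des/des.h. A minimal sketch of driving DES_ncbc_encrypt from C follows; it is illustrative only (the key and plaintext bytes are invented, not taken from this import):

#include <string.h>
#include <openssl/des.h>

/* CBC-encrypt one block, then decrypt it again, through the
 * DES_ncbc_encrypt entry point implemented in des-586.S.
 * Returns 1 on a successful round trip. */
static int des_cbc_roundtrip(void)
{
    DES_cblock key = {0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef};
    DES_cblock iv_enc, iv_dec;
    DES_key_schedule ks;
    unsigned char pt[8] = "ABCDEFG";    /* 7 chars + NUL = 8 bytes */
    unsigned char ct[8], out[8];

    DES_set_odd_parity(&key);
    if (DES_set_key_checked(&key, &ks) != 0)
        return 0;                       /* weak key or bad parity */

    memset(&iv_enc, 0, sizeof(iv_enc));
    memset(&iv_dec, 0, sizeof(iv_dec));

    DES_ncbc_encrypt(pt, ct, sizeof(pt), &ks, &iv_enc, DES_ENCRYPT);
    DES_ncbc_encrypt(ct, out, sizeof(ct), &ks, &iv_dec, DES_DECRYPT);

    return memcmp(pt, out, sizeof(pt)) == 0;
}

Note that DES_ncbc_encrypt updates the IV buffer in place (the "n" in ncbc), which is why separate iv_enc and iv_dec copies are kept above.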
diff --git a/crypto/des/des.h b/crypto/des/des.h
index 92b6663..1eaedcb 100644
--- a/crypto/des/des.h
+++ b/crypto/des/des.h
@@ -224,6 +224,9 @@ int DES_set_key(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule);
void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#ifdef OPENSSL_FIPS
+void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#endif
void DES_string_to_key(const char *str,DES_cblock *key);
void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2);
void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length,
diff --git a/crypto/des/set_key.c b/crypto/des/set_key.c
index 3004cc3..da4d62e 100644
--- a/crypto/des/set_key.c
+++ b/crypto/des/set_key.c
@@ -63,6 +63,7 @@
* 1.1 added norm_expand_bits
* 1.0 First working version
*/
+#include <openssl/crypto.h>
#include "des_locl.h"
OPENSSL_IMPLEMENT_GLOBAL(int,DES_check_key,0) /* defaults to false */
@@ -335,6 +336,13 @@ int DES_set_key_checked(const_DES_cblock *key, DES_key_schedule *schedule)
}
void DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(DES);
+ private_DES_set_key_unchecked(key, schedule);
+ }
+void private_DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#endif
{
static const int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0};
register DES_LONG c,d,t,s,t2;
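The hunk above relies on a slightly unusual preprocessor pattern: with OPENSSL_FIPS defined, DES_set_key_unchecked compiles as a small gate that calls fips_cipher_abort() and forwards to private_DES_set_key_unchecked, and the original function body that follows becomes the private_ worker; without OPENSSL_FIPS, the same braces compile directly as DES_set_key_unchecked. A compilable miniature of the pattern, with hypothetical names (MY_FIPS, set_thing) standing in for the real ones:

#include <stdio.h>

#ifdef MY_FIPS                         /* stands in for OPENSSL_FIPS */
static void private_set_thing(int v);
#endif

/* With MY_FIPS defined this is only a gate in front of the private_
 * worker; without it, the braces at the bottom are the public body. */
void set_thing(int v)
#ifdef MY_FIPS
    {
    /* FIPS-mode checks would run here */
    private_set_thing(v);
    }
static void private_set_thing(int v)
#endif
    {
    printf("worker got %d\n", v);
    }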
diff --git a/crypto/des/str2key.c b/crypto/des/str2key.c
index 9c2054b..1077f99 100644
--- a/crypto/des/str2key.c
+++ b/crypto/des/str2key.c
@@ -56,8 +56,8 @@
* [including the GNU Public Licence.]
*/
-#include "des_locl.h"
#include <openssl/crypto.h>
+#include "des_locl.h"
void DES_string_to_key(const char *str, DES_cblock *key)
{
diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h
index 849309a..ea59e61 100644
--- a/crypto/dh/dh.h
+++ b/crypto/dh/dh.h
@@ -86,6 +86,21 @@
* be used for all exponents.
*/
+/* If this flag is set the DH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DH_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DH_FLAG_NON_FIPS_ALLOW 0x0400
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -230,6 +245,9 @@ void ERR_load_DH_strings(void);
#define DH_F_COMPUTE_KEY 102
#define DH_F_DHPARAMS_PRINT_FP 101
#define DH_F_DH_BUILTIN_GENPARAMS 106
+#define DH_F_DH_COMPUTE_KEY 114
+#define DH_F_DH_GENERATE_KEY 115
+#define DH_F_DH_GENERATE_PARAMETERS_EX 116
#define DH_F_DH_NEW_METHOD 105
#define DH_F_DH_PARAM_DECODE 107
#define DH_F_DH_PRIV_DECODE 110
@@ -249,7 +267,9 @@ void ERR_load_DH_strings(void);
#define DH_R_DECODE_ERROR 104
#define DH_R_INVALID_PUBKEY 102
#define DH_R_KEYS_NOT_SET 108
+#define DH_R_KEY_SIZE_TOO_SMALL 110
#define DH_R_MODULUS_TOO_LARGE 103
+#define DH_R_NON_FIPS_METHOD 111
#define DH_R_NO_PARAMETERS_SET 107
#define DH_R_NO_PRIVATE_VALUE 100
#define DH_R_PARAMETER_ENCODING_ERROR 105
diff --git a/crypto/dh/dh_ameth.c b/crypto/dh/dh_ameth.c
index 377caf9..02ec2d4 100644
--- a/crypto/dh/dh_ameth.c
+++ b/crypto/dh/dh_ameth.c
@@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth =
dh_copy_parameters,
dh_cmp_parameters,
dh_param_print,
+ 0,
int_dh_free,
0
diff --git a/crypto/dh/dh_err.c b/crypto/dh/dh_err.c
index d5cf0c2..56d3df7 100644
--- a/crypto/dh/dh_err.c
+++ b/crypto/dh/dh_err.c
@@ -1,6 +1,6 @@
/* crypto/dh/dh_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -73,6 +73,9 @@ static ERR_STRING_DATA DH_str_functs[]=
{ERR_FUNC(DH_F_COMPUTE_KEY), "COMPUTE_KEY"},
{ERR_FUNC(DH_F_DHPARAMS_PRINT_FP), "DHparams_print_fp"},
{ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS), "DH_BUILTIN_GENPARAMS"},
+{ERR_FUNC(DH_F_DH_COMPUTE_KEY), "DH_compute_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_KEY), "DH_generate_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX), "DH_generate_parameters_ex"},
{ERR_FUNC(DH_F_DH_NEW_METHOD), "DH_new_method"},
{ERR_FUNC(DH_F_DH_PARAM_DECODE), "DH_PARAM_DECODE"},
{ERR_FUNC(DH_F_DH_PRIV_DECODE), "DH_PRIV_DECODE"},
@@ -95,7 +98,9 @@ static ERR_STRING_DATA DH_str_reasons[]=
{ERR_REASON(DH_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(DH_R_INVALID_PUBKEY) ,"invalid public key"},
{ERR_REASON(DH_R_KEYS_NOT_SET) ,"keys not set"},
+{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL) ,"key size too small"},
{ERR_REASON(DH_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(DH_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
diff --git a/crypto/dh/dh_gen.c b/crypto/dh/dh_gen.c
index cfd5b11..7b1fe9c 100644
--- a/crypto/dh/dh_gen.c
+++ b/crypto/dh/dh_gen.c
@@ -66,12 +66,29 @@
#include <openssl/bn.h>
#include <openssl/dh.h>
+#ifdef OPENSSL_FIPS
+#include
+#endif
+
static int dh_builtin_genparams(DH *ret, int prime_len, int generator, BN_GENCB *cb);
int DH_generate_parameters_ex(DH *ret, int prime_len, int generator, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(ret->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_PARAMETERS_EX, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->generate_params)
return ret->meth->generate_params(ret, prime_len, generator, cb);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_generate_parameters_ex(ret, prime_len,
+ generator, cb);
+#endif
return dh_builtin_genparams(ret, prime_len, generator, cb);
}
diff --git a/crypto/dh/dh_key.c b/crypto/dh/dh_key.c
index e7db440..89a74db 100644
--- a/crypto/dh/dh_key.c
+++ b/crypto/dh/dh_key.c
@@ -73,11 +73,27 @@ static int dh_finish(DH *dh);
int DH_generate_key(DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->generate_key(dh);
}
int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_COMPUTE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->compute_key(key, pub_key, dh);
}
@@ -138,8 +154,21 @@ static int generate_key(DH *dh)
if (generate_new_key)
{
- l = dh->length ? dh->length : BN_num_bits(dh->p)-1; /* secret exponent length */
- if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ if (dh->q)
+ {
+ do
+ {
+ if (!BN_rand_range(priv_key, dh->q))
+ goto err;
+ }
+ while (BN_is_zero(priv_key) || BN_is_one(priv_key));
+ }
+ else
+ {
+ /* secret exponent length */
+ l = dh->length ? dh->length : BN_num_bits(dh->p)-1;
+ if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ }
}
{
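Besides the FIPS gates, note the generate_key change above: when dh->q is present the private value is now drawn with BN_rand_range over [0, q), rejecting 0 and 1, instead of being a fixed-length random number. A hedged usage sketch of the gated public API; dh_demo and its self-agreement shortcut are illustrative, not part of the patch:

#include <openssl/dh.h>

/* Generate a key pair and derive a shared secret. In FIPS mode the
 * two gated calls below now fail with DH_R_NON_FIPS_METHOD unless the
 * DH method is FIPS-capable or DH_FLAG_NON_FIPS_ALLOW is set. */
static int dh_demo(DH *tmpl)
{
    unsigned char secret[1024];
    int len = -1;
    DH *dh = DHparams_dup(tmpl);       /* duplicate the p, g parameters */

    if (dh == NULL)
        return -1;
    if (DH_generate_key(dh) && DH_size(dh) <= (int)sizeof(secret))
        /* real code would pass the peer's public key here */
        len = DH_compute_key(secret, dh->pub_key, dh);
    DH_free(dh);
    return len;    /* shared-secret length, or -1 on failure */
}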
diff --git a/crypto/dh/dh_lib.c b/crypto/dh/dh_lib.c
index 7aef080..00218f2 100644
--- a/crypto/dh/dh_lib.c
+++ b/crypto/dh/dh_lib.c
@@ -64,6 +64,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DH_version[]="Diffie-Hellman" OPENSSL_VERSION_PTEXT;
static const DH_METHOD *default_DH_method = NULL;
@@ -76,7 +80,16 @@ void DH_set_default_method(const DH_METHOD *meth)
const DH_METHOD *DH_get_default_method(void)
{
if(!default_DH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_openssl();
+ else
+ return DH_OpenSSL();
+#else
default_DH_method = DH_OpenSSL();
+#endif
+ }
return default_DH_method;
}
@@ -156,7 +169,7 @@ DH *DH_new_method(ENGINE *engine)
ret->counter = NULL;
ret->method_mont_p=NULL;
ret->references = 1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DH_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DH, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
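One consequence of the flags change above: DH_FLAG_NON_FIPS_ALLOW is masked out of the method flags at construction time, so the opt-out can only come from the application, per key. An illustrative (hypothetical) opt-in helper:

#include <openssl/dh.h>

/* Explicitly permit a non-FIPS DH method on this one key while in
 * FIPS mode; the application then owns compliance for its use. */
static void dh_allow_non_fips(DH *dh)
{
    dh->flags |= DH_FLAG_NON_FIPS_ALLOW;
}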
diff --git a/crypto/dsa/dsa.h b/crypto/dsa/dsa.h
index ac50a5c..a6f6d0b 100644
--- a/crypto/dsa/dsa.h
+++ b/crypto/dsa/dsa.h
@@ -97,6 +97,21 @@
* be used for all exponents.
*/
+/* If this flag is set the DSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DSA_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DSA_FLAG_NON_FIPS_ALLOW 0x0400
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -272,6 +287,8 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSAPARAMS_PRINT_FP 101
#define DSA_F_DSA_DO_SIGN 112
#define DSA_F_DSA_DO_VERIFY 113
+#define DSA_F_DSA_GENERATE_KEY 124
+#define DSA_F_DSA_GENERATE_PARAMETERS_EX 123
#define DSA_F_DSA_NEW_METHOD 103
#define DSA_F_DSA_PARAM_DECODE 119
#define DSA_F_DSA_PRINT_FP 105
@@ -282,6 +299,7 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSA_SIGN 106
#define DSA_F_DSA_SIGN_SETUP 107
#define DSA_F_DSA_SIG_NEW 109
+#define DSA_F_DSA_SIG_PRINT 125
#define DSA_F_DSA_VERIFY 108
#define DSA_F_I2D_DSA_SIG 111
#define DSA_F_OLD_DSA_PRIV_DECODE 122
@@ -298,6 +316,8 @@ void ERR_load_DSA_strings(void);
#define DSA_R_INVALID_DIGEST_TYPE 106
#define DSA_R_MISSING_PARAMETERS 101
#define DSA_R_MODULUS_TOO_LARGE 103
+#define DSA_R_NEED_NEW_SETUP_VALUES 110
+#define DSA_R_NON_FIPS_DSA_METHOD 111
#define DSA_R_NO_PARAMETERS_SET 107
#define DSA_R_PARAMETER_ENCODING_ERROR 105
diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c
index 6413aae..376156e 100644
--- a/crypto/dsa/dsa_ameth.c
+++ b/crypto/dsa/dsa_ameth.c
@@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder)
return i2d_DSAPrivateKey(pkey->pkey.dsa, pder);
}
+static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+ const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx)
+ {
+ DSA_SIG *dsa_sig;
+ const unsigned char *p;
+ if (!sig)
+ {
+		if (BIO_puts(bp, "<No Signature>\n") <= 0)
+ return 0;
+ else
+ return 1;
+ }
+ p = sig->data;
+ dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length);
+ if (dsa_sig)
+ {
+ int rv = 0;
+ size_t buf_len = 0;
+ unsigned char *m=NULL;
+ update_buflen(dsa_sig->r, &buf_len);
+ update_buflen(dsa_sig->s, &buf_len);
+ m = OPENSSL_malloc(buf_len+10);
+ if (m == NULL)
+ {
+ DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (BIO_write(bp, "\n", 1) != 1)
+ goto err;
+
+ if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent))
+ goto err;
+ if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent))
+ goto err;
+ rv = 1;
+ err:
+ if (m)
+ OPENSSL_free(m);
+ DSA_SIG_free(dsa_sig);
+ return rv;
+ }
+ return X509_signature_dump(bp, sig, indent);
+ }
+
static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
switch (op)
@@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] =
dsa_copy_parameters,
dsa_cmp_parameters,
dsa_param_print,
+ dsa_sig_print,
int_dsa_free,
dsa_pkey_ctrl,
diff --git a/crypto/dsa/dsa_asn1.c b/crypto/dsa/dsa_asn1.c
index c37460b..6058534 100644
--- a/crypto/dsa/dsa_asn1.c
+++ b/crypto/dsa/dsa_asn1.c
@@ -61,6 +61,7 @@
#include <openssl/dsa.h>
#include <openssl/asn1.h>
#include <openssl/asn1t.h>
+#include <openssl/rand.h>
/* Override the default new methods */
static int sig_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -87,7 +88,7 @@ ASN1_SEQUENCE_cb(DSA_SIG, sig_cb) = {
ASN1_SIMPLE(DSA_SIG, s, CBIGNUM)
} ASN1_SEQUENCE_END_cb(DSA_SIG, DSA_SIG)
-IMPLEMENT_ASN1_FUNCTIONS_const(DSA_SIG)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DSA_SIG, DSA_SIG, DSA_SIG)
/* Override the default free and new methods */
static int dsa_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -148,3 +149,40 @@ DSA *DSAparams_dup(DSA *dsa)
{
return ASN1_item_dup(ASN1_ITEM_rptr(DSAparams), dsa);
}
+
+int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
+ unsigned int *siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ RAND_seed(dgst, dlen);
+ s=DSA_do_sign(dgst,dlen,dsa);
+ if (s == NULL)
+ {
+ *siglen=0;
+ return(0);
+ }
+ *siglen=i2d_DSA_SIG(s,&sig);
+ DSA_SIG_free(s);
+ return(1);
+ }
+
+/* data has already been hashed (probably with SHA or SHA-1). */
+/* returns
+ * 1: correct signature
+ * 0: incorrect signature
+ * -1: error
+ */
+int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
+ const unsigned char *sigbuf, int siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ int ret=-1;
+
+ s = DSA_SIG_new();
+ if (s == NULL) return(ret);
+ if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
+ ret=DSA_do_verify(dgst,dgst_len,s,dsa);
+err:
+ DSA_SIG_free(s);
+ return(ret);
+ }
diff --git a/crypto/dsa/dsa_err.c b/crypto/dsa/dsa_err.c
index bba984e..00545b7 100644
--- a/crypto/dsa/dsa_err.c
+++ b/crypto/dsa/dsa_err.c
@@ -1,6 +1,6 @@
/* crypto/dsa/dsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,8 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP), "DSAparams_print_fp"},
{ERR_FUNC(DSA_F_DSA_DO_SIGN), "DSA_do_sign"},
{ERR_FUNC(DSA_F_DSA_DO_VERIFY), "DSA_do_verify"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_KEY), "DSA_generate_key"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_PARAMETERS_EX), "DSA_generate_parameters_ex"},
{ERR_FUNC(DSA_F_DSA_NEW_METHOD), "DSA_new_method"},
{ERR_FUNC(DSA_F_DSA_PARAM_DECODE), "DSA_PARAM_DECODE"},
{ERR_FUNC(DSA_F_DSA_PRINT_FP), "DSA_print_fp"},
@@ -86,6 +88,7 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSA_SIGN), "DSA_sign"},
{ERR_FUNC(DSA_F_DSA_SIGN_SETUP), "DSA_sign_setup"},
{ERR_FUNC(DSA_F_DSA_SIG_NEW), "DSA_SIG_new"},
+{ERR_FUNC(DSA_F_DSA_SIG_PRINT), "DSA_SIG_PRINT"},
{ERR_FUNC(DSA_F_DSA_VERIFY), "DSA_verify"},
{ERR_FUNC(DSA_F_I2D_DSA_SIG), "i2d_DSA_SIG"},
{ERR_FUNC(DSA_F_OLD_DSA_PRIV_DECODE), "OLD_DSA_PRIV_DECODE"},
@@ -105,6 +108,8 @@ static ERR_STRING_DATA DSA_str_reasons[]=
{ERR_REASON(DSA_R_INVALID_DIGEST_TYPE) ,"invalid digest type"},
{ERR_REASON(DSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(DSA_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES) ,"need new setup values"},
+{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD) ,"non fips dsa method"},
{ERR_REASON(DSA_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
{0,NULL}
diff --git a/crypto/dsa/dsa_gen.c b/crypto/dsa/dsa_gen.c
index cb0b453..c398761 100644
--- a/crypto/dsa/dsa_gen.c
+++ b/crypto/dsa/dsa_gen.c
@@ -81,13 +81,33 @@
#include <openssl/sha.h>
#include "dsa_locl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
int DSA_generate_parameters_ex(DSA *ret, int bits,
const unsigned char *seed_in, int seed_len,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(ret->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_PARAMETERS_EX, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->dsa_paramgen)
return ret->meth->dsa_paramgen(ret, bits, seed_in, seed_len,
counter_ret, h_ret, cb);
+#ifdef OPENSSL_FIPS
+ else if (FIPS_mode())
+ {
+ return FIPS_dsa_generate_parameters_ex(ret, bits,
+ seed_in, seed_len,
+ counter_ret, h_ret, cb);
+ }
+#endif
else
{
const EVP_MD *evpmd;
@@ -105,12 +125,13 @@ int DSA_generate_parameters_ex(DSA *ret, int bits,
}
return dsa_builtin_paramgen(ret, bits, qbits, evpmd,
- seed_in, seed_len, counter_ret, h_ret, cb);
+ seed_in, seed_len, NULL, counter_ret, h_ret, cb);
}
}
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
int ok=0;
@@ -201,8 +222,10 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
}
/* step 2 */
- EVP_Digest(seed, qsize, md, NULL, evpmd, NULL);
- EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL);
+ if (!EVP_Digest(seed, qsize, md, NULL, evpmd, NULL))
+ goto err;
+ if (!EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL))
+ goto err;
for (i = 0; i < qsize; i++)
md[i]^=buf2[i];
@@ -251,7 +274,9 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
break;
}
- EVP_Digest(buf, qsize, md ,NULL, evpmd, NULL);
+ if (!EVP_Digest(buf, qsize, md ,NULL, evpmd,
+ NULL))
+ goto err;
/* step 8 */
if (!BN_bin2bn(md, qsize, r0))
@@ -332,6 +357,8 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
}
if (counter_ret != NULL) *counter_ret=counter;
if (h_ret != NULL) *h_ret=h;
+ if (seed_out)
+ memcpy(seed_out, seed, qsize);
}
if(ctx)
{
diff --git a/crypto/dsa/dsa_key.c b/crypto/dsa/dsa_key.c
index c4aa86b..9cf669b 100644
--- a/crypto/dsa/dsa_key.c
+++ b/crypto/dsa/dsa_key.c
@@ -64,12 +64,28 @@
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
static int dsa_builtin_keygen(DSA *dsa);
int DSA_generate_key(DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_KEY, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(dsa->meth->dsa_keygen)
return dsa->meth->dsa_keygen(dsa);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_generate_key(dsa);
+#endif
return dsa_builtin_keygen(dsa);
}
diff --git a/crypto/dsa/dsa_lib.c b/crypto/dsa/dsa_lib.c
index e9b7590..96d8d0c 100644
--- a/crypto/dsa/dsa_lib.c
+++ b/crypto/dsa/dsa_lib.c
@@ -70,6 +70,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DSA_version[]="DSA" OPENSSL_VERSION_PTEXT;
static const DSA_METHOD *default_DSA_method = NULL;
@@ -82,7 +86,16 @@ void DSA_set_default_method(const DSA_METHOD *meth)
const DSA_METHOD *DSA_get_default_method(void)
{
if(!default_DSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_openssl();
+ else
+ return DSA_OpenSSL();
+#else
default_DSA_method = DSA_OpenSSL();
+#endif
+ }
return default_DSA_method;
}
@@ -163,7 +176,7 @@ DSA *DSA_new_method(ENGINE *engine)
ret->method_mont_p=NULL;
ret->references=1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DSA_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DSA, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
@@ -276,7 +289,8 @@ void *DSA_get_ex_data(DSA *d, int idx)
DH *DSA_dup_DH(const DSA *r)
{
/* DSA has p, q, g, optional pub_key, optional priv_key.
- * DH has p, optional length, g, optional pub_key, optional priv_key.
+ * DH has p, optional length, g, optional pub_key, optional priv_key,
+ * optional q.
*/
DH *ret = NULL;
@@ -290,7 +304,11 @@ DH *DSA_dup_DH(const DSA *r)
if ((ret->p = BN_dup(r->p)) == NULL)
goto err;
if (r->q != NULL)
+ {
ret->length = BN_num_bits(r->q);
+ if ((ret->q = BN_dup(r->q)) == NULL)
+ goto err;
+ }
if (r->g != NULL)
if ((ret->g = BN_dup(r->g)) == NULL)
goto err;
diff --git a/crypto/dsa/dsa_locl.h b/crypto/dsa/dsa_locl.h
index 2b8cfee..21e2e45 100644
--- a/crypto/dsa/dsa_locl.h
+++ b/crypto/dsa/dsa_locl.h
@@ -56,4 +56,5 @@
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb);
diff --git a/crypto/dsa/dsa_ossl.c b/crypto/dsa/dsa_ossl.c
index a3ddd7d..b3d78e5 100644
--- a/crypto/dsa/dsa_ossl.c
+++ b/crypto/dsa/dsa_ossl.c
@@ -136,6 +136,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
BN_CTX *ctx=NULL;
int reason=ERR_R_BN_LIB;
DSA_SIG *ret=NULL;
+ int noredo = 0;
BN_init(&m);
BN_init(&xr);
@@ -150,7 +151,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
if (s == NULL) goto err;
ctx=BN_CTX_new();
if (ctx == NULL) goto err;
-
+redo:
if ((dsa->kinv == NULL) || (dsa->r == NULL))
{
if (!DSA_sign_setup(dsa,ctx,&kinv,&r)) goto err;
@@ -161,6 +162,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
dsa->kinv=NULL;
r=dsa->r;
dsa->r=NULL;
+ noredo = 1;
}
@@ -181,6 +183,18 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
ret=DSA_SIG_new();
if (ret == NULL) goto err;
+ /* Redo if r or s is zero as required by FIPS 186-3: this is
+ * very unlikely.
+ */
+ if (BN_is_zero(r) || BN_is_zero(s))
+ {
+ if (noredo)
+ {
+ reason = DSA_R_NEED_NEW_SETUP_VALUES;
+ goto err;
+ }
+ goto redo;
+ }
ret->r = r;
ret->s = s;
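Usage sketch (not part of the patch): the redo logic above cannot repeat the nonce when precomputed values from DSA_sign_setup() were consumed, so a zero r or s then fails with DSA_R_NEED_NEW_SETUP_VALUES. A hedged sketch of that precomputed path; the helper name is hypothetical and the direct use of dsa->kinv/dsa->r assumes the public 1.0.x struct layout.

    #include <openssl/bn.h>
    #include <openssl/dsa.h>

    static DSA_SIG *sign_with_setup(DSA *dsa, const unsigned char *dgst,
                                    int dlen)
        {
        BN_CTX *ctx = BN_CTX_new();

        if (ctx == NULL)
            return NULL;
        /* Stash kinv and r on the DSA object; dsa_do_sign() consumes
         * them and sets noredo, so it cannot retry on r == 0 or s == 0. */
        if (!DSA_sign_setup(dsa, ctx, &dsa->kinv, &dsa->r))
            {
            BN_CTX_free(ctx);
            return NULL;
            }
        BN_CTX_free(ctx);
        return DSA_do_sign(dgst, dlen, dsa);
        }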
diff --git a/crypto/dsa/dsa_pmeth.c b/crypto/dsa/dsa_pmeth.c
index e2df54f..715d8d6 100644
--- a/crypto/dsa/dsa_pmeth.c
+++ b/crypto/dsa/dsa_pmeth.c
@@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
EVP_MD_type((const EVP_MD *)p2) != NID_dsa &&
EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
- EVP_MD_type((const EVP_MD *)p2) != NID_sha256)
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha512)
{
DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE);
return 0;
@@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
if (!dsa)
return 0;
ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd,
- NULL, 0, NULL, NULL, pcb);
+ NULL, 0, NULL, NULL, NULL, pcb);
if (ret)
EVP_PKEY_assign_DSA(pkey, dsa);
else
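Usage sketch (not part of the patch): parameter generation through the EVP layer, whose digest check above now also admits SHA-384 and SHA-512. The helper name is hypothetical and error details are trimmed.

    #include <openssl/evp.h>
    #include <openssl/dsa.h>

    static EVP_PKEY *gen_dsa_params_evp(int nbits)
        {
        EVP_PKEY *params = NULL;
        EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new_id(EVP_PKEY_DSA, NULL);

        if (pctx == NULL)
            return NULL;
        if (EVP_PKEY_paramgen_init(pctx) <= 0
            || EVP_PKEY_CTX_set_dsa_paramgen_bits(pctx, nbits) <= 0
            || EVP_PKEY_paramgen(pctx, &params) <= 0)
            params = NULL;
        EVP_PKEY_CTX_free(pctx);
        return params;
        }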
diff --git a/crypto/dsa/dsa_sign.c b/crypto/dsa/dsa_sign.c
index 17555e5..c3cc364 100644
--- a/crypto/dsa/dsa_sign.c
+++ b/crypto/dsa/dsa_sign.c
@@ -61,30 +61,54 @@
#include "cryptlib.h"
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#include <openssl/bn.h>
DSA_SIG * DSA_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_SIGN, DSA_R_NON_FIPS_DSA_METHOD);
+ return NULL;
+ }
+#endif
return dsa->meth->dsa_do_sign(dgst, dlen, dsa);
}
-int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
- unsigned int *siglen, DSA *dsa)
+int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
{
- DSA_SIG *s;
- RAND_seed(dgst, dlen);
- s=DSA_do_sign(dgst,dlen,dsa);
- if (s == NULL)
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
{
- *siglen=0;
- return(0);
+ DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
}
- *siglen=i2d_DSA_SIG(s,&sig);
- DSA_SIG_free(s);
- return(1);
+#endif
+ return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
}
-int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
+DSA_SIG *DSA_SIG_new(void)
{
- return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
+ DSA_SIG *sig;
+ sig = OPENSSL_malloc(sizeof(DSA_SIG));
+ if (!sig)
+ return NULL;
+ sig->r = NULL;
+ sig->s = NULL;
+ return sig;
+ }
+
+void DSA_SIG_free(DSA_SIG *sig)
+ {
+ if (sig)
+ {
+ if (sig->r)
+ BN_free(sig->r);
+ if (sig->s)
+ BN_free(sig->s);
+ OPENSSL_free(sig);
+ }
}
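Usage sketch (not part of the patch): signing with the DSA_SIG helpers now defined in this file and DER-encoding the result; the helper name is hypothetical.

    #include <openssl/dsa.h>

    /* Returns a malloc'd DER-encoded signature via *der, or 0 on error;
     * i2d_DSA_SIG() allocates the buffer when *der is NULL. */
    static int sign_digest(DSA *dsa, const unsigned char *dgst, int dlen,
                           unsigned char **der, int *derlen)
        {
        DSA_SIG *sig = DSA_do_sign(dgst, dlen, dsa);

        if (sig == NULL)
            return 0;
        *der = NULL;
        *derlen = i2d_DSA_SIG(sig, der);
        DSA_SIG_free(sig);
        return *derlen > 0;
        }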
diff --git a/crypto/dsa/dsa_vrf.c b/crypto/dsa/dsa_vrf.c
index 226a75f..674cb5f 100644
--- a/crypto/dsa/dsa_vrf.c
+++ b/crypto/dsa/dsa_vrf.c
@@ -64,26 +64,13 @@
int DSA_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_VERIFY, DSA_R_NON_FIPS_DSA_METHOD);
+ return -1;
+ }
+#endif
return dsa->meth->dsa_do_verify(dgst, dgst_len, sig, dsa);
}
-
-/* data has already been hashed (probably with SHA or SHA-1). */
-/* returns
- * 1: correct signature
- * 0: incorrect signature
- * -1: error
- */
-int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
- const unsigned char *sigbuf, int siglen, DSA *dsa)
- {
- DSA_SIG *s;
- int ret=-1;
-
- s = DSA_SIG_new();
- if (s == NULL) return(ret);
- if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
- ret=DSA_do_verify(dgst,dgst_len,s,dsa);
-err:
- DSA_SIG_free(s);
- return(ret);
- }
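Usage sketch (not part of the patch): DSA_do_verify() is tri-state, and the FIPS gate above adds a new way to reach the -1 error branch, so callers must treat only 1 as success. The helper name is hypothetical.

    #include <openssl/dsa.h>

    static int check_dsa_sig(DSA *dsa, const unsigned char *dgst, int dlen,
                             DSA_SIG *sig)
        {
        int rc = DSA_do_verify(dgst, dlen, sig, dsa);

        if (rc < 0)
            return -1; /* error, e.g. non-FIPS method in FIPS mode */
        return rc;     /* 1 = valid signature, 0 = invalid */
        }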
diff --git a/crypto/dso/dso_dlfcn.c b/crypto/dso/dso_dlfcn.c
index c2bc617..5f22548 100644
--- a/crypto/dso/dso_dlfcn.c
+++ b/crypto/dso/dso_dlfcn.c
@@ -86,7 +86,8 @@ DSO_METHOD *DSO_METHOD_dlfcn(void)
# if defined(_AIX) || defined(__CYGWIN__) || \
defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
(defined(__osf__) && !defined(RTLD_NEXT)) || \
- (defined(__OpenBSD__) && !defined(RTLD_SELF))
+ (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \
+ defined(__ANDROID__)
# undef HAVE_DLINFO
# endif
#endif
diff --git a/crypto/dso/dso_lib.c b/crypto/dso/dso_lib.c
index 8a15b79..7801529 100644
--- a/crypto/dso/dso_lib.c
+++ b/crypto/dso/dso_lib.c
@@ -237,11 +237,19 @@ DSO *DSO_load(DSO *dso, const char *filename, DSO_METHOD *meth, int flags)
if(ret->meth->dso_load == NULL)
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_UNSUPPORTED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
if(!ret->meth->dso_load(ret))
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_LOAD_FAILED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
/* Load succeeded */
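Usage sketch (not part of the patch): DSO_load() with a NULL first argument allocates the DSO itself, and the patched failure paths clear ret->filename so a half-loaded DSO is not mistaken for a loaded one. The helper name is hypothetical.

    #include <openssl/dso.h>

    static DSO *load_module(const char *name)
        {
        /* NULL dso and meth: allocate a fresh DSO with the default
         * method; returns NULL on failure. */
        return DSO_load(NULL, name, NULL, 0);
        }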
diff --git a/crypto/ec/ec.h b/crypto/ec/ec.h
index ee70781..dfe8710 100644
--- a/crypto/ec/ec.h
+++ b/crypto/ec/ec.h
@@ -151,7 +151,24 @@ const EC_METHOD *EC_GFp_mont_method(void);
*/
const EC_METHOD *EC_GFp_nist_method(void);
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/** Returns 64-bit optimized methods for nistp224
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp224_method(void);
+
+/** Returns 64-bit optimized methods for nistp256
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp256_method(void);
+
+/** Returns 64-bit optimized methods for nistp521
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp521_method(void);
+#endif
+#ifndef OPENSSL_NO_EC2M
/********************************************************************/
/* EC_METHOD for curves over GF(2^m) */
/********************************************************************/
@@ -161,6 +178,8 @@ const EC_METHOD *EC_GFp_nist_method(void);
*/
const EC_METHOD *EC_GF2m_simple_method(void);
+#endif
+
/********************************************************************/
/* EC_GROUP functions */
@@ -255,10 +274,10 @@ int EC_GROUP_get_curve_name(const EC_GROUP *group);
void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag);
int EC_GROUP_get_asn1_flag(const EC_GROUP *group);
-void EC_GROUP_set_point_conversion_form(EC_GROUP *, point_conversion_form_t);
+void EC_GROUP_set_point_conversion_form(EC_GROUP *group, point_conversion_form_t form);
point_conversion_form_t EC_GROUP_get_point_conversion_form(const EC_GROUP *);
-unsigned char *EC_GROUP_get0_seed(const EC_GROUP *);
+unsigned char *EC_GROUP_get0_seed(const EC_GROUP *x);
size_t EC_GROUP_get_seed_len(const EC_GROUP *);
size_t EC_GROUP_set_seed(EC_GROUP *, const unsigned char *, size_t len);
@@ -282,6 +301,7 @@ int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, co
*/
int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
/** Sets the parameter of a ec over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b
* \param group EC_GROUP object
* \param p BIGNUM with the polynomial defining the underlying field
@@ -301,7 +321,7 @@ int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, c
* \return 1 on success and 0 if an error occured
*/
int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Returns the number of bits needed to represent a field element
* \param group EC_GROUP object
* \return number of bits needed to represent a field element
@@ -342,7 +362,7 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx);
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Creates a new EC_GROUP object with the specified parameters defined
* over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b)
* \param p BIGNUM with the polynomial defining the underlying field
@@ -352,7 +372,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Creates a EC_GROUP object with a curve specified by a NID
* \param nid NID of the OID of the curve name
* \return newly created EC_GROUP object with specified curve or NULL
@@ -481,7 +501,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Sets the affine coordinates of a EC_POINT over GF2m
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -514,7 +534,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#endif
/** Encodes a EC_POINT object to a octet string
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -606,8 +626,8 @@ int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_CTX *c
*/
int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
-int EC_POINT_make_affine(const EC_GROUP *, EC_POINT *, BN_CTX *);
-int EC_POINTs_make_affine(const EC_GROUP *, size_t num, EC_POINT *[], BN_CTX *);
+int EC_POINT_make_affine(const EC_GROUP *group, EC_POINT *point, BN_CTX *ctx);
+int EC_POINTs_make_affine(const EC_GROUP *group, size_t num, EC_POINT *points[], BN_CTX *ctx);
/** Computes r = generator * n sum_{i=0}^num p[i] * m[i]
* \param group underlying EC_GROUP object
@@ -653,9 +673,11 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group);
/* EC_GROUP_get_basis_type() returns the NID of the basis type
* used to represent the field elements */
int EC_GROUP_get_basis_type(const EC_GROUP *);
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1,
unsigned int *k2, unsigned int *k3);
+#endif
#define OPENSSL_EC_NAMED_CURVE 0x001
@@ -689,11 +711,21 @@ typedef struct ec_key_st EC_KEY;
#define EC_PKEY_NO_PARAMETERS 0x001
#define EC_PKEY_NO_PUBKEY 0x002
+/* some values for the flags field */
+#define EC_FLAG_NON_FIPS_ALLOW 0x1
+#define EC_FLAG_FIPS_CHECKED 0x2
+
/** Creates a new EC_KEY object.
* \return EC_KEY object or NULL if an error occurred.
*/
EC_KEY *EC_KEY_new(void);
+int EC_KEY_get_flags(const EC_KEY *key);
+
+void EC_KEY_set_flags(EC_KEY *key, int flags);
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags);
+
/** Creates a new EC_KEY object using a named curve as underlying
* EC_GROUP object.
* \param nid NID of the named curve.
@@ -768,16 +800,24 @@ const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key);
int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub);
unsigned EC_KEY_get_enc_flags(const EC_KEY *key);
-void EC_KEY_set_enc_flags(EC_KEY *, unsigned int);
-point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *);
-void EC_KEY_set_conv_form(EC_KEY *, point_conversion_form_t);
+void EC_KEY_set_enc_flags(EC_KEY *eckey, unsigned int flags);
+point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key);
+void EC_KEY_set_conv_form(EC_KEY *eckey, point_conversion_form_t cform);
/* functions to set/get method specific data */
-void *EC_KEY_get_key_method_data(EC_KEY *,
+void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
-void EC_KEY_insert_key_method_data(EC_KEY *, void *data,
+/** Sets the key method data of an EC_KEY object, if none has yet been set.
+ * \param key EC_KEY object
+ * \param data opaque data to install.
+ * \param dup_func a function that duplicates |data|.
+ * \param free_func a function that frees |data|.
+ * \param clear_free_func a function that wipes and frees |data|.
+ * \return the previously set data pointer, or NULL if |data| was inserted.
+ */
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
/* wrapper functions for the underlying EC_GROUP object */
-void EC_KEY_set_asn1_flag(EC_KEY *, int);
+void EC_KEY_set_asn1_flag(EC_KEY *eckey, int asn1_flag);
/** Creates a table of pre-computed multiples of the generator to
* accelerate further EC_KEY operations.
@@ -799,6 +839,15 @@ int EC_KEY_generate_key(EC_KEY *key);
*/
int EC_KEY_check_key(const EC_KEY *key);
+/** Sets a public key from affine coordinates, performing
+ * necessary NIST PKV tests.
+ * \param key the EC_KEY object
+ * \param x public key x coordinate
+ * \param y public key y coordinate
+ * \return 1 on success and 0 otherwise.
+ */
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+
/********************************************************************/
/* de- and encoding functions for SEC1 ECPrivateKey */
@@ -926,6 +975,7 @@ void ERR_load_EC_strings(void);
/* Error codes for the EC functions. */
/* Function codes. */
+#define EC_F_BN_TO_FELEM 224
#define EC_F_COMPUTE_WNAF 143
#define EC_F_D2I_ECPARAMETERS 144
#define EC_F_D2I_ECPKPARAMETERS 145
@@ -968,6 +1018,15 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_GFP_MONT_FIELD_SQR 132
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE 189
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP 135
+#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE 225
+#define EC_F_EC_GFP_NISTP224_POINTS_MUL 228
+#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226
+#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE 230
+#define EC_F_EC_GFP_NISTP256_POINTS_MUL 231
+#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232
+#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE 233
+#define EC_F_EC_GFP_NISTP521_POINTS_MUL 234
+#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235
#define EC_F_EC_GFP_NIST_FIELD_MUL 200
#define EC_F_EC_GFP_NIST_FIELD_SQR 201
#define EC_F_EC_GFP_NIST_GROUP_SET_CURVE 202
@@ -1010,6 +1069,7 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_KEY_NEW 182
#define EC_F_EC_KEY_PRINT 180
#define EC_F_EC_KEY_PRINT_FP 181
+#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES 229
#define EC_F_EC_POINTS_MAKE_AFFINE 136
#define EC_F_EC_POINT_ADD 112
#define EC_F_EC_POINT_CMP 113
@@ -1040,6 +1100,9 @@ void ERR_load_EC_strings(void);
#define EC_F_I2D_ECPKPARAMETERS 191
#define EC_F_I2D_ECPRIVATEKEY 192
#define EC_F_I2O_ECPUBLICKEY 151
+#define EC_F_NISTP224_PRE_COMP_NEW 227
+#define EC_F_NISTP256_PRE_COMP_NEW 236
+#define EC_F_NISTP521_PRE_COMP_NEW 237
#define EC_F_O2I_ECPUBLICKEY 152
#define EC_F_OLD_EC_PRIV_DECODE 222
#define EC_F_PKEY_EC_CTRL 197
@@ -1052,12 +1115,15 @@ void ERR_load_EC_strings(void);
/* Reason codes. */
#define EC_R_ASN1_ERROR 115
#define EC_R_ASN1_UNKNOWN_FIELD 116
+#define EC_R_BIGNUM_OUT_OF_RANGE 144
#define EC_R_BUFFER_TOO_SMALL 100
+#define EC_R_COORDINATES_OUT_OF_RANGE 146
#define EC_R_D2I_ECPKPARAMETERS_FAILURE 117
#define EC_R_DECODE_ERROR 142
#define EC_R_DISCRIMINANT_IS_ZERO 118
#define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE 119
#define EC_R_FIELD_TOO_LARGE 143
+#define EC_R_GF2M_NOT_SUPPORTED 147
#define EC_R_GROUP2PKPARAMETERS_FAILURE 120
#define EC_R_I2D_ECPKPARAMETERS_FAILURE 121
#define EC_R_INCOMPATIBLE_OBJECTS 101
@@ -1092,6 +1158,7 @@ void ERR_load_EC_strings(void);
#define EC_R_UNKNOWN_GROUP 129
#define EC_R_UNKNOWN_ORDER 114
#define EC_R_UNSUPPORTED_FIELD 131
+#define EC_R_WRONG_CURVE_PARAMETERS 145
#define EC_R_WRONG_ORDER 130
#ifdef __cplusplus
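Usage sketch (not part of the patch): exercising the new EC_KEY flag accessors declared above on a P-256 key, which this patch can back with the optimized nistp256 method. The helper name is hypothetical.

    #include <openssl/ec.h>
    #include <openssl/obj_mac.h>

    static EC_KEY *make_p256_key(void)
        {
        EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);

        if (key == NULL)
            return NULL;
        /* Assumption: the caller explicitly opts out of FIPS enforcement. */
        EC_KEY_set_flags(key, EC_FLAG_NON_FIPS_ALLOW);
        if (!EC_KEY_generate_key(key))
            {
            EC_KEY_free(key);
            return NULL;
            }
        return key;
        }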
diff --git a/crypto/ec/ec2_mult.c b/crypto/ec/ec2_mult.c
index e12b9b2..26f4a78 100644
--- a/crypto/ec/ec2_mult.c
+++ b/crypto/ec/ec2_mult.c
@@ -71,6 +71,8 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
/* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective
* coordinates.
@@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group)
{
return ec_wNAF_have_precompute_mult(group);
}
+
+#endif
diff --git a/crypto/ec/ec2_oct.c b/crypto/ec/ec2_oct.c
new file mode 100644
index 0000000..f1d75e5
--- /dev/null
+++ b/crypto/ec/ec2_oct.c
@@ -0,0 +1,407 @@
+/* crypto/ec/ec2_oct.c */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
+ * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
+ * to the OpenSSL project.
+ *
+ * The ECC Code is licensed pursuant to the OpenSSL open source
+ * license provided below.
+ *
+ * The software is originally written by Sheueling Chang Shantz and
+ * Douglas Stebila of Sun Microsystems Laboratories.
+ *
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <openssl/err.h>
+
+#include "ec_lcl.h"
+
+#ifndef OPENSSL_NO_EC2M
+
+/* Calculates and sets the affine coordinates of an EC_POINT from the given
+ * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
+ * Note that the simple implementation only uses affine coordinates.
+ *
+ * The method is from the following publication:
+ *
+ * Harper, Menezes, Vanstone:
+ * "Public-Key Cryptosystems with Very Small Key Lengths",
+ * EUROCRYPT '92, Springer-Verlag LNCS 658,
+ * published February 1993
+ *
+ * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
+ * the same method, but claim no priority date earlier than July 29, 1994
+ * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
+ */
+int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp, *x, *y, *z;
+ int ret = 0, z0;
+
+ /* clear error queue */
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0) ? 1 : 0;
+
+ BN_CTX_start(ctx);
+ tmp = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ z = BN_CTX_get(ctx);
+ if (z == NULL) goto err;
+
+ if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
+ if (BN_is_zero(x))
+ {
+ if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
+ }
+ else
+ {
+ if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
+ if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
+ if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
+ if (!BN_GF2m_add(tmp, x, tmp)) goto err;
+ if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+ z0 = (BN_is_odd(z)) ? 1 : 0;
+ if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
+ if (z0 != y_bit)
+ {
+ if (!BN_GF2m_add(y, y, x)) goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
+
+/* Converts an EC_POINT to an octet string.
+ * If buf is NULL, the encoded length will be returned.
+ * If the length len of buf is smaller than required an error will be returned.
+ */
+size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ buf[0] = form;
+ if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (BN_is_odd(yxi)) buf[0]++;
+ }
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+/* Converts an octet string representation to an EC_POINT.
+ * Note that the simple implementation only uses affine coordinates.
+ */
+int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (y_bit != BN_is_odd(yxi))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+#endif
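Usage sketch (not part of the patch): the buf == NULL length-query convention implemented by ec_GF2m_simple_point2oct() above, driven through the public wrapper; the helper name is hypothetical.

    #include <openssl/ec.h>
    #include <openssl/crypto.h>

    static unsigned char *encode_point(const EC_GROUP *g, const EC_POINT *p,
                                       size_t *outlen, BN_CTX *ctx)
        {
        unsigned char *buf;
        size_t len = EC_POINT_point2oct(g, p, POINT_CONVERSION_COMPRESSED,
                                        NULL, 0, ctx); /* query length */

        if (len == 0 || (buf = OPENSSL_malloc(len)) == NULL)
            return NULL;
        if (EC_POINT_point2oct(g, p, POINT_CONVERSION_COMPRESSED,
                               buf, len, ctx) != len)
            {
            OPENSSL_free(buf);
            return NULL;
            }
        *outlen = len;
        return buf;
        }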
diff --git a/crypto/ec/ec2_smpl.c b/crypto/ec/ec2_smpl.c
index af94458..e0e59c7 100644
--- a/crypto/ec/ec2_smpl.c
+++ b/crypto/ec/ec2_smpl.c
@@ -71,10 +71,20 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GF2m_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gf2m_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_characteristic_two_field,
ec_GF2m_simple_group_init,
ec_GF2m_simple_group_finish,
@@ -93,9 +103,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* get_Jprojective_coordinates_GFp */,
ec_GF2m_simple_point_set_affine_coordinates,
ec_GF2m_simple_point_get_affine_coordinates,
- ec_GF2m_simple_set_compressed_coordinates,
- ec_GF2m_simple_point2oct,
- ec_GF2m_simple_oct2point,
+ 0,0,0,
ec_GF2m_simple_add,
ec_GF2m_simple_dbl,
ec_GF2m_simple_invert,
@@ -118,6 +126,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -405,340 +414,6 @@ int ec_GF2m_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_
return ret;
}
-
-/* Calculates and sets the affine coordinates of an EC_POINT from the given
- * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
- * Note that the simple implementation only uses affine coordinates.
- *
- * The method is from the following publication:
- *
- * Harper, Menezes, Vanstone:
- * "Public-Key Cryptosystems with Very Small Key Lengths",
- * EUROCRYPT '92, Springer-Verlag LNCS 658,
- * published February 1993
- *
- * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
- * the same method, but claim no priority date earlier than July 29, 1994
- * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
- */
-int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp, *x, *y, *z;
- int ret = 0, z0;
-
- /* clear error queue */
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0) ? 1 : 0;
-
- BN_CTX_start(ctx);
- tmp = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- z = BN_CTX_get(ctx);
- if (z == NULL) goto err;
-
- if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
- if (BN_is_zero(x))
- {
- if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
- }
- else
- {
- if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
- if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
- if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
- if (!BN_GF2m_add(tmp, x, tmp)) goto err;
- if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
- z0 = (BN_is_odd(z)) ? 1 : 0;
- if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
- if (z0 != y_bit)
- {
- if (!BN_GF2m_add(y, y, x)) goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-/* Converts an EC_POINT to an octet string.
- * If buf is NULL, the encoded length will be returned.
- * If the length len of buf is smaller than required an error will be returned.
- */
-size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y, *yxi;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- buf[0] = form;
- if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (BN_is_odd(yxi)) buf[0]++;
- }
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-/* Converts an octet string representation to an EC_POINT.
- * Note that the simple implementation only uses affine coordinates.
- */
-int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y, *yxi;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (y_bit != BN_is_odd(yxi))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
/* Computes a + b and stores the result in r. r could be a or b, a could be b.
* Uses algorithm A.10.2 of IEEE P1363.
*/
@@ -887,7 +562,7 @@ int ec_GF2m_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_
field_sqr = group->meth->field_sqr;
/* only support affine coordinates */
- if (!point->Z_is_one) goto err;
+ if (!point->Z_is_one) return -1;
if (ctx == NULL)
{
@@ -1040,3 +715,5 @@ int ec_GF2m_simple_field_div(const EC_GROUP *group, BIGNUM *r, const BIGNUM *a,
{
return BN_GF2m_mod_div(r, a, b, &group->field, ctx);
}
+
+#endif
diff --git a/crypto/ec/ec_ameth.c b/crypto/ec/ec_ameth.c
index c00f7d7..83909c1 100644
--- a/crypto/ec/ec_ameth.c
+++ b/crypto/ec/ec_ameth.c
@@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth =
ec_copy_parameters,
ec_cmp_parameters,
eckey_param_print,
+ 0,
int_ec_free,
ec_pkey_ctrl,
diff --git a/crypto/ec/ec_asn1.c b/crypto/ec/ec_asn1.c
index ae55539..175eec5 100644
--- a/crypto/ec/ec_asn1.c
+++ b/crypto/ec/ec_asn1.c
@@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group)
/* everything else is currently not supported */
return 0;
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
{
if (group == NULL)
@@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
return 1;
}
-
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
unsigned int *k2, unsigned int *k3)
{
@@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
return 1;
}
-
+#endif
/* some structures needed for the asn1 encoding */
@@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
else /* nid == NID_X9_62_characteristic_two_field */
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
int field_type;
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
}
+#endif
ok = 1;
@@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* nid == NID_X9_62_characteristic_two_field */
{
if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL))
@@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
-
+#endif
len_1 = (size_t)BN_num_bytes(tmp_1);
len_2 = (size_t)BN_num_bytes(tmp_2);
@@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* get the field parameters */
tmp = OBJ_obj2nid(params->fieldID->fieldType);
-
if (tmp == NID_X9_62_characteristic_two_field)
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* create the EC_GROUP structure */
ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL);
}
+#endif
else if (tmp == NID_X9_62_prime_field)
{
/* we have a curve over a prime field */
@@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len)
if ((group = ec_asn1_pkparameters2group(params)) == NULL)
{
ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE);
+ ECPKPARAMETERS_free(params);
return NULL;
}
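Usage sketch (not part of the patch): with the leak fix above, a failed d2i_ECPKParameters() no longer strands the intermediate ECPKPARAMETERS, so callers only manage the returned group. The helper name is hypothetical.

    #include <openssl/ec.h>

    static EC_GROUP *parse_ec_params(const unsigned char *der, long len)
        {
        const unsigned char *p = der; /* d2i advances this pointer */
        return d2i_ECPKParameters(NULL, &p, len);
        }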
diff --git a/crypto/ec/ec_curve.c b/crypto/ec/ec_curve.c
index 23274e4..c72fb26 100644
--- a/crypto/ec/ec_curve.c
+++ b/crypto/ec/ec_curve.c
@@ -3,7 +3,7 @@
* Written by Nils Larsch for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -72,6 +72,7 @@
#include "ec_lcl.h"
#include <openssl/err.h>
#include <openssl/obj_mac.h>
+#include <openssl/opensslconf.h>
typedef struct {
int field_type, /* either NID_X9_62_prime_field or
@@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; }
0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D }
};
+#ifndef OPENSSL_NO_EC2M
+
/* characteristic two curves */
static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; }
_EC_SECG_CHAR2_113R1 = {
@@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; }
{ 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */
0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD,
- 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
+ 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
0x07,
0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */
@@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; }
0xBA,0xFC,0xA7,0x5E }
};
+#endif
+
typedef struct _ec_list_element_st {
int nid;
const EC_CURVE_DATA *data;
+ const EC_METHOD *(*meth)(void);
const char *comment;
} ec_list_element;
static const ec_list_element curve_list[] = {
- /* prime field curves */
+ /* prime field curves */
/* secg curves */
- { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"},
- { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"},
- { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"},
- { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
+ { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" },
+ { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
/* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */
- { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"},
- { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"},
- { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"},
- { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"},
+ { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" },
+ { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" },
+#else
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" },
+#endif
+ { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" },
/* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */
- { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"},
- { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"},
+ { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" },
+#else
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" },
+#endif
/* X9.62 curves */
- { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"},
- { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"},
+ { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" },
+#else
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" },
+#endif
+#ifndef OPENSSL_NO_EC2M
/* characteristic two field curves */
/* NIST/SECG curves */
- { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"},
- { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"},
- { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"},
- { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" },
- { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"},
- { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" },
- { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"},
- { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"},
- { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"},
- { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" },
- { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" },
+ { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" },
+ { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" },
+ { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" },
+ { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" },
+ { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" },
/* X9.62 curves */
- { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"},
- { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"},
- { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"},
- { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"},
- { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"},
- { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"},
- { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"},
+ { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" },
+ { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" },
+ { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" },
+ { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" },
+ { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" },
+ { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" },
+ { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" },
/* the WAP/WTLS curves
* [unlike SECG, spec has its own OIDs for curves from X9.62] */
- { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" },
- { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"},
+ { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+#endif
+ { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
+ { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+#endif
+ { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
/* IPSec curves */
- { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
- { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
+ { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+ { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+#endif
};
#define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element))
-static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
+static EC_GROUP *ec_group_new_from_data(const ec_list_element curve)
{
EC_GROUP *group=NULL;
EC_POINT *P=NULL;
BN_CTX *ctx=NULL;
- BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
+ BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
int ok=0;
int seed_len,param_len;
+ const EC_METHOD *meth;
+ const EC_CURVE_DATA *data;
const unsigned char *params;
if ((ctx = BN_CTX_new()) == NULL)
@@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
+ data = curve.data;
seed_len = data->seed_len;
param_len = data->param_len;
- params = (const unsigned char *)(data+1); /* skip header */
- params += seed_len; /* skip seed */
+ params = (const unsigned char *)(data+1); /* skip header */
+ params += seed_len; /* skip seed */
if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL))
|| !(a = BN_bin2bn(params+1*param_len, param_len, NULL))
@@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
- if (data->field_type == NID_X9_62_prime_field)
+ if (curve.meth != 0)
+ {
+ meth = curve.meth();
+ if (((group = EC_GROUP_new(meth)) == NULL) ||
+ (!(group->meth->group_set_curve(group, p, a, b, ctx))))
+ {
+ ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
+ goto err;
+ }
+ }
+ else if (data->field_type == NID_X9_62_prime_field)
{
if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL)
{
@@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* field_type == NID_X9_62_characteristic_two_field */
{
if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL)
@@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#endif
if ((P = EC_POINT_new(group)) == NULL)
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
}
-
+
if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL))
|| !(y = BN_bin2bn(params+4*param_len, param_len, NULL)))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB);
goto err;
}
- if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx))
+ if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
@@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid)
	for (i=0; i<curve_list_length; i++)
diff --git a/crypto/ec/ec_cvt.c b/crypto/ec/ec_cvt.c
--- a/crypto/ec/ec_cvt.c
+++ b/crypto/ec/ec_cvt.c
+#if defined(OPENSSL_BN_ASM_MONT)
+	/*
+	 * The generic Montgomery prime method was observed to deliver
+	 * better performance than the NIST method, even for NIST primes,
+	 * on a range of platforms, so prefer it when assembler Montgomery
+	 * support is available.
+	 */
+ meth = EC_GFp_mont_method();
+#else
meth = EC_GFp_nist_method();
+#endif
ret = EC_GROUP_new(meth);
if (ret == NULL)
@@ -122,7 +147,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
-
+#ifndef OPENSSL_NO_EC2M
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
const EC_METHOD *meth;
@@ -142,3 +167,4 @@ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
+#endif
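
For orientation, a caller-side sketch (not part of the patch; the NIST P-224 constants are copied in hex purely for illustration) showing that the method switch above is transparent to applications: the same EC_GROUP_new_curve_GFp call lands on EC_GFp_mont_method() when OPENSSL_BN_ASM_MONT is defined and on EC_GFp_nist_method() otherwise.

#include <stdio.h>
#include <openssl/bn.h>
#include <openssl/ec.h>

int main(void)
	{
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *p = NULL, *a = NULL, *b = NULL;
	EC_GROUP *group;

	/* NIST P-224: y^2 = x^3 + a*x + b over GF(p), with a = p - 3 */
	BN_hex2bn(&p, "ffffffffffffffffffffffffffffffff000000000000000000000001");
	BN_hex2bn(&a, "fffffffffffffffffffffffffffffffefffffffffffffffffffffffe");
	BN_hex2bn(&b, "b4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4");

	/* the method (Montgomery vs. NIST) is chosen internally */
	group = EC_GROUP_new_curve_GFp(p, a, b, ctx);
	printf("group %screated\n", group != NULL ? "" : "NOT ");

	EC_GROUP_free(group);
	BN_free(p); BN_free(a); BN_free(b);
	BN_CTX_free(ctx);
	return 0;
	}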
diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c
index 84b4833..0d19398 100644
--- a/crypto/ec/ec_err.c
+++ b/crypto/ec/ec_err.c
@@ -1,6 +1,6 @@
/* crypto/ec/ec_err.c */
/* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA EC_str_functs[]=
{
+{ERR_FUNC(EC_F_BN_TO_FELEM), "BN_TO_FELEM"},
{ERR_FUNC(EC_F_COMPUTE_WNAF), "COMPUTE_WNAF"},
{ERR_FUNC(EC_F_D2I_ECPARAMETERS), "d2i_ECParameters"},
{ERR_FUNC(EC_F_D2I_ECPKPARAMETERS), "d2i_ECPKParameters"},
@@ -112,6 +113,15 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_GFP_MONT_FIELD_SQR), "ec_GFp_mont_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE), "ec_GFp_mont_group_set_curve"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP), "EC_GFP_MONT_GROUP_SET_CURVE_GFP"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE), "ec_GFp_nistp224_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINTS_MUL), "ec_GFp_nistp224_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp224_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE), "ec_GFp_nistp256_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINTS_MUL), "ec_GFp_nistp256_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp256_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE), "ec_GFp_nistp521_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINTS_MUL), "ec_GFp_nistp521_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp521_point_get_affine_coordinates"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_MUL), "ec_GFp_nist_field_mul"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_SQR), "ec_GFp_nist_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_NIST_GROUP_SET_CURVE), "ec_GFp_nist_group_set_curve"},
@@ -154,6 +164,7 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_KEY_NEW), "EC_KEY_new"},
{ERR_FUNC(EC_F_EC_KEY_PRINT), "EC_KEY_print"},
{ERR_FUNC(EC_F_EC_KEY_PRINT_FP), "EC_KEY_print_fp"},
+{ERR_FUNC(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES), "EC_KEY_set_public_key_affine_coordinates"},
{ERR_FUNC(EC_F_EC_POINTS_MAKE_AFFINE), "EC_POINTs_make_affine"},
{ERR_FUNC(EC_F_EC_POINT_ADD), "EC_POINT_add"},
{ERR_FUNC(EC_F_EC_POINT_CMP), "EC_POINT_cmp"},
@@ -184,6 +195,9 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_I2D_ECPKPARAMETERS), "i2d_ECPKParameters"},
{ERR_FUNC(EC_F_I2D_ECPRIVATEKEY), "i2d_ECPrivateKey"},
{ERR_FUNC(EC_F_I2O_ECPUBLICKEY), "i2o_ECPublicKey"},
+{ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW), "NISTP224_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW), "NISTP256_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW), "NISTP521_PRE_COMP_NEW"},
{ERR_FUNC(EC_F_O2I_ECPUBLICKEY), "o2i_ECPublicKey"},
{ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE), "OLD_EC_PRIV_DECODE"},
{ERR_FUNC(EC_F_PKEY_EC_CTRL), "PKEY_EC_CTRL"},
@@ -199,12 +213,15 @@ static ERR_STRING_DATA EC_str_reasons[]=
{
{ERR_REASON(EC_R_ASN1_ERROR) ,"asn1 error"},
{ERR_REASON(EC_R_ASN1_UNKNOWN_FIELD) ,"asn1 unknown field"},
+{ERR_REASON(EC_R_BIGNUM_OUT_OF_RANGE) ,"bignum out of range"},
{ERR_REASON(EC_R_BUFFER_TOO_SMALL) ,"buffer too small"},
+{ERR_REASON(EC_R_COORDINATES_OUT_OF_RANGE),"coordinates out of range"},
{ERR_REASON(EC_R_D2I_ECPKPARAMETERS_FAILURE),"d2i ecpkparameters failure"},
{ERR_REASON(EC_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(EC_R_DISCRIMINANT_IS_ZERO) ,"discriminant is zero"},
{ERR_REASON(EC_R_EC_GROUP_NEW_BY_NAME_FAILURE),"ec group new by name failure"},
{ERR_REASON(EC_R_FIELD_TOO_LARGE) ,"field too large"},
+{ERR_REASON(EC_R_GF2M_NOT_SUPPORTED) ,"gf2m not supported"},
{ERR_REASON(EC_R_GROUP2PKPARAMETERS_FAILURE),"group2pkparameters failure"},
{ERR_REASON(EC_R_I2D_ECPKPARAMETERS_FAILURE),"i2d ecpkparameters failure"},
{ERR_REASON(EC_R_INCOMPATIBLE_OBJECTS) ,"incompatible objects"},
@@ -239,6 +256,7 @@ static ERR_STRING_DATA EC_str_reasons[]=
{ERR_REASON(EC_R_UNKNOWN_GROUP) ,"unknown group"},
{ERR_REASON(EC_R_UNKNOWN_ORDER) ,"unknown order"},
{ERR_REASON(EC_R_UNSUPPORTED_FIELD) ,"unsupported field"},
+{ERR_REASON(EC_R_WRONG_CURVE_PARAMETERS) ,"wrong curve parameters"},
{ERR_REASON(EC_R_WRONG_ORDER) ,"wrong order"},
{0,NULL}
};
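
The new function and reason strings above only surface once the error tables are loaded. A minimal sketch, using only standard 1.0.x APIs, of how an EC failure prints with these strings:

#include <stdio.h>
#include <openssl/ec.h>
#include <openssl/err.h>

int main(void)
	{
	unsigned long e;

	ERR_load_crypto_strings();	/* loads the EC tables above */

	/* provoke an EC error: NID 0 names no built-in curve */
	if (EC_GROUP_new_by_curve_name(0) == NULL)
		while ((e = ERR_get_error()) != 0)
			printf("%s\n", ERR_error_string(e, NULL));

	ERR_free_strings();
	return 0;
	}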
diff --git a/crypto/ec/ec_key.c b/crypto/ec/ec_key.c
index 522802c..7fa2475 100644
--- a/crypto/ec/ec_key.c
+++ b/crypto/ec/ec_key.c
@@ -64,7 +64,9 @@
#include <string.h>
#include "ec_lcl.h"
#include <openssl/err.h>
-#include <string.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
EC_KEY *EC_KEY_new(void)
{
@@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void)
}
ret->version = 1;
+ ret->flags = 0;
ret->group = NULL;
ret->pub_key = NULL;
ret->priv_key= NULL;
@@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src)
dest->enc_flag = src->enc_flag;
dest->conv_form = src->conv_form;
dest->version = src->version;
+ dest->flags = src->flags;
return dest;
}
@@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey)
BIGNUM *priv_key = NULL, *order = NULL;
EC_POINT *pub_key = NULL;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ec_key_generate_key(eckey);
+#endif
+
if (!eckey || !eckey->group)
{
ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER);
@@ -371,6 +380,82 @@ int EC_KEY_check_key(const EC_KEY *eckey)
return(ok);
}
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
+ {
+ BN_CTX *ctx = NULL;
+ BIGNUM *tx, *ty;
+ EC_POINT *point = NULL;
+ int ok = 0, tmp_nid, is_char_two = 0;
+
+ if (!key || !key->group || !x || !y)
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ ERR_R_PASSED_NULL_PARAMETER);
+ return 0;
+ }
+ ctx = BN_CTX_new();
+ if (!ctx)
+ goto err;
+
+ point = EC_POINT_new(key->group);
+
+ if (!point)
+ goto err;
+
+ tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group));
+
+ if (tmp_nid == NID_X9_62_characteristic_two_field)
+ is_char_two = 1;
+
+ tx = BN_CTX_get(ctx);
+ ty = BN_CTX_get(ctx);
+#ifndef OPENSSL_NO_EC2M
+ if (is_char_two)
+ {
+ if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+ else
+#endif
+ {
+ if (!EC_POINT_set_affine_coordinates_GFp(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GFp(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+	/* Check if retrieved coordinates match originals: if not, the
+	 * values are out of range.
+	 */
+ if (BN_cmp(x, tx) || BN_cmp(y, ty))
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
+ if (!EC_KEY_set_public_key(key, point))
+ goto err;
+
+ if (EC_KEY_check_key(key) == 0)
+ goto err;
+
+ ok = 1;
+
+ err:
+ if (ctx)
+ BN_CTX_free(ctx);
+ if (point)
+ EC_POINT_free(point);
+ return ok;
+
+ }
+
const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key)
{
return key->group;
@@ -435,18 +520,27 @@ void EC_KEY_set_conv_form(EC_KEY *key, point_conversion_form_t cform)
void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
- return EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ void *ret;
+
+ CRYPTO_r_lock(CRYPTO_LOCK_EC);
+ ret = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ CRYPTO_r_unlock(CRYPTO_LOCK_EC);
+
+ return ret;
}
-void EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
EC_EXTRA_DATA *ex_data;
+
CRYPTO_w_lock(CRYPTO_LOCK_EC);
ex_data = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
if (ex_data == NULL)
EC_EX_DATA_set_data(&key->method_data, data, dup_func, free_func, clear_free_func);
CRYPTO_w_unlock(CRYPTO_LOCK_EC);
+
+ return ex_data;
}
void EC_KEY_set_asn1_flag(EC_KEY *key, int flag)
@@ -461,3 +555,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx)
return 0;
return EC_GROUP_precompute_mult(key->group, ctx);
}
+
+int EC_KEY_get_flags(const EC_KEY *key)
+ {
+ return key->flags;
+ }
+
+void EC_KEY_set_flags(EC_KEY *key, int flags)
+ {
+ key->flags |= flags;
+ }
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags)
+ {
+ key->flags &= ~flags;
+ }
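
A short usage sketch for the new EC_KEY_set_public_key_affine_coordinates entry point (the hex inputs are hypothetical placeholders): it rejects coordinates that are out of range or off the curve, which is the validation motivating the EC_R_COORDINATES_OUT_OF_RANGE reason code added above.

#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* Returns 1 iff (x_hex, y_hex) is a valid public point on P-256. */
int load_peer_point(const char *x_hex, const char *y_hex)
	{
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
	BIGNUM *x = NULL, *y = NULL;
	int ok = 0;

	if (key == NULL)
		return 0;
	if (BN_hex2bn(&x, x_hex) && BN_hex2bn(&y, y_hex))
		/* round-trips the coordinates and runs EC_KEY_check_key */
		ok = EC_KEY_set_public_key_affine_coordinates(key, x, y);

	BN_free(x);
	BN_free(y);
	EC_KEY_free(key);
	return ok;
	}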
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index 3e2c34b..da7967d 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -3,7 +3,7 @@
* Originally written by Bodo Moeller for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -82,10 +82,15 @@
# endif
#endif
+/* Use default functions for point2oct, oct2point and compressed coordinates */
+#define EC_FLAGS_DEFAULT_OCT 0x1
+
/* Structure details are not part of the exported interface,
* so all this may change in future versions. */
struct ec_method_st {
+ /* Various method flags */
+ int flags;
/* used by EC_METHOD_get_field_type: */
int field_type; /* a NID */
@@ -244,6 +249,7 @@ struct ec_key_st {
point_conversion_form_t conv_form;
int references;
+ int flags;
EC_EXTRA_DATA *method_data;
} /* EC_KEY */;
@@ -391,3 +397,50 @@ int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ec2_mult.c */
+int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
+ size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* method functions in ecp_nistp224.c */
+int ec_GFp_nistp224_group_init(EC_GROUP *group);
+int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp224_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp256.c */
+int ec_GFp_nistp256_group_init(EC_GROUP *group);
+int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp256_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp521.c */
+int ec_GFp_nistp521_group_init(EC_GROUP *group);
+int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp521_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group);
+
+/* utility functions in ecp_nistputil.c */
+void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
+ size_t felem_size, void *tmp_felems,
+ void (*felem_one)(void *out),
+ int (*felem_is_zero)(const void *in),
+ void (*felem_assign)(void *out, const void *in),
+ void (*felem_square)(void *out, const void *in),
+ void (*felem_mul)(void *out, const void *in1, const void *in2),
+ void (*felem_inv)(void *out, const void *in),
+ void (*felem_contract)(void *out, const void *in));
+void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in);
+#endif
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index dd7da0f..25247b5 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -425,7 +425,7 @@ int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
if (group->meth->group_set_curve == 0)
@@ -446,7 +446,7 @@ int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM
}
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#endif
int EC_GROUP_get_degree(const EC_GROUP *group)
{
@@ -856,7 +856,7 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx)
{
@@ -872,7 +872,7 @@ int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
}
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#endif
int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
@@ -890,7 +890,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *p
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
{
@@ -906,75 +906,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *
}
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
-
-int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->point2oct == 0)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point2oct(group, point, form, buf, len, ctx);
- }
-
-
-int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->oct2point == 0)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->oct2point(group, point, buf, len, ctx);
- }
-
+#endif
int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
diff --git a/crypto/ec/ec_oct.c b/crypto/ec/ec_oct.c
new file mode 100644
index 0000000..fd9db07
--- /dev/null
+++ b/crypto/ec/ec_oct.c
@@ -0,0 +1,199 @@
+/* crypto/ec/ec_oct.c */
+/*
+ * Originally written by Bodo Moeller for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Binary polynomial ECC support in OpenSSL originally developed by
+ * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project.
+ */
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/opensslv.h>
+
+#include "ec_lcl.h"
+
+int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+#endif
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+
+#ifndef OPENSSL_NO_EC2M
+int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+#endif
+
+size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->point2oct == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_point2oct(group, point,
+ form, buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_point2oct(group, point,
+ form, buf, len, ctx);
+#endif
+ }
+
+ return group->meth->point2oct(group, point, form, buf, len, ctx);
+ }
+
+
+int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->oct2point == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_oct2point(group, point,
+ buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_oct2point(group, point,
+ buf, len, ctx);
+#endif
+ }
+ return group->meth->oct2point(group, point, buf, len, ctx);
+ }
+
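
Callers are unaffected by the move of the point2oct/oct2point wrappers into ec_oct.c: with EC_FLAGS_DEFAULT_OCT set on the built-in prime-field methods, the generic ec_GFp_simple_* implementations are dispatched automatically. A round-trip sketch (assumes a build with EC enabled and the P-256 curve available):

#include <stdio.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

int main(void)
	{
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
	const EC_POINT *gen = EC_GROUP_get0_generator(group);
	EC_POINT *copy = EC_POINT_new(group);
	unsigned char buf[1 + 2*32];	/* big enough for any P-256 form */
	size_t len;

	/* serialize the generator, then parse it back */
	len = EC_POINT_point2oct(group, gen, POINT_CONVERSION_COMPRESSED,
		buf, sizeof buf, NULL);
	if (len == 0 || !EC_POINT_oct2point(group, copy, buf, len, NULL))
		return 1;
	printf("round trip ok, %u octets\n", (unsigned)len);

	EC_POINT_free(copy);
	EC_GROUP_free(group);
	return 0;
	}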
diff --git a/crypto/ec/ec_pmeth.c b/crypto/ec/ec_pmeth.c
index f433076..66ee397 100644
--- a/crypto/ec/ec_pmeth.c
+++ b/crypto/ec/ec_pmeth.c
@@ -188,7 +188,7 @@ static int pkey_ec_derive(EVP_PKEY_CTX *ctx, unsigned char *key, size_t *keylen)
pubkey = EC_KEY_get0_public_key(ctx->peerkey->pkey.ec);
- /* NB: unlike PKS#3 DH, if *outlen is less than maximum size this is
+ /* NB: unlike PKCS#3 DH, if *outlen is less than maximum size this is
* not an error, the result is truncated.
*/
@@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
case EVP_PKEY_CTRL_MD:
if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
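
A sketch of the behaviour documented by the corrected PKCS#3 comment: requesting fewer bytes than the field size from EVP_PKEY_derive is not an error, the shared secret is simply truncated. The two EVP_PKEY arguments are assumed to be P-256 keys prepared elsewhere.

#include <openssl/evp.h>

/* 'mine' holds our private key, 'peer' the peer's public key. */
int derive_16_bytes(EVP_PKEY *mine, EVP_PKEY *peer, unsigned char out[16])
	{
	EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(mine, NULL);
	size_t outlen = 16;	/* below the 32-byte field size: no error */
	int ok = 0;

	if (ctx != NULL
		&& EVP_PKEY_derive_init(ctx) > 0
		&& EVP_PKEY_derive_set_peer(ctx, peer) > 0
		&& EVP_PKEY_derive(ctx, out, &outlen) > 0)
		ok = 1;	/* out now holds the first 16 octets of the secret */

	EVP_PKEY_CTX_free(ctx);
	return ok;
	}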
diff --git a/crypto/ec/eck_prn.c b/crypto/ec/eck_prn.c
index 7d3e175..06de8f3 100644
--- a/crypto/ec/eck_prn.c
+++ b/crypto/ec/eck_prn.c
@@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
reason = ERR_R_MALLOC_FAILURE;
goto err;
}
-
+#ifndef OPENSSL_NO_EC2M
if (is_char_two)
{
if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx))
@@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
}
}
else /* prime field */
+#endif
{
if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx))
{
diff --git a/crypto/ec/ecp_mont.c b/crypto/ec/ecp_mont.c
index 9fc4a46..f04f132 100644
--- a/crypto/ec/ecp_mont.c
+++ b/crypto/ec/ecp_mont.c
@@ -63,12 +63,20 @@
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_mont_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_mont_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_mont_group_init,
ec_GFp_mont_group_finish,
@@ -87,9 +95,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -109,6 +115,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_mont_field_set_to_one };
return &ret;
+#endif
}
diff --git a/crypto/ec/ecp_nist.c b/crypto/ec/ecp_nist.c
index 2a5682e..aad2d5f 100644
--- a/crypto/ec/ecp_nist.c
+++ b/crypto/ec/ecp_nist.c
@@ -67,9 +67,17 @@
#include <openssl/obj_mac.h>
#include "ec_lcl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GFp_nist_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_nist_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
int ec_GFp_nist_group_copy(EC_GROUP *dest, const EC_GROUP *src)
diff --git a/crypto/ec/ecp_oct.c b/crypto/ec/ecp_oct.c
new file mode 100644
index 0000000..374a0ee
--- /dev/null
+++ b/crypto/ec/ecp_oct.c
@@ -0,0 +1,433 @@
+/* crypto/ec/ecp_oct.c */
+/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * for the OpenSSL project.
+ * Includes code written by Bodo Moeller for the OpenSSL project.
+*/
+/* ====================================================================
+ * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Portions of this software developed by SUN MICROSYSTEMS, INC.,
+ * and contributed to the OpenSSL project.
+ */
+
+#include <openssl/err.h>
+#include <openssl/symhacks.h>
+
+#include "ec_lcl.h"
+
+int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp1, *tmp2, *x, *y;
+ int ret = 0;
+
+ /* clear error queue*/
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0);
+
+ BN_CTX_start(ctx);
+ tmp1 = BN_CTX_get(ctx);
+ tmp2 = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ /* Recover y. We have a Weierstrass equation
+ * y^2 = x^3 + a*x + b,
+ * so y is one of the square roots of x^3 + a*x + b.
+ */
+
+ /* tmp1 := x^3 */
+ if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
+ if (group->meth->field_decode == 0)
+ {
+ /* field_{sqr,mul} work on standard representation */
+ if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
+ if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
+ if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
+ }
+
+ /* tmp1 := tmp1 + a*x */
+ if (group->a_is_minus3)
+ {
+ if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
+ if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
+ if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
+ if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
+ }
+ else
+ {
+ /* field_mul works on standard representation */
+ if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
+ }
+
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+
+ /* tmp1 := tmp1 + b */
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
+ }
+
+ if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ if (y_bit != BN_is_odd(y))
+ {
+ if (BN_is_zero(y))
+ {
+ int kron;
+
+ kron = BN_kronecker(x, &group->field, ctx);
+ if (kron == -2) goto err;
+
+ if (kron == 1)
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
+ else
+ /* BN_mod_sqrt() should have caught this error (not a square) */
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ goto err;
+ }
+ if (!BN_usub(y, &group->field, y)) goto err;
+ }
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
+
+size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = BN_num_bytes(&group->field);
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
+ buf[0] = form + 1;
+ else
+ buf[0] = form;
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = BN_num_bytes(&group->field);
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
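
ec_GFp_simple_set_compressed_coordinates above recovers y as a square root of x^3 + a*x + b and negates it modulo p if its parity disagrees with y_bit. A caller-level sketch of that recovery through the public wrapper, using the P-256 generator as sample data:

#include <stdio.h>
#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

int main(void)
	{
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
	const EC_POINT *gen = EC_GROUP_get0_generator(group);
	EC_POINT *pt = EC_POINT_new(group);
	BIGNUM *x = BN_new(), *y = BN_new();

	/* keep only x and the parity of y, then rebuild the full point */
	if (!EC_POINT_get_affine_coordinates_GFp(group, gen, x, y, NULL))
		return 1;
	if (!EC_POINT_set_compressed_coordinates_GFp(group, pt, x,
		BN_is_odd(y), NULL))
		return 1;
	printf("%s\n", EC_POINT_cmp(group, pt, gen, NULL) == 0
		? "recovered generator" : "mismatch");

	BN_free(x); BN_free(y);
	EC_POINT_free(pt);
	EC_GROUP_free(group);
	return 0;
	}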
diff --git a/crypto/ec/ecp_smpl.c b/crypto/ec/ecp_smpl.c
index 66a92e2..7cbb321 100644
--- a/crypto/ec/ecp_smpl.c
+++ b/crypto/ec/ecp_smpl.c
@@ -65,11 +65,19 @@
#include <openssl/err.h>
#include <openssl/symhacks.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -633,372 +640,6 @@ int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_P
return ret;
}
-
-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp1, *tmp2, *x, *y;
- int ret = 0;
-
- /* clear error queue*/
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0);
-
- BN_CTX_start(ctx);
- tmp1 = BN_CTX_get(ctx);
- tmp2 = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- /* Recover y. We have a Weierstrass equation
- * y^2 = x^3 + a*x + b,
- * so y is one of the square roots of x^3 + a*x + b.
- */
-
- /* tmp1 := x^3 */
- if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
- if (group->meth->field_decode == 0)
- {
- /* field_{sqr,mul} work on standard representation */
- if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
- if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
- }
- else
- {
- if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
- if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
- }
-
- /* tmp1 := tmp1 + a*x */
- if (group->a_is_minus3)
- {
- if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
- if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
- if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
- if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
- }
- else
- {
- /* field_mul works on standard representation */
- if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
- }
-
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
-
- /* tmp1 := tmp1 + b */
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
- }
-
- if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
-
- if (y_bit != BN_is_odd(y))
- {
- if (BN_is_zero(y))
- {
- int kron;
-
- kron = BN_kronecker(x, &group->field, ctx);
- if (kron == -2) goto err;
-
- if (kron == 1)
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
- else
- /* BN_mod_sqrt() should have cought this error (not a square) */
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- goto err;
- }
- if (!BN_usub(y, &group->field, y)) goto err;
- }
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = BN_num_bytes(&group->field);
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
- buf[0] = form + 1;
- else
- buf[0] = form;
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = BN_num_bytes(&group->field);
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *, BN_CTX *);
diff --git a/crypto/ec/ectest.c b/crypto/ec/ectest.c
index 7509cb9..102eaa9 100644
--- a/crypto/ec/ectest.c
+++ b/crypto/ec/ectest.c
@@ -94,6 +94,7 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
#include <openssl/objects.h>
#include <openssl/rand.h>
#include <openssl/bn.h>
+#include <openssl/opensslconf.h>
#if defined(_MSC_VER) && defined(_MIPS_) && (_MSC_VER/100==12)
/* suppress "too big too optimize" warning */
@@ -107,10 +108,6 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
EXIT(1); \
} while (0)
-void prime_field_tests(void);
-void char2_field_tests(void);
-void internal_curve_test(void);
-
#define TIMING_BASE_PT 0
#define TIMING_RAND_PT 1
#define TIMING_SIMUL 2
@@ -195,8 +192,51 @@ static void timings(EC_GROUP *group, int type, BN_CTX *ctx)
}
#endif
-void prime_field_tests()
- {
+/* test multiplication with group order, long and negative scalars */
+static void group_order_tests(EC_GROUP *group)
+ {
+ BIGNUM *n1, *n2, *order;
+ EC_POINT *P = EC_POINT_new(group);
+ EC_POINT *Q = EC_POINT_new(group);
+ BN_CTX *ctx = BN_CTX_new();
+
+ n1 = BN_new(); n2 = BN_new(); order = BN_new();
+ fprintf(stdout, "verify group order ...");
+ fflush(stdout);
+ if (!EC_GROUP_get_order(group, order, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, ".");
+ fflush(stdout);
+ if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, " ok\n");
+ fprintf(stdout, "long/negative scalar tests ... ");
+ if (!BN_one(n1)) ABORT;
+ /* n1 = 1 - order */
+ if (!BN_sub(n1, n1, order)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n1, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = 1 + order */
+ if (!BN_add(n2, order, BN_value_one())) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = (1 - order) * (1 + order) */
+ if (!BN_mul(n2, n1, n2, ctx)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ fprintf(stdout, "ok\n");
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ BN_free(n1);
+ BN_free(n2);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
+
+static void prime_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -321,21 +361,21 @@ void prime_field_tests()
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "Generator as octect string, compressed form:\n ");
+ fprintf(stdout, "Generator as octet string, compressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_UNCOMPRESSED, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, uncompressed form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, uncompressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_HYBRID, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, hybrid form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, hybrid form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
if (!EC_POINT_get_Jprojective_coordinates_GFp(group, R, x, y, z, ctx)) ABORT;
@@ -381,17 +421,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 160) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_160 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_160, group)) ABORT;
@@ -425,17 +455,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 192) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_192 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_192, group)) ABORT;
@@ -469,17 +489,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 224) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_224 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_224, group)) ABORT;
@@ -514,17 +524,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 256) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_256 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_256, group)) ABORT;
@@ -563,18 +563,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 384) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_384 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_384, group)) ABORT;
@@ -619,18 +609,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 521) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_521 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_521, group)) ABORT;
@@ -659,6 +639,7 @@ void prime_field_tests()
points[2] = Q;
points[3] = Q;
+ if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
if (!BN_add(y, z, BN_value_one())) ABORT;
if (BN_is_odd(y)) ABORT;
if (!BN_rshift1(y, y)) ABORT;
@@ -792,22 +773,14 @@ void prime_field_tests()
fprintf(stdout, "verify degree ..."); \
if (EC_GROUP_get_degree(group) != _degree) ABORT; \
fprintf(stdout, " ok\n"); \
- fprintf(stdout, "verify group order ..."); \
- fflush(stdout); \
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, "."); \
- fflush(stdout); \
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, " ok\n"); \
+ group_order_tests(group); \
if (!(_variable = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT; \
- if (!EC_GROUP_copy(_variable, group)) ABORT;
+ if (!EC_GROUP_copy(_variable, group)) ABORT; \
-void char2_field_tests()
- {
+#ifndef OPENSSL_NO_EC2M
+
+static void char2_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -1239,8 +1212,9 @@ void char2_field_tests()
if (C2_B571) EC_GROUP_free(C2_B571);
}
+#endif
-void internal_curve_test(void)
+static void internal_curve_test(void)
{
EC_builtin_curve *curves = NULL;
size_t crv_len = 0, n = 0;
@@ -1287,13 +1261,189 @@ void internal_curve_test(void)
EC_GROUP_free(group);
}
if (ok)
- fprintf(stdout, " ok\n");
+ fprintf(stdout, " ok\n\n");
else
- fprintf(stdout, " failed\n");
+ {
+ fprintf(stdout, " failed\n\n");
+ ABORT;
+ }
OPENSSL_free(curves);
return;
}
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* nistp_test_params contains magic numbers for testing our optimized
+ * implementations of several NIST curves with characteristic > 3. */
+struct nistp_test_params
+ {
+ const EC_METHOD* (*meth) ();
+ int degree;
+ /* Qx, Qy and D are taken from
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/ECDSA_Prime.pdf
+	 * Otherwise, values are standard curve parameters from FIPS 186-3 */
+ const char *p, *a, *b, *Qx, *Qy, *Gx, *Gy, *order, *d;
+ };
+
+static const struct nistp_test_params nistp_tests_params[] =
+ {
+ {
+ /* P-224 */
+ EC_GFp_nistp224_method,
+ 224,
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000000000000000001", /* p */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFE", /* a */
+ "B4050A850C04B3ABF54132565044B0B7D7BFD8BA270B39432355FFB4", /* b */
+ "E84FB0B8E7000CB657D7973CF6B42ED78B301674276DF744AF130B3E", /* Qx */
+ "4376675C6FC5612C21A0FF2D2A89D2987DF7A2BC52183B5982298555", /* Qy */
+ "B70E0CBD6BB4BF7F321390B94A03C1D356C21122343280D6115C1D21", /* Gx */
+ "BD376388B5F723FB4C22DFE6CD4375A05A07476444D5819985007E34", /* Gy */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFF16A2E0B8F03E13DD29455C5C2A3D", /* order */
+ "3F0C488E987C80BE0FEE521F8D90BE6034EC69AE11CA72AA777481E8", /* d */
+ },
+ {
+ /* P-256 */
+ EC_GFp_nistp256_method,
+ 256,
+ "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", /* p */
+ "ffffffff00000001000000000000000000000000fffffffffffffffffffffffc", /* a */
+ "5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", /* b */
+ "b7e08afdfe94bad3f1dc8c734798ba1c62b3a0ad1e9ea2a38201cd0889bc7a19", /* Qx */
+ "3603f747959dbf7a4bb226e41928729063adc7ae43529e61b563bbc606cc5e09", /* Qy */
+ "6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", /* Gx */
+ "4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", /* Gy */
+ "ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", /* order */
+ "c477f9f65c22cce20657faa5b2d1d8122336f851a508a1ed04e479c34985bf96", /* d */
+ },
+ {
+ /* P-521 */
+ EC_GFp_nistp521_method,
+ 521,
+ "1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", /* p */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffc", /* a */
+ "051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00", /* b */
+ "0098e91eef9a68452822309c52fab453f5f117c1da8ed796b255e9ab8f6410cca16e59df403a6bdc6ca467a37056b1e54b3005d8ac030decfeb68df18b171885d5c4", /* Qx */
+ "0164350c321aecfc1cca1ba4364c9b15656150b4b78d6a48d7d28e7f31985ef17be8554376b72900712c4b83ad668327231526e313f5f092999a4632fd50d946bc2e", /* Qy */
+ "c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66", /* Gx */
+ "11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650", /* Gy */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409", /* order */
+ "0100085f47b8e1b8b11b7eb33028c0b2888e304bfc98501955b45bba1478dc184eeedf09b86a5f7c21994406072787205e69a63709fe35aa93ba333514b24f961722", /* d */
+ },
+ };
+
+void nistp_single_test(const struct nistp_test_params *test)
+ {
+ BN_CTX *ctx;
+ BIGNUM *p, *a, *b, *x, *y, *n, *m, *order;
+ EC_GROUP *NISTP;
+ EC_POINT *G, *P, *Q, *Q_CHECK;
+
+ fprintf(stdout, "\nNIST curve P-%d (optimised implementation):\n", test->degree);
+ ctx = BN_CTX_new();
+ p = BN_new();
+ a = BN_new();
+ b = BN_new();
+ x = BN_new(); y = BN_new();
+ m = BN_new(); n = BN_new(); order = BN_new();
+
+ NISTP = EC_GROUP_new(test->meth());
+ if(!NISTP) ABORT;
+ if (!BN_hex2bn(&p, test->p)) ABORT;
+ if (1 != BN_is_prime_ex(p, BN_prime_checks, ctx, NULL)) ABORT;
+ if (!BN_hex2bn(&a, test->a)) ABORT;
+ if (!BN_hex2bn(&b, test->b)) ABORT;
+ if (!EC_GROUP_set_curve_GFp(NISTP, p, a, b, ctx)) ABORT;
+ G = EC_POINT_new(NISTP);
+ P = EC_POINT_new(NISTP);
+ Q = EC_POINT_new(NISTP);
+ Q_CHECK = EC_POINT_new(NISTP);
+ if(!BN_hex2bn(&x, test->Qx)) ABORT;
+ if(!BN_hex2bn(&y, test->Qy)) ABORT;
+ if(!EC_POINT_set_affine_coordinates_GFp(NISTP, Q_CHECK, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&x, test->Gx)) ABORT;
+ if (!BN_hex2bn(&y, test->Gy)) ABORT;
+ if (!EC_POINT_set_affine_coordinates_GFp(NISTP, G, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&order, test->order)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+
+ fprintf(stdout, "verify degree ... ");
+ if (EC_GROUP_get_degree(NISTP) != test->degree) ABORT;
+ fprintf(stdout, "ok\n");
+
+ fprintf(stdout, "NIST test vectors ... ");
+ if (!BN_hex2bn(&n, test->d)) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* set generator to P = 2*G, where G is the standard generator */
+ if (!EC_POINT_dbl(NISTP, P, G, ctx)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, P, order, BN_value_one())) ABORT;
+ /* set the scalar to m=n/2, where n is the NIST test scalar */
+ if (!BN_rshift(m, n, 1)) ABORT;
+
+ /* test the non-standard generator */
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* now repeat all tests with precomputation */
+ if (!EC_GROUP_precompute_mult(NISTP, ctx)) ABORT;
+
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* reset generator */
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ fprintf(stdout, "ok\n");
+ group_order_tests(NISTP);
+#if 0
+ timings(NISTP, TIMING_BASE_PT, ctx);
+ timings(NISTP, TIMING_RAND_PT, ctx);
+#endif
+ EC_GROUP_free(NISTP);
+ EC_POINT_free(G);
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ EC_POINT_free(Q_CHECK);
+ BN_free(n);
+ BN_free(m);
+ BN_free(p);
+ BN_free(a);
+ BN_free(b);
+ BN_free(x);
+ BN_free(y);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
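+
+/* Note on the pairing of checks above: EC_POINT_mul(group, Q, n, NULL,
+ * NULL, ctx) computes n*generator and exercises the fixed-point code
+ * path, while EC_POINT_mul(group, Q, NULL, P, n, ctx) computes n*P for
+ * an arbitrary point P. The optimised nistp methods implement these
+ * paths separately, so every scenario (standard generator, doubled
+ * generator, with and without precomputation) runs both. */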
+
+void nistp_tests()
+ {
+ unsigned i;
+
+ for (i = 0; i < sizeof(nistp_tests_params) / sizeof(struct nistp_test_params); i++)
+ {
+ nistp_single_test(&nistp_tests_params[i]);
+ }
+ }
+#endif
+
static const char rnd_seed[] = "string to make the random number generator think it has entropy";
int main(int argc, char *argv[])
@@ -1317,7 +1467,12 @@ int main(int argc, char *argv[])
prime_field_tests();
puts("");
+#ifndef OPENSSL_NO_EC2M
char2_field_tests();
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ nistp_tests();
+#endif
/* test the internal curves */
internal_curve_test();
diff --git a/crypto/ecdh/ecdh.h b/crypto/ecdh/ecdh.h
index b4b58ee..8887102 100644
--- a/crypto/ecdh/ecdh.h
+++ b/crypto/ecdh/ecdh.h
@@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void);
/* Error codes for the ECDH functions. */
/* Function codes. */
+#define ECDH_F_ECDH_CHECK 102
#define ECDH_F_ECDH_COMPUTE_KEY 100
#define ECDH_F_ECDH_DATA_NEW_METHOD 101
/* Reason codes. */
#define ECDH_R_KDF_FAILED 102
+#define ECDH_R_NON_FIPS_METHOD 103
#define ECDH_R_NO_PRIVATE_VALUE 100
#define ECDH_R_POINT_ARITHMETIC_FAILURE 101
diff --git a/crypto/ecdh/ecdhtest.c b/crypto/ecdh/ecdhtest.c
index 212a87e..823d7ba 100644
--- a/crypto/ecdh/ecdhtest.c
+++ b/crypto/ecdh/ecdhtest.c
@@ -158,11 +158,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 1=");
BN_print(out,a->priv_key);
@@ -183,11 +185,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 2=");
@@ -324,6 +328,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_X9_62_prime256v1, "NIST Prime-Curve P-256", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp384r1, "NIST Prime-Curve P-384", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp521r1, "NIST Prime-Curve P-521", ctx, out)) goto err;
+#ifndef OPENSSL_NO_EC2M
/* NIST BINARY CURVES TESTS */
if (!test_ecdh_curve(NID_sect163k1, "NIST Binary-Curve K-163", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect163r2, "NIST Binary-Curve B-163", ctx, out)) goto err;
@@ -335,6 +340,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_sect409r1, "NIST Binary-Curve B-409", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571k1, "NIST Binary-Curve K-571", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571r1, "NIST Binary-Curve B-571", ctx, out)) goto err;
+#endif
ret = 0;
diff --git a/crypto/ecdh/ech_err.c b/crypto/ecdh/ech_err.c
index 6f4b0c9..3bd2473 100644
--- a/crypto/ecdh/ech_err.c
+++ b/crypto/ecdh/ech_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdh/ech_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDH_str_functs[]=
{
+{ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"},
{ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"},
{ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"},
{0,NULL}
@@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]=
static ERR_STRING_DATA ECDH_str_reasons[]=
{
{ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"},
+{ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"},
{0,NULL}
diff --git a/crypto/ecdh/ech_key.c b/crypto/ecdh/ech_key.c
index f44da92..2988899 100644
--- a/crypto/ecdh/ech_key.c
+++ b/crypto/ecdh/ech_key.c
@@ -68,9 +68,6 @@
*/
#include "ech_locl.h"
-#ifndef OPENSSL_NO_ENGINE
-#include <openssl/engine.h>
-#endif
int ECDH_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
EC_KEY *eckey,
diff --git a/crypto/ecdh/ech_lib.c b/crypto/ecdh/ech_lib.c
index 4d8ea03..0644431 100644
--- a/crypto/ecdh/ech_lib.c
+++ b/crypto/ecdh/ech_lib.c
@@ -73,6 +73,9 @@
#include <openssl/engine.h>
#endif
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT;
@@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth)
const ECDH_METHOD *ECDH_get_default_method(void)
{
if(!default_ECDH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdh_openssl();
+ else
+ return ECDH_OpenSSL();
+#else
default_ECDH_method = ECDH_OpenSSL();
+#endif
+ }
return default_ECDH_method;
}
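/* A subtlety in the change above (an observation, not from the patch
 * comments): when OPENSSL_FIPS is defined the method is selected on
 * every call and never cached in default_ECDH_method, presumably so
 * that entering or leaving FIPS mode at runtime is reflected by the
 * next call. */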
@@ -210,11 +222,26 @@ ECDH_DATA *ecdh_check(EC_KEY *key)
ecdh_data = (ECDH_DATA *)ecdh_data_new();
if (ecdh_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
- ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
+ ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdh_data_free(ecdh_data);
+ ecdh_data = (ECDH_DATA *)data;
+ }
}
else
ecdh_data = (ECDH_DATA *)data;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdh_data;
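The lost-race handling above is a general lazy-initialization pattern:
allocate, try to install, and defer to whoever got there first.
EC_KEY_insert_key_method_data plays the compare-and-swap role; below is a
minimal standalone sketch of the same shape using C11 atomics (an
illustration only, not OpenSSL API):

#include <stdatomic.h>
#include <stdlib.h>

typedef struct data_st { int x; } DATA;

static DATA *get_or_install(_Atomic(DATA *) *slot)
	{
	DATA *cur = atomic_load(slot);
	DATA *ours, *expected = NULL;

	if (cur != NULL)
		return cur;	/* fast path: already installed */
	ours = calloc(1, sizeof(*ours));
	if (ours == NULL)
		return NULL;
	if (atomic_compare_exchange_strong(slot, &expected, ours))
		return ours;	/* we won the race */
	free(ours);		/* another thread won; use its data */
	return expected;
	}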
diff --git a/crypto/ecdh/ech_locl.h b/crypto/ecdh/ech_locl.h
index f658526..f6cad6a 100644
--- a/crypto/ecdh/ech_locl.h
+++ b/crypto/ecdh/ech_locl.h
@@ -75,6 +75,14 @@ struct ecdh_method
char *app_data;
};
+/* If this flag is set the ECDH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDH_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdh_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
diff --git a/crypto/ecdh/ech_ossl.c b/crypto/ecdh/ech_ossl.c
index 2a40ff1..4a30628 100644
--- a/crypto/ecdh/ech_ossl.c
+++ b/crypto/ecdh/ech_ossl.c
@@ -157,6 +157,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group, tmp, x, y, ctx))
@@ -165,6 +166,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#endif
buflen = (EC_GROUP_get_degree(group) + 7)/8;
len = BN_num_bytes(x);
diff --git a/crypto/ecdsa/ecdsa.h b/crypto/ecdsa/ecdsa.h
index e61c539..7fb5254 100644
--- a/crypto/ecdsa/ecdsa.h
+++ b/crypto/ecdsa/ecdsa.h
@@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void);
/* Error codes for the ECDSA functions. */
/* Function codes. */
+#define ECDSA_F_ECDSA_CHECK 104
#define ECDSA_F_ECDSA_DATA_NEW_METHOD 100
#define ECDSA_F_ECDSA_DO_SIGN 101
#define ECDSA_F_ECDSA_DO_VERIFY 102
@@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void);
#define ECDSA_R_ERR_EC_LIB 102
#define ECDSA_R_MISSING_PARAMETERS 103
#define ECDSA_R_NEED_NEW_SETUP_VALUES 106
+#define ECDSA_R_NON_FIPS_METHOD 107
#define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104
#define ECDSA_R_SIGNATURE_MALLOC_FAILED 105
diff --git a/crypto/ecdsa/ecdsatest.c b/crypto/ecdsa/ecdsatest.c
index 26a4a9e..537bb30 100644
--- a/crypto/ecdsa/ecdsatest.c
+++ b/crypto/ecdsa/ecdsatest.c
@@ -262,6 +262,7 @@ int x9_62_tests(BIO *out)
"3238135532097973577080787768312505059318910517550078427819"
"78505179448783"))
goto x962_err;
+#ifndef OPENSSL_NO_EC2M
if (!x9_62_test_internal(out, NID_X9_62_c2tnb191v1,
"87194383164871543355722284926904419997237591535066528048",
"308992691965804947361541664549085895292153777025772063598"))
@@ -272,7 +273,7 @@ int x9_62_tests(BIO *out)
"1970303740007316867383349976549972270528498040721988191026"
"49413465737174"))
goto x962_err;
-
+#endif
ret = 1;
x962_err:
if (!restore_rand())
@@ -286,9 +287,13 @@ int test_builtin(BIO *out)
size_t crv_len = 0, n = 0;
EC_KEY *eckey = NULL, *wrong_eckey = NULL;
EC_GROUP *group;
+ ECDSA_SIG *ecdsa_sig = NULL;
unsigned char digest[20], wrong_digest[20];
- unsigned char *signature = NULL;
- unsigned int sig_len;
+ unsigned char *signature = NULL;
+ const unsigned char *sig_ptr;
+ unsigned char *sig_ptr2;
+ unsigned char *raw_buf = NULL;
+ unsigned int sig_len, degree, r_len, s_len, bn_len, buf_len;
int nid, ret = 0;
/* fill digest values with some random data */
@@ -338,7 +343,8 @@ int test_builtin(BIO *out)
if (EC_KEY_set_group(eckey, group) == 0)
goto builtin_err;
EC_GROUP_free(group);
- if (EC_GROUP_get_degree(EC_KEY_get0_group(eckey)) < 160)
+ degree = EC_GROUP_get_degree(EC_KEY_get0_group(eckey));
+ if (degree < 160)
/* drop the curve */
{
EC_KEY_free(eckey);
@@ -414,26 +420,89 @@ int test_builtin(BIO *out)
}
BIO_printf(out, ".");
(void)BIO_flush(out);
- /* modify a single byte of the signature */
- offset = signature[10] % sig_len;
- dirt = signature[11];
- signature[offset] ^= dirt ? dirt : 1;
+ /* wrong length */
+ if (ECDSA_verify(0, digest, 20, signature, sig_len - 1,
+ eckey) == 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ BIO_printf(out, ".");
+ (void)BIO_flush(out);
+
+ /* Modify a single byte of the signature: to ensure we don't
+ * garble the ASN1 structure, we read the raw signature and
+ * modify a byte in one of the bignums directly. */
+ sig_ptr = signature;
+ if ((ecdsa_sig = d2i_ECDSA_SIG(NULL, &sig_ptr, sig_len)) == NULL)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+
+ /* Store the two BIGNUMs in raw_buf. */
+ r_len = BN_num_bytes(ecdsa_sig->r);
+ s_len = BN_num_bytes(ecdsa_sig->s);
+ bn_len = (degree + 7) / 8;
+ if ((r_len > bn_len) || (s_len > bn_len))
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ buf_len = 2 * bn_len;
+ if ((raw_buf = OPENSSL_malloc(buf_len)) == NULL)
+ goto builtin_err;
+ /* Pad the bignums with leading zeroes. */
+ memset(raw_buf, 0, buf_len);
+ BN_bn2bin(ecdsa_sig->r, raw_buf + bn_len - r_len);
+ BN_bn2bin(ecdsa_sig->s, raw_buf + buf_len - s_len);
+
+ /* Modify a single byte in the buffer. */
+ offset = raw_buf[10] % buf_len;
+ dirt = raw_buf[11] ? raw_buf[11] : 1;
+ raw_buf[offset] ^= dirt;
+ /* Now read the BIGNUMs back in from raw_buf. */
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) == 1)
{
BIO_printf(out, " failed\n");
goto builtin_err;
}
+ /* Sanity check: undo the modification and verify signature. */
+ raw_buf[offset] ^= dirt;
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
+ if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) != 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
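+	/* Why the test does not just flip a byte of the DER blob: an ECDSA
+	 * signature is SEQUENCE { INTEGER r, INTEGER s }, so a random flip
+	 * can land on a tag or length octet and make d2i_ECDSA_SIG() reject
+	 * the encoding outright -- a parse failure, not the "well-formed
+	 * but wrong signature" case being tested. Flipping inside r or s
+	 * and re-encoding with i2d_ECDSA_SIG() keeps the ASN.1 intact. */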
BIO_printf(out, ".");
(void)BIO_flush(out);
BIO_printf(out, " ok\n");
/* cleanup */
+ /* clean bogus errors */
+ ERR_clear_error();
OPENSSL_free(signature);
signature = NULL;
EC_KEY_free(eckey);
eckey = NULL;
EC_KEY_free(wrong_eckey);
wrong_eckey = NULL;
+ ECDSA_SIG_free(ecdsa_sig);
+ ecdsa_sig = NULL;
+ OPENSSL_free(raw_buf);
+ raw_buf = NULL;
}
ret = 1;
@@ -442,8 +511,12 @@ int test_builtin(BIO *out)
EC_KEY_free(eckey);
if (wrong_eckey)
EC_KEY_free(wrong_eckey);
+ if (ecdsa_sig)
+ ECDSA_SIG_free(ecdsa_sig);
if (signature)
OPENSSL_free(signature);
+ if (raw_buf)
+ OPENSSL_free(raw_buf);
if (curves)
OPENSSL_free(curves);
diff --git a/crypto/ecdsa/ecs_err.c b/crypto/ecdsa/ecs_err.c
index 98e38d5..81542e6 100644
--- a/crypto/ecdsa/ecs_err.c
+++ b/crypto/ecdsa/ecs_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdsa/ecs_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDSA_str_functs[]=
{
+{ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"},
{ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]=
{ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"},
{ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"},
+{ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"},
{ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"},
{0,NULL}
diff --git a/crypto/ecdsa/ecs_lib.c b/crypto/ecdsa/ecs_lib.c
index 2ebae3a..814a6bf 100644
--- a/crypto/ecdsa/ecs_lib.c
+++ b/crypto/ecdsa/ecs_lib.c
@@ -60,6 +60,9 @@
#endif
#include <openssl/err.h>
#include <openssl/bn.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT;
@@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth)
const ECDSA_METHOD *ECDSA_get_default_method(void)
{
if(!default_ECDSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdsa_openssl();
+ else
+ return ECDSA_OpenSSL();
+#else
default_ECDSA_method = ECDSA_OpenSSL();
+#endif
+ }
return default_ECDSA_method;
}
@@ -188,12 +200,26 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key)
ecdsa_data = (ECDSA_DATA *)ecdsa_data_new();
if (ecdsa_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
- ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
+ ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdsa_data_free(ecdsa_data);
+ ecdsa_data = (ECDSA_DATA *)data;
+ }
}
else
ecdsa_data = (ECDSA_DATA *)data;
-
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdsa_data;
}
diff --git a/crypto/ecdsa/ecs_locl.h b/crypto/ecdsa/ecs_locl.h
index 3a69a84..cb3be13 100644
--- a/crypto/ecdsa/ecs_locl.h
+++ b/crypto/ecdsa/ecs_locl.h
@@ -82,6 +82,14 @@ struct ecdsa_method
char *app_data;
};
+/* If this flag is set the ECDSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDSA_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdsa_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
diff --git a/crypto/ecdsa/ecs_ossl.c b/crypto/ecdsa/ecs_ossl.c
index 1bbf328..7725935 100644
--- a/crypto/ecdsa/ecs_ossl.c
+++ b/crypto/ecdsa/ecs_ossl.c
@@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#endif
if (!BN_nnmod(r, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
@@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
-
+#endif
if (!BN_nnmod(u1, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
diff --git a/crypto/engine/eng_all.c b/crypto/engine/eng_all.c
index 22c1204..6093376 100644
--- a/crypto/engine/eng_all.c
+++ b/crypto/engine/eng_all.c
@@ -61,6 +61,8 @@
void ENGINE_load_builtin_engines(void)
{
+ /* Some ENGINEs need this */
+ OPENSSL_cpuid_setup();
#if 0
/* There's no longer any need for an "openssl" ENGINE unless, one day,
* it is the *only* way for standard builtin implementations to be
@@ -70,6 +72,12 @@ void ENGINE_load_builtin_engines(void)
#endif
#if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
ENGINE_load_cryptodev();
+#endif
+#ifndef OPENSSL_NO_RSAX
+ ENGINE_load_rsax();
+#endif
+#ifndef OPENSSL_NO_RDRAND
+ ENGINE_load_rdrand();
#endif
ENGINE_load_dynamic();
#ifndef OPENSSL_NO_STATIC_ENGINE
@@ -112,6 +120,7 @@ void ENGINE_load_builtin_engines(void)
ENGINE_load_capi();
#endif
#endif
+ ENGINE_register_all_complete();
}
#if defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)
diff --git a/crypto/engine/eng_cryptodev.c b/crypto/engine/eng_cryptodev.c
index 52f4ca3..5a715ac 100644
--- a/crypto/engine/eng_cryptodev.c
+++ b/crypto/engine/eng_cryptodev.c
@@ -79,8 +79,6 @@ struct dev_crypto_state {
unsigned char digest_res[HASH_MAX_LEN];
char *mac_data;
int mac_len;
-
- int copy;
#endif
};
@@ -200,6 +198,7 @@ get_dev_crypto(void)
if ((fd = open_dev_crypto()) == -1)
return (-1);
+#ifndef CRIOGET_NOT_NEEDED
if (ioctl(fd, CRIOGET, &retfd) == -1)
return (-1);
@@ -208,9 +207,19 @@ get_dev_crypto(void)
close(retfd);
return (-1);
}
+#else
+ retfd = fd;
+#endif
return (retfd);
}
+static void put_dev_crypto(int fd)
+{
+#ifndef CRIOGET_NOT_NEEDED
+ close(fd);
+#endif
+}
+
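+/* Background on CRIOGET_NOT_NEEDED (context, not from the patch itself):
+ * BSD-style /dev/crypto expects callers to clone a per-session fd with
+ * the CRIOGET ioctl, whereas cryptodev-linux style drivers have no
+ * CRIOGET and use the opened device fd directly. The get_dev_crypto()/
+ * put_dev_crypto() pair keeps open/close symmetric in both modes. */
+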
/* Caching version for asym operations */
static int
get_asym_dev_crypto(void)
@@ -252,7 +261,7 @@ get_cryptodev_ciphers(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = ciphers[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -291,7 +300,7 @@ get_cryptodev_digests(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = digests[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -436,7 +445,7 @@ cryptodev_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
sess->cipher = cipher;
if (ioctl(state->d_fd, CIOCGSESSION, sess) == -1) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (0);
}
@@ -473,7 +482,7 @@ cryptodev_cleanup(EVP_CIPHER_CTX *ctx)
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -686,7 +695,7 @@ static int cryptodev_digest_init(EVP_MD_CTX *ctx)
sess->mac = digest;
if (ioctl(state->d_fd, CIOCGSESSION, sess) < 0) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
printf("cryptodev_digest_init: Open session failed\n");
return (0);
@@ -758,14 +767,12 @@ static int cryptodev_digest_final(EVP_MD_CTX *ctx, unsigned char *md)
if (! (ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) ) {
/* if application doesn't support one buffer */
memset(&cryp, 0, sizeof(cryp));
-
cryp.ses = sess->ses;
cryp.flags = 0;
cryp.len = state->mac_len;
cryp.src = state->mac_data;
cryp.dst = NULL;
cryp.mac = (caddr_t)md;
-
if (ioctl(state->d_fd, CIOCCRYPT, &cryp) < 0) {
printf("cryptodev_digest_final: digest failed\n");
return (0);
@@ -786,6 +793,9 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
struct dev_crypto_state *state = ctx->md_data;
struct session_op *sess = &state->d_sess;
+ if (state == NULL)
+ return 0;
+
if (state->d_fd < 0) {
printf("cryptodev_digest_cleanup: illegal input\n");
return (0);
@@ -797,16 +807,13 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
state->mac_len = 0;
}
- if (state->copy)
- return 1;
-
if (ioctl(state->d_fd, CIOCFSESSION, &sess->ses) < 0) {
printf("cryptodev_digest_cleanup: failed to close session\n");
ret = 0;
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -816,15 +823,39 @@ static int cryptodev_digest_copy(EVP_MD_CTX *to,const EVP_MD_CTX *from)
{
struct dev_crypto_state *fstate = from->md_data;
struct dev_crypto_state *dstate = to->md_data;
+ struct session_op *sess;
+ int digest;
- memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+ if (dstate == NULL || fstate == NULL)
+ return 1;
- if (fstate->mac_len != 0) {
- dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
- memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+
+ sess = &dstate->d_sess;
+
+ digest = digest_nid_to_cryptodev(to->digest->type);
+
+ sess->mackey = dstate->dummy_mac_key;
+ sess->mackeylen = digest_key_length(to->digest->type);
+ sess->mac = digest;
+
+ dstate->d_fd = get_dev_crypto();
+
+ if (ioctl(dstate->d_fd, CIOCGSESSION, sess) < 0) {
+ put_dev_crypto(dstate->d_fd);
+ dstate->d_fd = -1;
+ printf("cryptodev_digest_init: Open session failed\n");
+ return (0);
}
- dstate->copy = 1;
+ if (fstate->mac_len != 0) {
+ if (fstate->mac_data != NULL)
+ {
+ dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
+ memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ dstate->mac_len = fstate->mac_len;
+ }
+ }
return 1;
}
@@ -1347,11 +1378,11 @@ ENGINE_load_cryptodev(void)
* find out what asymmetric crypto algorithms we support
*/
if (ioctl(fd, CIOCASYMFEAT, &cryptodev_asymfeat) == -1) {
- close(fd);
+ put_dev_crypto(fd);
ENGINE_free(engine);
return;
}
- close(fd);
+ put_dev_crypto(fd);
if (!ENGINE_set_id(engine, "cryptodev") ||
!ENGINE_set_name(engine, "BSD cryptodev engine") ||
diff --git a/crypto/engine/eng_dyn.c b/crypto/engine/eng_dyn.c
index 807da7a..8fb8634 100644
--- a/crypto/engine/eng_dyn.c
+++ b/crypto/engine/eng_dyn.c
@@ -408,7 +408,7 @@ static int int_load(dynamic_data_ctx *ctx)
int num, loop;
/* Unless told not to, try a direct load */
if((ctx->dir_load != 2) && (DSO_load(ctx->dynamic_dso,
- ctx->DYNAMIC_LIBNAME, NULL, 0)) != NULL)
+ ctx->DYNAMIC_LIBNAME, NULL, 0) != NULL))
return 1;
/* If we're not allowed to use 'dirs' or we have none, fail */
if(!ctx->dir_load || (num = sk_OPENSSL_STRING_num(ctx->dirs)) < 1)
@@ -423,6 +423,9 @@ static int int_load(dynamic_data_ctx *ctx)
{
/* Found what we're looking for */
OPENSSL_free(merge);
+ /* Previous failed loop iterations, if any, will have resulted in
+ * errors. Clear them out before returning success. */
+ ERR_clear_error();
return 1;
}
OPENSSL_free(merge);
diff --git a/crypto/engine/eng_fat.c b/crypto/engine/eng_fat.c
index db66e62..789b8d5 100644
--- a/crypto/engine/eng_fat.c
+++ b/crypto/engine/eng_fat.c
@@ -176,6 +176,7 @@ int ENGINE_register_all_complete(void)
ENGINE *e;
for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
- ENGINE_register_complete(e);
+ if (!(e->flags & ENGINE_FLAGS_NO_REGISTER_ALL))
+ ENGINE_register_complete(e);
return 1;
}
diff --git a/crypto/engine/engine.h b/crypto/engine/engine.h
index 943aeae..f8be497 100644
--- a/crypto/engine/engine.h
+++ b/crypto/engine/engine.h
@@ -141,6 +141,13 @@ extern "C" {
* the existing ENGINE's structural reference count. */
#define ENGINE_FLAGS_BY_ID_COPY (int)0x0004
+/* This flag is for an ENGINE that does not want its methods registered as
+ * part of ENGINE_register_all_complete(), for example if the methods are
+ * not usable as default methods.
+ */
+
+#define ENGINE_FLAGS_NO_REGISTER_ALL (int)0x0008
+
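+/* Usage sketch (an engine author opting out of blanket registration;
+ * illustrative, not part of this change):
+ *
+ *     ENGINE *e = ENGINE_new();
+ *     ENGINE_set_flags(e, ENGINE_FLAGS_NO_REGISTER_ALL);
+ *
+ * ENGINE_register_all_complete() will then skip e, per the eng_fat.c
+ * hunk above. */
+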
/* ENGINEs can support their own command types, and these flags are used in
* ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each
* command expects. Currently only numeric and string input is supported. If a
@@ -344,6 +351,8 @@ void ENGINE_load_gost(void);
#endif
#endif
void ENGINE_load_cryptodev(void);
+void ENGINE_load_rsax(void);
+void ENGINE_load_rdrand(void);
void ENGINE_load_builtin_engines(void);
/* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation
diff --git a/crypto/engine/tb_asnmth.c b/crypto/engine/tb_asnmth.c
new file mode 100644
index 0000000..7509033
--- /dev/null
+++ b/crypto/engine/tb_asnmth.c
@@ -0,0 +1,246 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include "asn1_locl.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_asn1_meth_engine(), the
+ * function that is used by EVP to hook in pkey_asn1_meth code and cache
+ * defaults (etc), will display brief debugging summaries to stderr with the
+ * 'nid'. */
+/* #define ENGINE_PKEY_ASN1_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_asn1_meth_table = NULL;
+
+void ENGINE_unregister_pkey_asn1_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_asn1_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_asn1_meths(void)
+ {
+ engine_table_cleanup(&pkey_asn1_meth_table);
+ }
+
+int ENGINE_register_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_asn1_meths(void)
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_asn1_meths(e);
+ }
+
+int ENGINE_set_default_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (ie. try to get a functional reference from the tabled structural
+ * references) for a given pkey_asn1_meth 'nid' */
+ENGINE *ENGINE_get_pkey_asn1_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_asn1_meth_table, nid);
+ }
+
+/* Obtains a pkey_asn1_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_ASN1_METHOD *ret;
+ ENGINE_PKEY_ASN1_METHS_PTR fn = ENGINE_get_pkey_asn1_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_ASN1_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_asn1_meth callback from an ENGINE structure */
+ENGINE_PKEY_ASN1_METHS_PTR ENGINE_get_pkey_asn1_meths(const ENGINE *e)
+ {
+ return e->pkey_asn1_meths;
+ }
+
+/* Sets the pkey_asn1_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_asn1_meths(ENGINE *e, ENGINE_PKEY_ASN1_METHS_PTR f)
+ {
+ e->pkey_asn1_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_ASN1_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_asn1_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_ASN1_METHOD *pkm;
+ if (e->pkey_asn1_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_asn1_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_asn1_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_asn1_free(pkm);
+ }
+ }
+ }
+ }
+
+/* Find a method based on a string. This does a linear search through
+ * all implemented algorithms. This is OK in practice because only
+ * a small number of algorithms are likely to be implemented in an engine
+ * and it is not used for speed-critical operations.
+ */
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth_str(ENGINE *e,
+ const char *str, int len)
+ {
+ int i, nidcount;
+ const int *nids;
+ EVP_PKEY_ASN1_METHOD *ameth;
+ if (!e->pkey_asn1_meths)
+ return NULL;
+ if (len == -1)
+ len = strlen(str);
+ nidcount = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ for (i = 0; i < nidcount; i++)
+ {
+ e->pkey_asn1_meths(e, &ameth, NULL, nids[i]);
+ if (((int)strlen(ameth->pem_str) == len) &&
+ !strncasecmp(ameth->pem_str, str, len))
+ return ameth;
+ }
+ return NULL;
+ }
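+
+/* Usage sketch (an engine-provided algorithm looked up by its PEM
+ * string; "MYALG" is a hypothetical pem_str, illustration only):
+ *
+ *     const EVP_PKEY_ASN1_METHOD *ameth =
+ *         ENGINE_get_pkey_asn1_meth_str(e, "MYALG", -1);
+ *     if (ameth != NULL)
+ *         ... use ameth ...
+ */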
+
+typedef struct
+ {
+ ENGINE *e;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ const char *str;
+ int len;
+ } ENGINE_FIND_STR;
+
+static void look_str_cb(int nid, STACK_OF(ENGINE) *sk, ENGINE *def, void *arg)
+ {
+ ENGINE_FIND_STR *lk = arg;
+ int i;
+ if (lk->ameth)
+ return;
+ for (i = 0; i < sk_ENGINE_num(sk); i++)
+ {
+ ENGINE *e = sk_ENGINE_value(sk, i);
+ EVP_PKEY_ASN1_METHOD *ameth;
+ e->pkey_asn1_meths(e, &ameth, NULL, nid);
+ if (((int)strlen(ameth->pem_str) == lk->len) &&
+ !strncasecmp(ameth->pem_str, lk->str, lk->len))
+ {
+ lk->e = e;
+ lk->ameth = ameth;
+ return;
+ }
+ }
+ }
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_pkey_asn1_find_str(ENGINE **pe,
+ const char *str, int len)
+ {
+ ENGINE_FIND_STR fstr;
+ fstr.e = NULL;
+ fstr.ameth = NULL;
+ fstr.str = str;
+ fstr.len = len;
+ CRYPTO_w_lock(CRYPTO_LOCK_ENGINE);
+ engine_table_doall(pkey_asn1_meth_table, look_str_cb, &fstr);
+ /* If found obtain a structural reference to engine */
+ if (fstr.e)
+ {
+ fstr.e->struct_ref++;
+ engine_ref_debug(fstr.e, 0, 1)
+ }
+ *pe = fstr.e;
+ CRYPTO_w_unlock(CRYPTO_LOCK_ENGINE);
+ return fstr.ameth;
+ }
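Since ENGINE_pkey_asn1_find_str() bumps struct_ref on the engine it
returns, the caller owns a structural reference and must release it. A
minimal usage sketch ("MYALG" is again a hypothetical pem_str):

	ENGINE *e = NULL;
	const EVP_PKEY_ASN1_METHOD *ameth;

	ameth = ENGINE_pkey_asn1_find_str(&e, "MYALG", -1);
	if (ameth != NULL)
		{
		/* ... use ameth ... */
		}
	if (e != NULL)
		ENGINE_free(e);	/* release the structural reference */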
diff --git a/crypto/engine/tb_pkmeth.c b/crypto/engine/tb_pkmeth.c
new file mode 100644
index 0000000..1cdb967
--- /dev/null
+++ b/crypto/engine/tb_pkmeth.c
@@ -0,0 +1,167 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_meth_engine(), the function
+ * that is used by EVP to hook in pkey_meth code and cache defaults (etc), will
+ * display brief debugging summaries to stderr with the 'nid'. */
+/* #define ENGINE_PKEY_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_meth_table = NULL;
+
+void ENGINE_unregister_pkey_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_meths(void)
+ {
+ engine_table_cleanup(&pkey_meth_table);
+ }
+
+int ENGINE_register_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_meths()
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_meths(e);
+ }
+
+int ENGINE_set_default_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (ie. try to get a functional reference from the tabled structural
+ * references) for a given pkey_meth 'nid' */
+ENGINE *ENGINE_get_pkey_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_meth_table, nid);
+ }
+
+/* Obtains a pkey_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_METHOD *ENGINE_get_pkey_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_METHOD *ret;
+ ENGINE_PKEY_METHS_PTR fn = ENGINE_get_pkey_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_meth callback from an ENGINE structure */
+ENGINE_PKEY_METHS_PTR ENGINE_get_pkey_meths(const ENGINE *e)
+ {
+ return e->pkey_meths;
+ }
+
+/* Sets the pkey_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_meths(ENGINE *e, ENGINE_PKEY_METHS_PTR f)
+ {
+ e->pkey_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_METHOD *pkm;
+ if (e->pkey_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_meth_free(pkm);
+ }
+ }
+ }
+ }
diff --git a/crypto/err/err.c b/crypto/err/err.c
index 69713a6..fcdb244 100644
--- a/crypto/err/err.c
+++ b/crypto/err/err.c
@@ -1066,6 +1066,13 @@ void ERR_set_error_data(char *data, int flags)
void ERR_add_error_data(int num, ...)
{
va_list args;
+ va_start(args, num);
+ ERR_add_error_vdata(num, args);
+ va_end(args);
+ }
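+
+/* With the split, any varargs front-end can reuse the formatting logic
+ * (sketch; my_add_error is a hypothetical wrapper):
+ *
+ *     void my_add_error(int num, ...)
+ *         {
+ *         va_list args;
+ *         va_start(args, num);
+ *         ERR_add_error_vdata(num, args);
+ *         va_end(args);
+ *         }
+ */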
+
+void ERR_add_error_vdata(int num, va_list args)
+ {
int i,n,s;
char *str,*p,*a;
@@ -1074,7 +1081,6 @@ void ERR_add_error_data(int num, ...)
if (str == NULL) return;
str[0]='\0';
- va_start(args, num);
n=0;
for (i=0; i<num; i++)
diff --git a/crypto/err/err_all.c b/crypto/err/err_all.c
--- a/crypto/err/err_all.c
+++ b/crypto/err/err_all.c
#include <openssl/bio.h>
+#ifndef OPENSSL_NO_COMP
#include <openssl/comp.h>
+#endif
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
@@ -95,6 +97,9 @@
#include <openssl/ui.h>
#include <openssl/ocsp.h>
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include <openssl/ts.h>
#ifndef OPENSSL_NO_CMS
#include <openssl/cms.h>
@@ -102,7 +107,6 @@
#ifndef OPENSSL_NO_JPAKE
#include <openssl/jpake.h>
#endif
-#include <openssl/comp.h>
void ERR_load_crypto_strings(void)
{
@@ -126,7 +130,9 @@ void ERR_load_crypto_strings(void)
ERR_load_ASN1_strings();
ERR_load_CONF_strings();
ERR_load_CRYPTO_strings();
+#ifndef OPENSSL_NO_COMP
ERR_load_COMP_strings();
+#endif
#ifndef OPENSSL_NO_EC
ERR_load_EC_strings();
#endif
@@ -149,12 +155,14 @@ void ERR_load_crypto_strings(void)
#endif
ERR_load_OCSP_strings();
ERR_load_UI_strings();
+#ifdef OPENSSL_FIPS
+ ERR_load_FIPS_strings();
+#endif
#ifndef OPENSSL_NO_CMS
ERR_load_CMS_strings();
#endif
#ifndef OPENSSL_NO_JPAKE
ERR_load_JPAKE_strings();
#endif
- ERR_load_COMP_strings();
#endif
}
diff --git a/crypto/evp/bio_md.c b/crypto/evp/bio_md.c
index 9841e32..144fdfd 100644
--- a/crypto/evp/bio_md.c
+++ b/crypto/evp/bio_md.c
@@ -153,8 +153,12 @@ static int md_write(BIO *b, const char *in, int inl)
{
if (ret > 0)
{
- EVP_DigestUpdate(ctx,(const unsigned char *)in,
- (unsigned int)ret);
+ if (!EVP_DigestUpdate(ctx,(const unsigned char *)in,
+ (unsigned int)ret))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}
if(b->next_bio != NULL)
@@ -220,7 +224,8 @@ static long md_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_DUP:
dbio=ptr;
dctx=dbio->ptr;
- EVP_MD_CTX_copy_ex(dctx,ctx);
+ if (!EVP_MD_CTX_copy_ex(dctx,ctx))
+ return 0;
b->init=1;
break;
default:
diff --git a/crypto/evp/bio_ok.c b/crypto/evp/bio_ok.c
index 98bc1ab..e643353 100644
--- a/crypto/evp/bio_ok.c
+++ b/crypto/evp/bio_ok.c
@@ -133,10 +133,10 @@ static int ok_new(BIO *h);
static int ok_free(BIO *data);
static long ok_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp);
-static void sig_out(BIO* b);
-static void sig_in(BIO* b);
-static void block_out(BIO* b);
-static void block_in(BIO* b);
+static int sig_out(BIO* b);
+static int sig_in(BIO* b);
+static int block_out(BIO* b);
+static int block_in(BIO* b);
#define OK_BLOCK_SIZE (1024*4)
#define OK_BLOCK_BLOCK 4
#define IOBS (OK_BLOCK_SIZE+ OK_BLOCK_BLOCK+ 3*EVP_MAX_MD_SIZE)
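/* Stream layout recap, as background for the error-path changes below:
 * the stream begins with a signature block of two digest-size fields
 * and continues with blocks of the form
 *     [4-byte big-endian payload length][payload][digest]
 * sig_out/sig_in and block_out/block_in now report failures from the
 * EVP_Digest* calls instead of returning void, and the BIO read/write
 * paths fail the operation when they do. */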
@@ -266,10 +266,24 @@ static int ok_read(BIO *b, char *out, int outl)
ctx->buf_len+= i;
/* no signature yet -- check if we got one */
- if (ctx->sigio == 1) sig_in(b);
+ if (ctx->sigio == 1)
+ {
+ if (!sig_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* signature ok -- check if we got block */
- if (ctx->sigio == 0) block_in(b);
+ if (ctx->sigio == 0)
+ {
+ if (!block_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* invalid block -- cancel */
if (ctx->cont <= 0) break;
@@ -293,7 +307,8 @@ static int ok_write(BIO *b, const char *in, int inl)
if ((ctx == NULL) || (b->next_bio == NULL) || (b->init == 0)) return(0);
- if(ctx->sigio) sig_out(b);
+ if(ctx->sigio && !sig_out(b))
+ return 0;
do{
BIO_clear_retry_flags(b);
@@ -332,7 +347,11 @@ static int ok_write(BIO *b, const char *in, int inl)
if(ctx->buf_len >= OK_BLOCK_SIZE+ OK_BLOCK_BLOCK)
{
- block_out(b);
+ if (!block_out(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}while(inl > 0);
@@ -379,7 +398,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_FLUSH:
/* do a final write */
if(ctx->blockout == 0)
- block_out(b);
+ if (!block_out(b))
+ return 0;
while (ctx->blockout)
{
@@ -408,7 +428,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
case BIO_C_SET_MD:
md=ptr;
- EVP_DigestInit_ex(&ctx->md, md, NULL);
+ if (!EVP_DigestInit_ex(&ctx->md, md, NULL))
+ return 0;
b->init=1;
break;
case BIO_C_GET_MD:
@@ -455,7 +476,7 @@ static void longswap(void *_ptr, size_t len)
}
}
-static void sig_out(BIO* b)
+static int sig_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -463,9 +484,10 @@ static void sig_out(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return;
+ if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
/* FIXME: there's absolutely no guarantee this makes any sense at all,
* particularly now EVP_MD_CTX has been restructured.
*/
@@ -474,14 +496,20 @@ static void sig_out(BIO* b)
longswap(&(ctx->buf[ctx->buf_len]), md->digest->md_size);
ctx->buf_len+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
ctx->sigio= 0;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void sig_in(BIO* b)
+static int sig_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -491,15 +519,18 @@ static void sig_in(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return;
+ if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
memcpy(md->md_data, &(ctx->buf[ctx->buf_off]), md->digest->md_size);
longswap(md->md_data, md->digest->md_size);
ctx->buf_off+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
ret= memcmp(&(ctx->buf[ctx->buf_off]), tmp, md->digest->md_size) == 0;
ctx->buf_off+= md->digest->md_size;
if(ret == 1)
@@ -516,9 +547,13 @@ static void sig_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_out(BIO* b)
+static int block_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -532,13 +567,20 @@ static void block_out(BIO* b)
ctx->buf[1]=(unsigned char)(tl>>16);
ctx->buf[2]=(unsigned char)(tl>>8);
ctx->buf[3]=(unsigned char)(tl);
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_in(BIO* b)
+static int block_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -554,10 +596,13 @@ static void block_in(BIO* b)
tl|=ctx->buf[2]; tl<<=8;
tl|=ctx->buf[3];
- if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return;
+ if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return 1;
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
if(memcmp(&(ctx->buf[tl+ OK_BLOCK_BLOCK]), tmp, md->digest->md_size) == 0)
{
/* there might be parts from next block lurking around ! */
@@ -571,5 +616,9 @@ static void block_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c
index c5f9268..2a45d43 100644
--- a/crypto/evp/c_allc.c
+++ b/crypto/evp/c_allc.c
@@ -98,6 +98,9 @@ void OpenSSL_add_all_ciphers(void)
#ifndef OPENSSL_NO_RC4
EVP_add_cipher(EVP_rc4());
EVP_add_cipher(EVP_rc4_40());
+#ifndef OPENSSL_NO_MD5
+ EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
#endif
#ifndef OPENSSL_NO_IDEA
@@ -166,9 +169,9 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_128_cfb1());
EVP_add_cipher(EVP_aes_128_cfb8());
EVP_add_cipher(EVP_aes_128_ofb());
-#if 0
EVP_add_cipher(EVP_aes_128_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_128_gcm());
+ EVP_add_cipher(EVP_aes_128_xts());
EVP_add_cipher_alias(SN_aes_128_cbc,"AES128");
EVP_add_cipher_alias(SN_aes_128_cbc,"aes128");
EVP_add_cipher(EVP_aes_192_ecb());
@@ -177,9 +180,8 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_192_cfb1());
EVP_add_cipher(EVP_aes_192_cfb8());
EVP_add_cipher(EVP_aes_192_ofb());
-#if 0
EVP_add_cipher(EVP_aes_192_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_192_gcm());
EVP_add_cipher_alias(SN_aes_192_cbc,"AES192");
EVP_add_cipher_alias(SN_aes_192_cbc,"aes192");
EVP_add_cipher(EVP_aes_256_ecb());
@@ -188,11 +190,15 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_256_cfb1());
EVP_add_cipher(EVP_aes_256_cfb8());
EVP_add_cipher(EVP_aes_256_ofb());
-#if 0
EVP_add_cipher(EVP_aes_256_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_256_gcm());
+ EVP_add_cipher(EVP_aes_256_xts());
EVP_add_cipher_alias(SN_aes_256_cbc,"AES256");
EVP_add_cipher_alias(SN_aes_256_cbc,"aes256");
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+ EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+ EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
#endif
#ifndef OPENSSL_NO_CAMELLIA
diff --git a/crypto/evp/digest.c b/crypto/evp/digest.c
index 982ba2b..6fc469f 100644
--- a/crypto/evp/digest.c
+++ b/crypto/evp/digest.c
@@ -117,6 +117,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
void EVP_MD_CTX_init(EVP_MD_CTX *ctx)
{
memset(ctx,'\0',sizeof *ctx);
@@ -225,12 +229,26 @@ int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl)
}
if (ctx->flags & EVP_MD_CTX_FLAG_NO_INIT)
return 1;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ if (FIPS_digestinit(ctx, type))
+ return 1;
+ OPENSSL_free(ctx->md_data);
+ ctx->md_data = NULL;
+ return 0;
+ }
+#endif
return ctx->digest->init(ctx);
}
int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestupdate(ctx, data, count);
+#else
return ctx->update(ctx,data,count);
+#endif
}
/* The caller can assume that this removes any secret data from the context */
@@ -245,6 +263,9 @@ int EVP_DigestFinal(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
/* The caller can assume that this removes any secret data from the context */
int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestfinal(ctx, md, size);
+#else
int ret;
OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
@@ -258,6 +279,7 @@ int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
}
memset(ctx->md_data,0,ctx->digest->ctx_size);
return ret;
+#endif
}
int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in)
@@ -351,6 +373,7 @@ void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx)
/* This call frees resources associated with the context */
int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
{
+#ifndef OPENSSL_FIPS
/* Don't assume ctx->md_data was cleaned in EVP_Digest_Final,
* because sometimes only copies of the context are ever finalised.
*/
@@ -363,6 +386,7 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
OPENSSL_cleanse(ctx->md_data,ctx->digest->ctx_size);
OPENSSL_free(ctx->md_data);
}
+#endif
if (ctx->pctx)
EVP_PKEY_CTX_free(ctx->pctx);
#ifndef OPENSSL_NO_ENGINE
@@ -370,6 +394,9 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
/* The EVP_MD we used belongs to an ENGINE, release the
* functional reference we held for this reason. */
ENGINE_finish(ctx->engine);
+#endif
+#ifdef OPENSSL_FIPS
+ FIPS_md_ctx_cleanup(ctx);
#endif
memset(ctx,'\0',sizeof *ctx);
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index bd6c0a3..1bfb5d9 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -1,5 +1,5 @@
/* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -56,57 +56,511 @@
#include <assert.h>
#include <openssl/aes.h>
#include "evp_locl.h"
-
-static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
- const unsigned char *iv, int enc);
+#ifndef OPENSSL_FIPS
+#include "modes_lcl.h"
+#include <openssl/rand.h>
typedef struct
{
AES_KEY ks;
+ block128_f block;
+ union {
+ cbc128_f cbc;
+ ctr128_f ctr;
+ } stream;
} EVP_AES_KEY;
-#define data(ctx) EVP_C_DATA(EVP_AES_KEY,ctx)
-
-IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
- NID_aes_128, 16, 16, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_192, ks, AES, EVP_AES_KEY,
- NID_aes_192, 16, 24, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_256, ks, AES, EVP_AES_KEY,
- NID_aes_256, 16, 32, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-
-#define IMPLEMENT_AES_CFBR(ksize,cbits) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
-
-IMPLEMENT_AES_CFBR(128,1)
-IMPLEMENT_AES_CFBR(192,1)
-IMPLEMENT_AES_CFBR(256,1)
-
-IMPLEMENT_AES_CFBR(128,8)
-IMPLEMENT_AES_CFBR(192,8)
-IMPLEMENT_AES_CFBR(256,8)
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ GCM128_CONTEXT gcm;
+ unsigned char *iv; /* Temporary IV store */
+ int ivlen; /* IV length */
+ int taglen;
+ int iv_gen; /* It is OK to generate IVs */
+ int tls_aad_len; /* TLS AAD length */
+ ctr128_f ctr;
+ } EVP_AES_GCM_CTX;
+
+typedef struct
+ {
+ AES_KEY ks1, ks2; /* AES key schedules to use */
+ XTS128_CONTEXT xts;
+ void (*stream)(const unsigned char *in,
+ unsigned char *out, size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+ } EVP_AES_XTS_CTX;
+
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ int tag_set; /* Set if tag is valid */
+ int len_set; /* Set if message length set */
+ int L, M; /* L and M parameters from RFC3610 */
+ CCM128_CONTEXT ccm;
+ ccm128_f str;
+ } EVP_AES_CCM_CTX;
+
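+/* Cap on how many bytes aes_cfb1_cipher hands to
+ * CRYPTO_cfb128_1_encrypt per call: the bit-oriented CFB1 routine
+ * takes its length in bits, so keeping each chunk below
+ * 2^(bits(size_t)-4) bytes guarantees len*8 cannot overflow size_t. */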
+#define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
+
+#ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+#endif
+#ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+#endif
+#ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ const unsigned char ivec[AES_BLOCK_SIZE]);
+#endif
+#ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+#endif
+
+#if defined(AES_ASM) && !defined(I386_ONLY) && ( \
+ ((defined(__i386) || defined(__i386__) || \
+ defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+#ifdef VPAES_ASM
+#define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#endif
+#ifdef BSAES_ASM
+#define BSAES_CAPABLE VPAES_CAPABLE
+#endif
+/*
+ * AES-NI section
+ */
+#define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
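+/* Bit 57 of OPENSSL_ia32cap_P (bit 25 of CPUID.1:ECX) advertises
+ * AES-NI; the EVP_aes_* getters defined below use it to pick the
+ * aesni_* implementations at run time. */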
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+ {
+ ret = aesni_set_decrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)aesni_cbc_encrypt :
+ NULL;
+ }
+ else {
+ ret = aesni_set_encrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_encrypt;
+ if (mode==EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f)aesni_cbc_encrypt;
+ else if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ else
+ dat->stream.cbc = NULL;
+ }
+
+ if(ret < 0)
+ {
+ EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+ }
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ aesni_cbc_encrypt(in,out,len,ctx->cipher_data,ctx->iv,ctx->encrypt);
+
+ return 1;
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+
+	if (len<bl)	return 1;
+
+	aesni_ecb_encrypt(in,out,len,ctx->cipher_data,ctx->encrypt);
+
+ return 1;
+}
+
+#define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f)aesni_encrypt);
+ gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ /* If we have an iv can set it directly, otherwise use
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If key set use IV, otherwise copy */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+#define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key)
+ {
+ /* key_len is two AES keys */
+ if (enc)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_encrypt;
+ xctx->stream = aesni_xts_encrypt;
+ }
+ else
+ {
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_decrypt;
+ xctx->stream = aesni_xts_decrypt;
+ }
+
+ aesni_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)aesni_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ }
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+#define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)aesni_encrypt);
+ cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
+ (ccm128_f)aesni_ccm64_decrypt_blocks;
+ cctx->key_set = 1;
+ }
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+#define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_init_key, \
+ aesni_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_##mode##_init_key, \
+ aesni_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#else
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+#endif
+
+#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
- int ret;
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
- if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
- || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
- || enc)
- ret=AES_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+ }
else
- ret=AES_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+#ifdef AES_CTR_ASM
+ if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)AES_ctr32_encrypt;
+#endif
+ }
if(ret < 0)
{
@@ -117,4 +571,744 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
}
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.cbc)
+ (*dat->stream.cbc)(in,out,len,&dat->ks,ctx->iv,ctx->encrypt);
+ else if (ctx->encrypt)
+ CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+ else
+		CRYPTO_cbc128_decrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+
+ return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+ size_t i;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (len<bl)	return 1;
+
+	for (i=0,len-=bl;i<=len;i+=bl)
+		(*dat->block)(in+i,out+i,&dat->ks);
+
+ return 1;
+}
+
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_ofb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,dat->block);
+ return 1;
+}
+
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_8_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (ctx->flags&EVP_CIPH_FLAG_LENGTH_BITS) {
+ CRYPTO_cfb128_1_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+ }
+
+ while (len>=MAXBITCHUNK) {
+ CRYPTO_cfb128_1_encrypt(in,out,MAXBITCHUNK*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ len-=MAXBITCHUNK;
+ }
+ if (len)
+ CRYPTO_cfb128_1_encrypt(in,out,len*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+
+ return 1;
+}
+
+static int aes_ctr_cipher (EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ unsigned int num = ctx->num;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.ctr)
+ CRYPTO_ctr128_encrypt_ctr32(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->stream.ctr);
+ else
+ CRYPTO_ctr128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->block);
+ ctx->num = (size_t)num;
+ return 1;
+}
+
+BLOCK_CIPHER_generic_pack(NID_aes,128,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,192,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,256,EVP_CIPH_FLAG_FIPS)
+
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ return 1;
+ }
+
+/* increment counter (64-bit int) by 1 */
+static void ctr64_inc(unsigned char *counter) {
+ int n=8;
+ unsigned char c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ gctx->key_set = 0;
+ gctx->iv_set = 0;
+ gctx->ivlen = c->cipher->iv_len;
+ gctx->iv = c->iv;
+ gctx->taglen = -1;
+ gctx->iv_gen = 0;
+ gctx->tls_aad_len = -1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IVLEN:
+ if (arg <= 0)
+ return 0;
+#ifdef OPENSSL_FIPS
+ if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)
+ && arg < 12)
+ return 0;
+#endif
+ /* Allocate memory for IV if needed */
+ if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen))
+ {
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ gctx->iv = OPENSSL_malloc(arg);
+ if (!gctx->iv)
+ return 0;
+ }
+ gctx->ivlen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_TAG:
+ if (arg <= 0 || arg > 16 || c->encrypt)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->taglen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_GET_TAG:
+ if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+ return 0;
+ memcpy(ptr, c->buf, arg);
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_FIXED:
+ /* Special case: -1 length restores whole IV */
+ if (arg == -1)
+ {
+ memcpy(gctx->iv, ptr, gctx->ivlen);
+ gctx->iv_gen = 1;
+ return 1;
+ }
+ /* Fixed field must be at least 4 bytes and invocation field
+ * at least 8.
+ */
+ if ((arg < 4) || (gctx->ivlen - arg) < 8)
+ return 0;
+ if (arg)
+ memcpy(gctx->iv, ptr, arg);
+ if (c->encrypt &&
+ RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+ return 0;
+ gctx->iv_gen = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_IV_GEN:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0)
+ return 0;
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ if (arg <= 0 || arg > gctx->ivlen)
+ arg = gctx->ivlen;
+ memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+ /* Invocation field will be at least 8 bytes in size and
+ * so no need to check wrap around or increment more than
+ * last 8 bytes.
+ */
+ ctr64_inc(gctx->iv + gctx->ivlen - 8);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_INV:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+ return 0;
+ memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ /* Save the AAD for later use */
+ if (arg != 13)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->tls_aad_len = arg;
+ {
+ unsigned int len=c->buf[arg-2]<<8|c->buf[arg-1];
+ /* Correct length for explicit IV */
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ /* If decrypting correct for tag too */
+ if (!c->encrypt)
+ len -= EVP_GCM_TLS_TAG_LEN;
+ c->buf[arg-2] = len>>8;
+ c->buf[arg-1] = len & 0xff;
+ }
+ /* Extra padding: tag appended to record */
+ return EVP_GCM_TLS_TAG_LEN;
+
+ default:
+ return -1;
+
+ }
+ }
+
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ { do {
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ {
+ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)AES_encrypt);
+ gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ break;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)vpaes_encrypt);
+ gctx->ctr = NULL;
+ break;
+ }
+#endif
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+#ifdef AES_CTR_ASM
+ gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
+#else
+ gctx->ctr = NULL;
+#endif
+ } while (0);
+
+ /* If we have an iv can set it directly, otherwise use
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If key set use IV, otherwise copy */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+/* Handle TLS GCM packet format. This consists of the last portion of the IV
+ * followed by the payload and finally the tag. On encrypt generate IV,
+ * encrypt payload and write the tag. On verify retrieve IV, decrypt payload
+ * and verify tag.
+ */
+
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ int rv = -1;
+ /* Encrypt/decrypt must be performed in place */
+ if (out != in || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN+EVP_GCM_TLS_TAG_LEN))
+ return -1;
+ /* Set IV from start of buffer or generate IV and write to start
+ * of buffer.
+ */
+ if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+ EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+ EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+ goto err;
+ /* Use saved AAD */
+ if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+ goto err;
+ /* Fix buffer and length to point to payload */
+ in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ if (ctx->encrypt)
+ {
+ /* Encrypt payload */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ out += len;
+ /* Finally write tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+ rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ }
+ else
+ {
+ /* Decrypt */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ /* Retrieve tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf,
+ EVP_GCM_TLS_TAG_LEN);
+ /* If tag mismatch wipe buffer */
+ if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN))
+ {
+ OPENSSL_cleanse(out, len);
+ goto err;
+ }
+ rv = len;
+ }
+
+ err:
+ gctx->iv_set = 0;
+ gctx->tls_aad_len = -1;
+ return rv;
+ }
+
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ /* If not set up, return error */
+ if (!gctx->key_set)
+ return -1;
+
+ if (gctx->tls_aad_len >= 0)
+ return aes_gcm_tls_cipher(ctx, out, in, len);
+
+ if (!gctx->iv_set)
+ return -1;
+ if (in)
+ {
+ if (out == NULL)
+ {
+ if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+ return -1;
+ }
+ else if (ctx->encrypt)
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ else
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ return len;
+ }
+ else
+ {
+ if (!ctx->encrypt)
+ {
+ if (gctx->taglen < 0)
+ return -1;
+ if (CRYPTO_gcm128_finish(&gctx->gcm,
+ ctx->buf, gctx->taglen) != 0)
+ return -1;
+ gctx->iv_set = 0;
+ return 0;
+ }
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
+ gctx->taglen = 16;
+ /* Don't reuse the IV */
+ gctx->iv_set = 0;
+ return 0;
+ }
+
+ }
+
+#define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \
+ | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
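+/* The GCM and CCM definitions below manage their own buffering:
+ * EVP_CIPH_FLAG_CUSTOM_CIPHER routes all data (including AAD calls
+ * made with a NULL output buffer) straight to the do_cipher hook,
+ * EVP_CIPH_ALWAYS_CALL_INIT runs init_key even when key is NULL, and
+ * EVP_CIPH_CTRL_INIT delivers EVP_CTRL_INIT at context setup. */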
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_XTS_CTX *xctx = c->cipher_data;
+ if (type != EVP_CTRL_INIT)
+ return -1;
+ /* key1 and key2 are used as an indicator both key and IV are set */
+ xctx->xts.key1 = NULL;
+ xctx->xts.key2 = NULL;
+ return 1;
+ }
+
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key) do
+ {
+#ifdef AES_XTS_ASM
+ xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+#else
+ xctx->stream = NULL;
+#endif
+ /* key_len is two AES keys */
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ if (enc)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_encrypt;
+ }
+ else
+ {
+ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_decrypt;
+ }
+
+ vpaes_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)vpaes_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ }
+#endif
+ if (enc)
+ {
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_encrypt;
+ }
+ else
+ {
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_decrypt;
+ }
+
+ AES_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)AES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ } while (0);
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!xctx->xts.key1 || !xctx->xts.key2)
+ return 0;
+	if (!out || !in || len<AES_BLOCK_SIZE)
+		return 0;
+#ifdef OPENSSL_FIPS
+	/* Requirement of SP800-38E */
+	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
+ (len > (1UL<<20)*16))
+ {
+ EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
+ return 0;
+ }
+#endif
+ if (xctx->stream)
+ (*xctx->stream)(in, out, len,
+ xctx->xts.key1, xctx->xts.key2, ctx->iv);
+ else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+ ctx->encrypt))
+ return 0;
+ return 1;
+ }
+
+#define aes_xts_cleanup NULL
+
+#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_CCM_CTX *cctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ cctx->key_set = 0;
+ cctx->iv_set = 0;
+ cctx->L = 8;
+ cctx->M = 12;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_IVLEN:
+ arg = 15 - arg;
+ case EVP_CTRL_CCM_SET_L:
+ if (arg < 2 || arg > 8)
+ return 0;
+ cctx->L = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_TAG:
+ if ((arg & 1) || arg < 4 || arg > 16)
+ return 0;
+ if ((c->encrypt && ptr) || (!c->encrypt && !ptr))
+ return 0;
+ if (ptr)
+ {
+ cctx->tag_set = 1;
+ memcpy(c->buf, ptr, arg);
+ }
+ cctx->M = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_GET_TAG:
+ if (!c->encrypt || !cctx->tag_set)
+ return 0;
+ if(!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+ return 0;
+ cctx->tag_set = 0;
+ cctx->iv_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ default:
+ return -1;
+
+ }
+ }
+
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) do
+ {
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)vpaes_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ break;
+ }
+#endif
+ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)AES_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ } while (0);
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ CCM128_CONTEXT *ccm = &cctx->ccm;
+ /* If not set up, return error */
+ if (!cctx->iv_set && !cctx->key_set)
+ return -1;
+ if (!ctx->encrypt && !cctx->tag_set)
+ return -1;
+ if (!out)
+ {
+ if (!in)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L,len))
+ return -1;
+ cctx->len_set = 1;
+ return len;
+ }
+ /* If have AAD need message length */
+ if (!cctx->len_set && len)
+ return -1;
+ CRYPTO_ccm128_aad(ccm, in, len);
+ return len;
+ }
+ /* EVP_*Final() doesn't return any data */
+ if (!in)
+ return 0;
+ /* If not set length yet do it */
+ if (!cctx->len_set)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+ return -1;
+ cctx->len_set = 1;
+ }
+ if (ctx->encrypt)
+ {
+ if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ CRYPTO_ccm128_encrypt(ccm, in, out, len))
+ return -1;
+ cctx->tag_set = 1;
+ return len;
+ }
+ else
+ {
+ int rv = -1;
+ if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ !CRYPTO_ccm128_decrypt(ccm, in, out, len))
+ {
+ unsigned char tag[16];
+ if (CRYPTO_ccm128_tag(ccm, tag, cctx->M))
+ {
+ if (!memcmp(tag, ctx->buf, cctx->M))
+ rv = len;
+ }
+ }
+ if (rv == -1)
+ OPENSSL_cleanse(out, len);
+ cctx->iv_set = 0;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return rv;
+ }
+
+ }
+
+#define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+
+#endif
#endif
diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644
index 0000000..483e04b
--- /dev/null
+++ b/crypto/evp/e_aes_cbc_hmac_sha1.c
@@ -0,0 +1,580 @@
+/* ====================================================================
+ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+#include "evp_locl.h"
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+typedef struct
+ {
+ AES_KEY ks;
+ SHA_CTX head,tail,md;
+ size_t payload_length; /* AAD length in decrypt case */
+ union {
+ unsigned int tls_ver;
+ unsigned char tls_aad[16]; /* 13 used */
+ } aux;
+ } EVP_AES_HMAC_SHA1;
+
+#define NO_PAYLOAD_LENGTH ((size_t)-1)
+
+#if defined(AES_ASM) && ( \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
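+/* BSWAP gives a single-instruction byte swap via gcc inline asm; the
+ * constant-time MAC code below uses it to produce the big-endian
+ * SHA-1 bit count and digest words without per-byte shifts. */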
+#if defined(__GNUC__) && __GNUC__>=2 && !defined(PEDANTIC)
+# define BSWAP(x) ({ unsigned int r=(x); asm ("bswapl %0":"=r"(r):"0"(r)); r; })
+#endif
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+ const AES_KEY *key, unsigned char iv[16],
+ SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+ const unsigned char *inkey,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ int ret;
+
+ if (enc)
+ ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks);
+ else
+ ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+ SHA1_Init(&key->head); /* handy when benchmarking */
+ key->tail = key->head;
+ key->md = key->head;
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return ret<0?0:1;
+ }
+
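+/* With STITCHED_CALL defined, the encrypt path hands whole
+ * SHA1-block-aligned runs to aesni_cbc_sha1_enc(), which interleaves
+ * ("stitches") the AES-CBC rounds with the SHA-1 compression
+ * function instead of making two separate passes over the data. */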
+#define STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{ const unsigned char *ptr = data;
+ size_t res;
+
+ if ((res = c->num)) {
+ res = SHA_CBLOCK-res;
+		if (len<res) res=len;
+		SHA1_Update(c,ptr,res);
+		ptr += res;
+		len -= res;
+	}
+
+	res = len & (SHA_CBLOCK-1);
+	len -= res;
+
+	if (len) {
+		sha1_block_data_order(c,ptr,len/SHA_CBLOCK);
+
+		ptr += len;
+		c->Nh += len>>29;
+ c->Nl += len<<=3;
+ if (c->Nl<(unsigned int)len) c->Nh++;
+ }
+
+ if (res)
+ SHA1_Update(c,ptr,res);
+}
+
+#ifdef SHA1_Update
+#undef SHA1_Update
+#endif
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ unsigned int l;
+ size_t plen = key->payload_length,
+ iv = 0, /* explicit IV in TLS 1.1 and later */
+ sha_off = 0;
+#if defined(STITCHED_CALL)
+ size_t aes_off = 0,
+ blocks;
+
+ sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ if (len%AES_BLOCK_SIZE) return 0;
+
+ if (ctx->encrypt) {
+ if (plen==NO_PAYLOAD_LENGTH)
+ plen = len;
+ else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+ return 0;
+ else if (key->aux.tls_ver >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+ if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+ SHA1_Update(&key->md,in+iv,sha_off);
+
+ aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+ ctx->iv,&key->md,in+iv+sha_off);
+ blocks *= SHA_CBLOCK;
+ aes_off += blocks;
+ sha_off += blocks;
+ key->md.Nh += blocks>>29;
+ key->md.Nl += blocks<<=3;
+ if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+ } else {
+ sha_off = 0;
+ }
+#endif
+ sha_off += iv;
+ SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+ if (plen!=len) { /* "TLS" mode of operation */
+ if (in!=out)
+ memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+ /* calculate HMAC and append it to payload */
+ SHA1_Final(out+plen,&key->md);
+ key->md = key->tail;
+ SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+ SHA1_Final(out+plen,&key->md);
+
+ /* pad the payload|hmac */
+ plen += SHA_DIGEST_LENGTH;
+			for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+			/* encrypt HMAC|padding at once */
+			aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+ } else {
+ aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+ &key->ks,ctx->iv,1);
+ }
+ } else {
+ union { unsigned int u[SHA_DIGEST_LENGTH/sizeof(unsigned int)];
+ unsigned char c[32+SHA_DIGEST_LENGTH]; } mac, *pmac;
+
+ /* arrange cache line alignment */
+ pmac = (void *)(((size_t)mac.c+31)&((size_t)0-32));
+
+ /* decrypt HMAC|padding at once */
+ aesni_cbc_encrypt(in,out,len,
+ &key->ks,ctx->iv,0);
+
+ if (plen) { /* "TLS" mode of operation */
+ size_t inp_len, mask, j, i;
+ unsigned int res, maxpad, pad, bitlen;
+ int ret = 1;
+ union { unsigned int u[SHA_LBLOCK];
+ unsigned char c[SHA_CBLOCK]; }
+ *data = (void *)key->md.data;
+
+ if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+ >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+ if (len<(iv+SHA_DIGEST_LENGTH+1))
+ return 0;
+
+ /* omit explicit iv */
+ out += iv;
+ len -= iv;
+
+ /* figure out payload length */
+ pad = out[len-1];
+ maxpad = len-(SHA_DIGEST_LENGTH+1);
+ maxpad |= (255-maxpad)>>(sizeof(maxpad)*8-8);
+ maxpad &= 255;
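+			/* the two lines above clamp maxpad to 255 (the
+			 * TLS padding maximum) without a data-dependent
+			 * branch */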
+
+ inp_len = len - (SHA_DIGEST_LENGTH+pad+1);
+ mask = (0-((inp_len-len)>>(sizeof(inp_len)*8-1)));
+ inp_len &= mask;
+ ret &= (int)mask;
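+			/* this mask arithmetic, and the masked per-block
+			 * hashing below, keep the padding and MAC checks
+			 * free of data-dependent branches and memory access
+			 * patterns; this is the Lucky Thirteen
+			 * (CVE-2013-0169) mitigation */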
+
+ key->aux.tls_aad[plen-2] = inp_len>>8;
+ key->aux.tls_aad[plen-1] = inp_len;
+
+ /* calculate HMAC */
+ key->md = key->head;
+ SHA1_Update(&key->md,key->aux.tls_aad,plen);
+
+#if 1
+ len -= SHA_DIGEST_LENGTH; /* amend mac */
+ if (len>=(256+SHA_CBLOCK)) {
+ j = (len-(256+SHA_CBLOCK))&(0-SHA_CBLOCK);
+ j += SHA_CBLOCK-key->md.num;
+ SHA1_Update(&key->md,out,j);
+ out += j;
+ len -= j;
+ inp_len -= j;
+ }
+
+ /* but pretend as if we hashed padded payload */
+ bitlen = key->md.Nl+(inp_len<<3); /* at most 18 bits */
+#ifdef BSWAP
+ bitlen = BSWAP(bitlen);
+#else
+ mac.c[0] = 0;
+ mac.c[1] = (unsigned char)(bitlen>>16);
+ mac.c[2] = (unsigned char)(bitlen>>8);
+ mac.c[3] = (unsigned char)bitlen;
+ bitlen = mac.u[0];
+#endif
+
+ pmac->u[0]=0;
+ pmac->u[1]=0;
+ pmac->u[2]=0;
+ pmac->u[3]=0;
+ pmac->u[4]=0;
+
+			for (res=key->md.num, j=0;j<maxpad+SHA_DIGEST_LENGTH;j++) {
+				size_t c = out[j];
+				mask = (j-inp_len)>>(sizeof(j)*8-8);
+ c &= mask;
+ c |= 0x80&~mask&~((inp_len-j)>>(sizeof(j)*8-8));
+ data->c[res++]=(unsigned char)c;
+
+ if (res!=SHA_CBLOCK) continue;
+
+ mask = 0-((inp_len+8-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+ res=0;
+ }
+
+			for(i=res;i<SHA_CBLOCK;i++,j++) data->c[i]=0;
+
+ if (res>SHA_CBLOCK-8) {
+ mask = 0-((inp_len+8-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+ memset(data,0,SHA_CBLOCK);
+ j+=64;
+ }
+ data->u[SHA_LBLOCK-1] = bitlen;
+ sha1_block_data_order(&key->md,data,1);
+ mask = 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+#ifdef BSWAP
+ pmac->u[0] = BSWAP(pmac->u[0]);
+ pmac->u[1] = BSWAP(pmac->u[1]);
+ pmac->u[2] = BSWAP(pmac->u[2]);
+ pmac->u[3] = BSWAP(pmac->u[3]);
+ pmac->u[4] = BSWAP(pmac->u[4]);
+#else
+ for (i=0;i<5;i++) {
+ res = pmac->u[i];
+ pmac->c[4*i+0]=(unsigned char)(res>>24);
+ pmac->c[4*i+1]=(unsigned char)(res>>16);
+ pmac->c[4*i+2]=(unsigned char)(res>>8);
+ pmac->c[4*i+3]=(unsigned char)res;
+ }
+#endif
+ len += SHA_DIGEST_LENGTH;
+#else
+ SHA1_Update(&key->md,out,inp_len);
+ res = key->md.num;
+ SHA1_Final(pmac->c,&key->md);
+
+ {
+ unsigned int inp_blocks, pad_blocks;
+
+ /* but pretend as if we hashed padded payload */
+ inp_blocks = 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+ res += (unsigned int)(len-inp_len);
+ pad_blocks = res / SHA_CBLOCK;
+ res %= SHA_CBLOCK;
+ pad_blocks += 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+			for (;inp_blocks<pad_blocks;inp_blocks++)