diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1d2c71f3..edd7e4f2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -73,6 +73,9 @@ jobs: picotool info -a flash_nuke.uf2 test-examples: + # Prevent running twice for PRs from same repo + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name + name: Test Build Examples runs-on: ubuntu-latest steps: - name: Checkout diff --git a/BUILD.bazel b/BUILD.bazel index d120f500..5944db8e 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -17,6 +17,20 @@ picotool_binary_data_header( out = "xip_ram_perms_elf.h", ) +# TODO: Make it possible to build the prebuilt from source. +picotool_binary_data_header( + name = "enc_bootloader_elf", + src = "//enc_bootloader:enc_bootloader_prebuilt", + out = "enc_bootloader_elf.h", +) + +# TODO: Make it possible to build the prebuilt from source. +picotool_binary_data_header( + name = "enc_bootloader_mbedtls_elf", + src = "//enc_bootloader:enc_bootloader_mbedtls_prebuilt", + out = "enc_bootloader_mbedtls_elf.h", +) + # TODO: Make it possible to build the prebuilt from source. 
picotool_binary_data_header( name = "flash_id_bin", @@ -26,9 +40,9 @@ picotool_binary_data_header( cc_library( name = "xip_ram_perms", - srcs = ["xip_ram_perms.cpp"], + srcs = ["get_xip_ram_perms.cpp"], hdrs = [ - "xip_ram_perms.h", + "get_xip_ram_perms.h", "xip_ram_perms_elf.h", ], deps = [ @@ -37,6 +51,20 @@ cc_library( ], ) +cc_library( + name = "enc_bootloader", + srcs = ["get_enc_bootloader.cpp"], + hdrs = [ + "get_enc_bootloader.h", + "enc_bootloader_elf.h", + "enc_bootloader_mbedtls_elf.h", + ], + deps = [ + "//bazel:data_locs", + "//lib/whereami", + ], +) + filegroup( name = "data_locs_header", srcs = ["data_locs.h"], @@ -61,7 +89,8 @@ cc_binary( "otp.cpp", "otp.h", "rp2350.rom.h", - "xip_ram_perms.cpp", + "get_xip_ram_perms.cpp", + "get_enc_bootloader.cpp", ] + select({ # MSVC can't handle long strings, so use this manually generated # header instead. @@ -97,6 +126,7 @@ cc_binary( }), deps = [ ":xip_ram_perms", + ":enc_bootloader", "//bazel:data_locs", "//bintool", "//elf", diff --git a/CMakeLists.txt b/CMakeLists.txt index e9f14b4d..b89af796 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,7 +42,6 @@ endif() # todo better install paths for this set(DATA_LOCS "./" "${CMAKE_INSTALL_PREFIX}/${INSTALL_DATADIR}/") -message(${DATA_LOCS}) string(REGEX REPLACE ";" "\",\"" DATA_LOCS_VEC "${DATA_LOCS}") configure_file(data_locs.template.cpp ${CMAKE_CURRENT_BINARY_DIR}/data_locs.cpp) @@ -57,11 +56,49 @@ endif() list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) +add_subdirectory(lib) + +if (NOT DEFINED USE_PRECOMPILED) + set(USE_PRECOMPILED true) +endif() + +# compile enc_bootloader.elf +ExternalProject_Add(enc_bootloader + PREFIX enc_bootloader + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/enc_bootloader + BINARY_DIR ${CMAKE_BINARY_DIR}/enc_bootloader + CMAKE_ARGS + "-DCMAKE_MAKE_PROGRAM:FILEPATH=${CMAKE_MAKE_PROGRAM}" + "-DPICO_SDK_PATH:FILEPATH=${PICO_SDK_PATH}" + "-DUSE_PRECOMPILED:BOOL=${USE_PRECOMPILED}" + "-DUSE_MBEDTLS=0" + 
"-DPICO_DEBUG_INFO_IN_RELEASE=OFF" + BUILD_ALWAYS 1 # todo remove this + INSTALL_COMMAND "" + ) + +set(ENC_BOOTLOADER_ELF ${CMAKE_BINARY_DIR}/enc_bootloader/enc_bootloader.elf) + +if (TARGET mbedtls) + ExternalProject_Add(enc_bootloader_mbedtls + PREFIX enc_bootloader_mbedtls + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/enc_bootloader + BINARY_DIR ${CMAKE_BINARY_DIR}/enc_bootloader_mbedtls + CMAKE_ARGS + "-DCMAKE_MAKE_PROGRAM:FILEPATH=${CMAKE_MAKE_PROGRAM}" + "-DPICO_SDK_PATH:FILEPATH=${PICO_SDK_PATH}" + "-DUSE_PRECOMPILED:BOOL=${USE_PRECOMPILED}" + "-DUSE_MBEDTLS=1" + "-DPICO_DEBUG_INFO_IN_RELEASE=OFF" + BUILD_ALWAYS 1 # todo remove this + INSTALL_COMMAND "" + ) + + set(ENC_BOOTLOADER_MBEDTLS_ELF ${CMAKE_BINARY_DIR}/enc_bootloader_mbedtls/enc_bootloader.elf) +endif() + if (NOT PICOTOOL_NO_LIBUSB) # compile xip_ram_perms.elf - if (NOT DEFINED USE_PRECOMPILED) - set(USE_PRECOMPILED true) - endif() ExternalProject_Add(xip_ram_perms PREFIX xip_ram_perms SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/xip_ram_perms @@ -76,14 +113,6 @@ if (NOT PICOTOOL_NO_LIBUSB) ) set(XIP_RAM_PERMS_ELF ${CMAKE_BINARY_DIR}/xip_ram_perms/xip_ram_perms.elf) - add_executable(xip_ram_perms_elf IMPORTED) - add_dependencies(xip_ram_perms_elf xip_ram_perms) - set_property(TARGET xip_ram_perms_elf PROPERTY IMPORTED_LOCATION ${XIP_RAM_PERMS_ELF}) - # copy xip_ram_perms.elf into build directory - add_custom_command(TARGET xip_ram_perms - COMMAND ${CMAKE_COMMAND} -E copy ${XIP_RAM_PERMS_ELF} ${CMAKE_BINARY_DIR}/xip_ram_perms.elf - DEPENDS xip_ram_perms - ) # compile flash_id ExternalProject_Add(flash_id @@ -100,14 +129,6 @@ if (NOT PICOTOOL_NO_LIBUSB) ) set(FLASH_ID_BIN ${CMAKE_BINARY_DIR}/picoboot_flash_id/flash_id.bin) - add_executable(flash_id_bin IMPORTED) - add_dependencies(flash_id_bin flash_id) - set_property(TARGET flash_id_bin PROPERTY IMPORTED_LOCATION ${FLASH_ID_BIN}) - # copy flash_id.bin into build directory - add_custom_command(TARGET flash_id - COMMAND ${CMAKE_COMMAND} -E copy ${FLASH_ID_BIN} 
${CMAKE_BINARY_DIR}/flash_id.bin - DEPENDS flash_id - ) # We want to generate headers from WELCOME.HTM etc. ExternalProject_Add(otp_header_parser @@ -169,7 +190,16 @@ if (NOT PICOTOOL_NO_LIBUSB) endif() endif() -add_custom_target(binary_data DEPENDS +if (TARGET mbedtls) + add_custom_target(embedded_data_no_libusb DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/enc_bootloader_elf.h + ${CMAKE_CURRENT_BINARY_DIR}/enc_bootloader_mbedtls_elf.h) +else() + add_custom_target(embedded_data_no_libusb DEPENDS + ${CMAKE_CURRENT_BINARY_DIR}/enc_bootloader_elf.h) +endif() + +add_custom_target(embedded_data DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/rp2350.rom.h ${CMAKE_CURRENT_BINARY_DIR}/xip_ram_perms_elf.h ${CMAKE_CURRENT_BINARY_DIR}/flash_id_bin.h) @@ -188,6 +218,22 @@ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/xip_ram_perms_elf.h DEPENDS xip_ram_perms COMMENT "Configuring xip_ram_perms_elf.h" VERBATIM) +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/enc_bootloader_elf.h + COMMAND ${CMAKE_COMMAND} + -D BINARY_FILE=${ENC_BOOTLOADER_ELF} + -D OUTPUT_NAME=enc_bootloader_elf + -P ${CMAKE_CURRENT_LIST_DIR}/cmake/binh.cmake + DEPENDS enc_bootloader + COMMENT "Configuring enc_bootloader_elf.h" + VERBATIM) +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/enc_bootloader_mbedtls_elf.h + COMMAND ${CMAKE_COMMAND} + -D BINARY_FILE=${ENC_BOOTLOADER_MBEDTLS_ELF} + -D OUTPUT_NAME=enc_bootloader_mbedtls_elf + -P ${CMAKE_CURRENT_LIST_DIR}/cmake/binh.cmake + DEPENDS enc_bootloader_mbedtls + COMMENT "Configuring enc_bootloader_mbedtls_elf.h" + VERBATIM) add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/flash_id_bin.h COMMAND ${CMAKE_COMMAND} -D BINARY_FILE=${FLASH_ID_BIN} @@ -203,8 +249,6 @@ add_subdirectory(picoboot_connection) add_subdirectory(elf) add_subdirectory(elf2uf2) -add_subdirectory(lib) - add_subdirectory(bintool) if (NOT PICOTOOL_NO_LIBUSB) @@ -228,11 +272,13 @@ target_include_directories(regs_headers INTERFACE ${PICO_SDK_PATH}/src/rp2350/ha # Main picotool executable 
add_executable(picotool data_locs.cpp + get_enc_bootloader.cpp ${OTP_EXE} main.cpp) +add_dependencies(picotool embedded_data_no_libusb) if (NOT PICOTOOL_NO_LIBUSB) - target_sources(picotool PRIVATE xip_ram_perms.cpp) - add_dependencies(picotool generate_otp_header xip_ram_perms_elf binary_data) + target_sources(picotool PRIVATE get_xip_ram_perms.cpp) + add_dependencies(picotool generate_otp_header embedded_data) endif() set(PROJECT_VERSION 2.1.2-develop) set(PICOTOOL_VERSION 2.1.2-develop) @@ -327,6 +373,21 @@ install(FILES DESTINATION ${INSTALL_CONFIGDIR} ) +#Install enc_bootloader.elf +install(FILES + ${ENC_BOOTLOADER_ELF} + DESTINATION ${INSTALL_DATADIR} +) + +if (TARGET mbedtls) + #Install enc_bootloader_mbedtls.elf + install(FILES + ${ENC_BOOTLOADER_MBEDTLS_ELF} + DESTINATION ${INSTALL_DATADIR} + RENAME enc_bootloader_mbedtls.elf + ) +endif() + if (NOT PICOTOOL_NO_LIBUSB) if (NOT PICOTOOL_CODE_OTP) #Install the otp json diff --git a/README.md b/README.md index 65753107..376d3da5 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,10 @@ SYNOPSIS: picotool config [-s ] [-g ] [-t ] picotool load [--ignore-partitions] [--family ] [-p ] [-n] [-N] [-u] [-v] [-x] [-t ] [-o ] [device-selection] - picotool encrypt [--quiet] [--verbose] [--hash] [--sign] [-t ] [-o ] [-t ] [-t ] - [] [-t ] - picotool seal [--quiet] [--verbose] [--hash] [--sign] [--clear] [-t ] [-o ] [-t ] [] [-t - ] [] [-t ] [--major ] [--minor ] [--rollback [..]] + picotool encrypt [--quiet] [--verbose] [--embed] [--fast-rosc] [--use-mbedtls] [--otp-key-page ] [--hash] [--sign] [-t + ] [-o ] [-t ] + picotool seal [--quiet] [--verbose] [--hash] [--sign] [--clear] [-t ] [-o ] [-t ] + [--major ] [--minor ] [--rollback [..]] picotool link [--quiet] [--verbose] [-t ] [-t ] [-t ] [] [-t ] [-p ] picotool save [-p] [-v] [--family ] [-t ] [device-selection] picotool save -a [-v] [--family ] [-t ] [device-selection] @@ -33,7 +33,7 @@ SYNOPSIS: picotool partition info|create picotool uf2 info|convert picotool 
version [-s] [] - picotool coprodis [--quiet] [--verbose] [-t ] [-t ] + picotool coprodis [--quiet] [--verbose] picotool help [] COMMANDS: @@ -578,14 +578,18 @@ SEAL: Add final metadata to a binary, optionally including a hash and/or signature. SYNOPSIS: - picotool seal [--quiet] [--verbose] [--hash] [--sign] [--clear] [-t ] [-o ] [-t ] [] [-t - ] [] [-t ] [--major ] [--minor ] [--rollback [..]] + picotool seal [--quiet] [--verbose] [--hash] [--sign] [--clear] [-t ] [-o ] [-t ] + [--major ] [--minor ] [--rollback [..]] OPTIONS: --quiet Don't print any output --verbose Print verbose output + + Key file (.pem) + + JSON file to save OTP to (will edit existing file if it exists) --major Add Major Version --minor @@ -614,21 +618,11 @@ OPTIONS: The file name -t Specify file type (uf2 | elf | bin) explicitly, ignoring file extension - Key file - - The file name - -t - Specify file type (pem) explicitly, ignoring file extension - File to save OTP to (will edit existing file if it exists) - - The file name - -t - Specify file type (json) explicitly, ignoring file extension ``` ## encrypt -`encrypt` allows you to encrypt and sign a binary for use on the RP2350. By default, it will sign the encrypted binary, but that can be configured similarly to `picotool seal`. +`encrypt` allows you to encrypt and sign a binary for use on the RP2350. By default, it will sign the encrypted binary, but that can be configured similarly to `picotool seal`. You can either provide your own bootloader to decrypt the binary (see pico-examples/bootloaders/encrypted), or embed a decrypting bootloader into the binary with the `--embed` argument, to create a self-decrypting binary. The encrypted binary will have the following structure: @@ -638,7 +632,15 @@ The encrypted binary will have the following structure: - Padding to ensure the encrypted length is a multiple of 4 words - Signature metadata block -The AES key must be provided as a .bin file of the 256 bit AES key to be used for encryption. 
+The AES key can either be provided as a 1024-bit (128 byte) 4-way key share or as a 256-bit (32 byte) AES key. In the latter case, a key share will be generated for you, as the decryption code needs a 4-way key share in OTP rather than a plain key. + +The 4-way key share should be generated by creating a random .bin file of length 128 bytes and the AES key will be derived from that. With the words stored in the file as A[0], B[0], C[0], D[0], A[1], B[1], etc., C[7], D[7], word i of the key X is X[i] = A[i] ^ B[i] ^ C[i] ^ D[i]. + +Alternatively an AES key can be provided directly, as either a 32 byte .bin file or as a string of 64 hexadecimal characters (eg 0x0123456789abcdeffedcba98765432100123456789abcdeffedcba9876543210). Picotool will generate a random key share for that key, by generating 3 random shares (A, B & C) and calculating a 4th share (D) to match the provided key (X), with each word i of the 4th share calculated as D[i] = X[i] ^ A[i] ^ B[i] ^ C[i]. + +When using a .bin file, picotool will use the file size to determine if a 128 byte key share or a 32 byte key has been provided - any other file size will throw an error. + +The encryption/decryption code also salts the IV (initialisation vector) so it's not stored in plaintext in the binary. This requires a per-device IV salt in OTP, which should be a 128-bit (16 byte) random value - this can be provided as either a 16 byte .bin file, or a string of 32 hexadecimal characters, similar to the AES key. This per-device salt will be XORed with the IV stored in the binary, to give the IV used by the decryption code. ```text $ picotool help encrypt @@ -646,14 +648,32 @@ ENCRYPT: Encrypt the program. 
SYNOPSIS: - picotool encrypt [--quiet] [--verbose] [--hash] [--sign] [-t ] [-o ] [-t ] [-t ] - [] [-t ] + picotool encrypt [--quiet] [--verbose] [--embed] [--fast-rosc] [--use-mbedtls] [--otp-key-page ] [--hash] [--sign] [-t + ] [-o ] [-t ] OPTIONS: --quiet Don't print any output --verbose Print verbose output + --embed + Embed bootloader in output file + --fast-rosc + Use ~180MHz ROSC configuration for embedded bootloader + --use-mbedtls + Use MbedTLS implementation of embedded bootloader (faster but less secure) + --otp-key-page + Specify the OTP page storing the AES key (IV salt is stored on the next page) + + OTP page (default 30) + + AES Key Share or AES Key + + IV Salt + + Signing Key file (.pem) + + JSON file to save OTP to (will edit existing file if it exists) Signing Configuration --hash Hash the encrypted file @@ -674,16 +694,6 @@ OPTIONS: The file name -t Specify file type (uf2 | elf | bin) explicitly, ignoring file extension - AES Key - - The file name - -t - Specify file type (bin) explicitly, ignoring file extension - Signing Key file - - The file name - -t - Specify file type (pem) explicitly, ignoring file extension ``` ## partition @@ -752,19 +762,16 @@ PARTITION CREATE: Create a partition table from json SYNOPSIS: - picotool partition create [--quiet] [--verbose] [-t ] [-t ] [[-o ] [--family ]] - [] [-t ] [[--sign ] [-t ] [--no-hash] [--singleton]] [[--abs-block] []] + picotool partition create [--quiet] [--verbose] [-t ] [[-o ] [--family ]] [] [-t + ] [[--sign ] [-t ] [--no-hash] [--singleton]] [[--abs-block] []] OPTIONS: --quiet Don't print any output --verbose Print verbose output - partition table JSON - The file name - -t - Specify file type (json) explicitly, ignoring file extension + partition table JSON output file The file name @@ -899,9 +906,9 @@ SYNOPSIS: picotool otp get [-c ] [-r] [-e] [-n] [-i ] [device-selection] [-z] [..] 
picotool otp set [-c ] [-r] [-e] [-s] [-i ] [-z] [device-selection] picotool otp load [-r] [-e] [-s ] [-i ] [-t ] [device-selection] - picotool otp dump [-r] [-e] [device-selection] - picotool otp permissions [-t ] [--led ] [--hash] [--sign] [] [-t ] [device-selection] - picotool otp white-label -s [-t ] [device-selection] + picotool otp dump [-r] [-e] [-p] [device-selection] + picotool otp permissions [--led ] [--hash] [--sign] [device-selection] + picotool otp white-label -s [device-selection] SUB COMMANDS: list List matching known registers/fields @@ -1104,14 +1111,11 @@ OTP WHITE-LABEL: Set the white labelling values in OTP SYNOPSIS: - picotool otp white-label -s [-t ] [device-selection] + picotool otp white-label -s [device-selection] OPTIONS: - File with white labelling values - The file name - -t - Specify file type (json) explicitly, ignoring file extension + JSON file with white labelling values Target device selection --bus Filter devices by USB bus number @@ -1183,14 +1187,11 @@ OTP PERMISSIONS: Set the OTP access permissions SYNOPSIS: - picotool otp permissions [-t ] [--led ] [--hash] [--sign] [] [-t ] [device-selection] + picotool otp permissions [--led ] [--hash] [--sign] [device-selection] OPTIONS: - File to load permissions from - The file name - -t - Specify file type (json) explicitly, ignoring file extension + JSON file to load permissions from --led LED Pin to flash; default 25 Signing Configuration @@ -1198,11 +1199,8 @@ OPTIONS: Hash the executable --sign Sign the executable - Key file - The file name - -t - Specify file type (pem) explicitly, ignoring file extension + Key file (.pem) Target device selection --bus Filter devices by USB bus number @@ -1253,7 +1251,7 @@ OTP DUMP: Dump entire OTP SYNOPSIS: - picotool otp dump [-r] [-e] [device-selection] + picotool otp dump [-r] [-e] [-p] [device-selection] OPTIONS: Row/field options @@ -1261,6 +1259,8 @@ OPTIONS: Get raw 24-bit values. 
This is the default -e, --ecc Use error correction + -p, --pages + Index by page number & row number TARGET SELECTION: Target device selection @@ -1332,23 +1332,17 @@ COPRODIS: Post-process coprocessor instructions in disassembly files. SYNOPSIS: - picotool coprodis [--quiet] [--verbose] [-t ] [-t ] + picotool coprodis [--quiet] [--verbose] OPTIONS: --quiet Don't print any output --verbose Print verbose output - Input DIS - The file name - -t - Specify file type (uf2 | elf | bin) explicitly, ignoring file extension - Output DIS + Input DIS - The file name - -t - Specify file type (uf2 | elf | bin) explicitly, ignoring file extension + Output DIS ``` ## link diff --git a/bintool/bintool.cpp b/bintool/bintool.cpp index 19debf6c..5df18d24 100644 --- a/bintool/bintool.cpp +++ b/bintool/bintool.cpp @@ -260,6 +260,7 @@ block place_new_block(elf_file *elf, std::unique_ptr &first_block) { int32_t loop_start_rel = 0; uint32_t new_block_addr = 0; + std::unique_ptr new_first_block; if (!first_block->next_block_rel) { set_next_block(elf, first_block, highest_address); loop_start_rel = -first_block->next_block_rel; @@ -267,7 +268,6 @@ block place_new_block(elf_file *elf, std::unique_ptr &first_block) { } else { DEBUG_LOG("There is already a block loop\n"); uint32_t next_block_addr = first_block->physical_addr + first_block->next_block_rel; - std::unique_ptr new_first_block; while (true) { auto segment = elf->segment_from_physical_address(next_block_addr); if (segment == nullptr) { @@ -314,10 +314,32 @@ block place_new_block(elf_file *elf, std::unique_ptr &first_block) { // loop back to first block block new_block(new_block_addr, loop_start_rel); - // copt the existing block - std::copy(first_block->items.begin(), - first_block->items.end(), - std::back_inserter(new_block.items)); + // check if last block has an image_def + if (new_first_block != nullptr && new_first_block->get_item() != nullptr) { + // copy the last block items + std::copy(new_first_block->items.begin(), + 
new_first_block->items.end(), + std::back_inserter(new_block.items)); + } else { + // copy the first block items + std::copy(first_block->items.begin(), + first_block->items.end(), + std::back_inserter(new_block.items)); + } + + // Delete existing signature and hash as these will be replaced with new ones + std::shared_ptr signature = new_block.get_item(); + if (signature != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), signature), new_block.items.end()); + } + std::shared_ptr hash_value = new_block.get_item(); + if (hash_value != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), hash_value), new_block.items.end()); + } + std::shared_ptr hash_def = new_block.get_item(); + if (hash_def != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), hash_def), new_block.items.end()); + } return new_block; } @@ -404,13 +426,14 @@ block place_new_block(std::vector &bin, uint32_t storage_addr, std::uni int32_t loop_start_rel = 0; uint32_t new_block_addr = 0; + std::unique_ptr new_first_block; if (!first_block->next_block_rel) { set_next_block(bin, storage_addr, first_block, highest_address); loop_start_rel = -first_block->next_block_rel; new_block_addr = first_block->physical_addr + first_block->next_block_rel; } else { DEBUG_LOG("Ooh, there is already a block loop - lets find it's end\n"); - auto new_first_block = get_last_block(bin, storage_addr, first_block); + new_first_block = get_last_block(bin, storage_addr, first_block); set_next_block(bin, storage_addr, new_first_block, highest_address); new_block_addr = new_first_block->physical_addr + new_first_block->next_block_rel; loop_start_rel = first_block->physical_addr - new_block_addr; @@ -421,10 +444,32 @@ block place_new_block(std::vector &bin, uint32_t storage_addr, std::uni // loop back to first block block new_block(new_block_addr, loop_start_rel); - // copt the existing block - 
std::copy(first_block->items.begin(), - first_block->items.end(), - std::back_inserter(new_block.items)); + // check if last block has an image_def + if (new_first_block != nullptr && new_first_block->get_item() != nullptr) { + // copy the last block items + std::copy(new_first_block->items.begin(), + new_first_block->items.end(), + std::back_inserter(new_block.items)); + } else { + // copy the first block items + std::copy(first_block->items.begin(), + first_block->items.end(), + std::back_inserter(new_block.items)); + } + + // Delete existing signature and hash as these will be replaced with new ones + std::shared_ptr signature = new_block.get_item(); + if (signature != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), signature), new_block.items.end()); + } + std::shared_ptr hash_value = new_block.get_item(); + if (hash_value != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), hash_value), new_block.items.end()); + } + std::shared_ptr hash_def = new_block.get_item(); + if (hash_def != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), hash_def), new_block.items.end()); + } return new_block; } @@ -625,7 +670,7 @@ std::vector get_lm_hash_data(elf_file *elf, block *new_block, bool clea if (data.size() != seg->physical_size()) { fail(ERROR_INCOMPATIBLE, "Elf segment physical size (%" PRIx32 ") does not match data size in file (%zx)", seg->physical_size(), data.size()); } - if (seg->physical_size() && seg->physical_address() < new_block->physical_addr) { + if (seg->physical_size()) { std::copy(data.begin(), data.end(), std::back_inserter(to_hash)); DEBUG_LOG("HASH %08x + %08x\n", (int)seg->physical_address(), (int)seg->physical_size()); entries.push_back( @@ -855,8 +900,7 @@ void verify_block(std::vector bin, uint32_t storage_addr, uint32_t runt } -int encrypt(elf_file *elf, block *new_block, const private_t aes_key, const public_t 
public_key, const private_t private_key, bool hash_value, bool sign) { - +void encrypt_guts(elf_file *elf, block *new_block, const aes_key_t aes_key, std::vector &iv_data, std::vector &enc_data) { std::vector to_enc = get_lm_hash_data(elf, new_block); std::random_device rand{}; @@ -872,12 +916,26 @@ int encrypt(elf_file *elf, block *new_block, const private_t aes_key, const publ e = rand(); } - std::vector iv_data(iv.bytes, iv.bytes + sizeof(iv.bytes)); + iv_data.resize(sizeof(iv.bytes)); + memcpy(iv_data.data(), iv.bytes, sizeof(iv.bytes)); - std::vector enc_data; enc_data.resize(to_enc.size()); aes256_buffer(to_enc.data(), to_enc.size(), enc_data.data(), &aes_key, &iv); +} + + +int encrypt(elf_file *elf, block *new_block, const aes_key_t aes_key, const public_t public_key, const private_t private_key, std::vector iv_salt, bool hash_value, bool sign) { + + std::vector iv_data; + std::vector enc_data; + encrypt_guts(elf, new_block, aes_key, iv_data, enc_data); + + // Salt IV + assert(iv_data.size() == iv_salt.size()); + for (int i=0; i < iv_data.size(); i++) { + iv_data[i] ^= iv_salt[i]; + } unsigned int i=0; for(const auto &seg : sorted_segs(elf)) { @@ -961,7 +1019,7 @@ int encrypt(elf_file *elf, block *new_block, const private_t aes_key, const publ } -std::vector encrypt(std::vector bin, uint32_t storage_addr, uint32_t runtime_addr, block *new_block, const private_t aes_key, const public_t public_key, const private_t private_key, bool hash_value, bool sign) { +std::vector encrypt(std::vector bin, uint32_t storage_addr, uint32_t runtime_addr, block *new_block, const aes_key_t aes_key, const public_t public_key, const private_t private_key, std::vector iv_salt, bool hash_value, bool sign) { std::random_device rand{}; assert(rand.max() - rand.min() >= 256); @@ -977,6 +1035,12 @@ std::vector encrypt(std::vector bin, uint32_t storage_addr, ui std::vector iv_data(iv.bytes, iv.bytes + sizeof(iv.bytes)); + // Salt IV + assert(iv_data.size() == iv_salt.size()); + for (int 
i=0; i < iv_data.size(); i++) { + iv_data[i] ^= iv_salt[i]; + } + std::vector enc_data; enc_data.resize(bin.size()); diff --git a/bintool/bintool.h b/bintool/bintool.h index 2c574cde..c4ddbec2 100644 --- a/bintool/bintool.h +++ b/bintool/bintool.h @@ -25,7 +25,8 @@ std::unique_ptr find_first_block(elf_file *elf); block place_new_block(elf_file *elf, std::unique_ptr &first_block); #if HAS_MBEDTLS int hash_andor_sign(elf_file *elf, block *new_block, const public_t public_key, const private_t private_key, bool hash_value, bool sign, bool clear_sram = false); - int encrypt(elf_file *elf, block *new_block, const private_t aes_key, const public_t public_key, const private_t private_key, bool hash_value, bool sign); + void encrypt_guts(elf_file *elf, block *new_block, const aes_key_t aes_key, std::vector &iv_data, std::vector &enc_data); + int encrypt(elf_file *elf, block *new_block, const aes_key_t aes_key, const public_t public_key, const private_t private_key, std::vector iv_salt, bool hash_value, bool sign); #endif // Bins @@ -37,6 +38,6 @@ block place_new_block(std::vector &bin, uint32_t storage_addr, std::uni uint32_t calc_checksum(std::vector bin); #if HAS_MBEDTLS std::vector hash_andor_sign(std::vector bin, uint32_t storage_addr, uint32_t runtime_addr, block *new_block, const public_t public_key, const private_t private_key, bool hash_value, bool sign, bool clear_sram = false); - std::vector encrypt(std::vector bin, uint32_t storage_addr, uint32_t runtime_addr, block *new_block, const private_t aes_key, const public_t public_key, const private_t private_key, bool hash_value, bool sign); + std::vector encrypt(std::vector bin, uint32_t storage_addr, uint32_t runtime_addr, block *new_block, const aes_key_t aes_key, const public_t public_key, const private_t private_key, std::vector iv_salt, bool hash_value, bool sign); void verify_block(std::vector bin, uint32_t storage_addr, uint32_t runtime_addr, block *block, verified_t &hash_verified, verified_t &sig_verified, 
get_more_bin_cb more_cb = nullptr); #endif diff --git a/bintool/mbedtls_wrapper.c b/bintool/mbedtls_wrapper.c index 0fcf083f..c623ebdc 100644 --- a/bintool/mbedtls_wrapper.c +++ b/bintool/mbedtls_wrapper.c @@ -40,7 +40,49 @@ void mb_sha256_buffer(const uint8_t *data, size_t len, message_digest_t *digest_ mbedtls_sha256(data, len, digest_out->bytes, 0); } -void mb_aes256_buffer(const uint8_t *data, size_t len, uint8_t *data_out, const private_t *key, iv_t *iv) { +#if IV0_XOR +// Taken from mbedtls_aes_crypt_ctr, but with XOR instead of adding to IV0 +int mb_aes_crypt_ctr_xor(mbedtls_aes_context *ctx, + size_t length, + unsigned char iv0[16], + unsigned char nonce_xor[16], + unsigned char stream_block[16], + const unsigned char *input, + unsigned char *output) +{ + int c; + int ret = 0; + size_t n = 0; + uint32_t counter = 0; + + assert(length == (uint32_t)length); + + while (length--) { + if (n == 0) { + for (int i = 16; i > 0; i--) { + nonce_xor[i-1] = iv0[i-1]; + if (i > 16 - sizeof(counter)) { + nonce_xor[i-1] ^= (unsigned char)(counter >> ((16-i)*8)); + } + } + + ret = mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, nonce_xor, stream_block); + if (ret != 0) { + break; + } + counter++; + } + c = *input++; + *output++ = (unsigned char) (c ^ stream_block[n]); + + n = (n + 1) & 0x0F; + } + + return ret; +} +#endif + +void mb_aes256_buffer(const uint8_t *data, size_t len, uint8_t *data_out, const aes_key_t *key, iv_t *iv) { mbedtls_aes_context aes; assert(len % 16 == 0); @@ -48,7 +90,12 @@ void mb_aes256_buffer(const uint8_t *data, size_t len, uint8_t *data_out, const mbedtls_aes_setkey_enc(&aes, key->bytes, 256); uint8_t stream_block[16] = {0}; size_t nc_off = 0; +#if IV0_XOR + uint8_t xor_working_block[16] = {0}; + mb_aes_crypt_ctr_xor(&aes, len, iv->bytes, xor_working_block, stream_block, data, data_out); +#else mbedtls_aes_crypt_ctr(&aes, len, &nc_off, iv->bytes, stream_block, data, data_out); +#endif } void raw_to_der(signature_t *sig) { diff --git 
a/bintool/mbedtls_wrapper.h b/bintool/mbedtls_wrapper.h index c96909cc..48c06f7b 100644 --- a/bintool/mbedtls_wrapper.h +++ b/bintool/mbedtls_wrapper.h @@ -19,6 +19,16 @@ extern "C" { #include #include +/* + * Use XOR of counter with IV0 to generate the IV for each encrypted block + * + * ie IV = IV0 ^ block_number, rather than the default IV = IV0 + block_number + * + * The power signature for this calculation is easier to mask on RP2350 than + * adding the block number to the IV0 + */ +#define IV0_XOR 1 + #ifdef __cplusplus #define _Static_assert static_assert #endif @@ -40,11 +50,27 @@ typedef struct iv { uint8_t bytes[16]; } iv_t; /**< Convenience typedef */ +typedef struct aes_key { + /** An array of 32 bytes of key data. */ + union { + uint8_t bytes[32]; + uint32_t words[8]; + }; +} aes_key_t; /**< Convenience typedef */ + +typedef struct aes_key_share { + /** An array of 128 bytes of key data, one word from each share at a time. */ + union { + uint8_t bytes[128]; + uint32_t words[32]; + }; +} aes_key_share_t; /**< Convenience typedef */ + typedef signature_t public_t; typedef message_digest_t private_t; void mb_sha256_buffer(const uint8_t *data, size_t len, message_digest_t *digest_out); -void mb_aes256_buffer(const uint8_t *data, size_t len, uint8_t *data_out, const private_t *key, iv_t *iv); +void mb_aes256_buffer(const uint8_t *data, size_t len, uint8_t *data_out, const aes_key_t *key, iv_t *iv); void mb_sign_sha256(const uint8_t *entropy, size_t entropy_size, const message_digest_t *m, const public_t *p, const private_t *d, signature_t *out); uint32_t mb_verify_signature_secp256k1( diff --git a/enc_bootloader/BUILD.bazel b/enc_bootloader/BUILD.bazel new file mode 100644 index 00000000..ba986770 --- /dev/null +++ b/enc_bootloader/BUILD.bazel @@ -0,0 +1,23 @@ +package(default_visibility = ["//visibility:public"]) + +filegroup( + name = "enc_bootloader_prebuilt", + srcs = ["enc_bootloader.elf"], +) + +filegroup( + name = "enc_bootloader_mbedtls_prebuilt", + srcs = 
["enc_bootloader_mbedtls.elf"], +) + +# TODO: Make this work. +cc_library( + name = "enc_bootloader", + srcs = ["enc_bootloader.c"], + tags = ["manual"], + deps = [ + "//:enc_bootloader", + "@pico-sdk//src/rp2_common/pico_stdlib", + "@pico-sdk//src/rp2_common/pico_rand", + ], +) diff --git a/enc_bootloader/CMakeLists.txt b/enc_bootloader/CMakeLists.txt new file mode 100644 index 00000000..45f3cc16 --- /dev/null +++ b/enc_bootloader/CMakeLists.txt @@ -0,0 +1,103 @@ +cmake_minimum_required(VERSION 3.12) + +if (NOT USE_PRECOMPILED) + set(PICO_PLATFORM rp2350-arm-s) + + set(PICO_NO_PICOTOOL 1) + + # Ensure we're using a MinSizeRel build + set(CMAKE_BUILD_TYPE MinSizeRel) + + # If the user set these environment variables to influence the picotool + # build, unset them here so that they do not influence the pico-sdk + # build. This is especially required for flags that are not supported + # by arm-none-eabi compilers. + unset(ENV{CFLAGS}) + unset(ENV{CXXFLAGS}) + unset(ENV{LDFLAGS}) + + # Pull in SDK (must be before project) + include(${PICO_SDK_PATH}/external/pico_sdk_import.cmake) + + project(enc_bootloader C CXX ASM) + set(CMAKE_C_STANDARD 11) + set(CMAKE_CXX_STANDARD 17) + + if (PICO_SDK_VERSION_STRING VERSION_LESS "2.1.2") + message(FATAL_ERROR "Raspberry Pi Pico SDK version 2.1.2 (or later) required. 
Your version is ${PICO_SDK_VERSION_STRING}") + endif() + + # Initialize the SDK + pico_sdk_init() + + # Encrypted Bootloader + add_executable(enc_bootloader + enc_bootloader.c + ) + + target_link_libraries(enc_bootloader + pico_stdlib + ) + + if (USE_MBEDTLS) + target_sources(enc_bootloader PRIVATE mbedtls_aes.c) + + target_link_libraries(enc_bootloader pico_mbedtls) + + target_compile_definitions(enc_bootloader PRIVATE + PICO_STACK_SIZE=0x800 + # 0x20080000 -> 0x20081000 doesn't overlap the stack + ROM_CHAIN_WORKSPACE=0x20080000) + + target_include_directories(enc_bootloader PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + + pico_set_linker_script(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/memmap_mbedtls.ld) + else() + target_sources(enc_bootloader PRIVATE aes.S) + + target_compile_definitions(enc_bootloader PRIVATE + PICO_STACK_SIZE=0x180 + # AES Code & workspace from 0x20080044 -> 0x20081604, so 0x20080200 -> 0x20081200 is inside that + ROM_CHAIN_WORKSPACE=0x20080200) + + pico_set_linker_script(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/memmap_enc_bootloader.ld) + endif() + + target_compile_definitions(enc_bootloader PRIVATE + # use stack guards, as AES variables are written near the stack + PICO_USE_STACK_GUARDS=1 + # The following are to reduce the size of the binary + PICO_NO_PROGRAM_INFO=1 + # No spinlocks used + PICO_USE_SW_SPIN_LOCKS=0 + # No heap is used + PICO_HEAP_SIZE=0 + # These inits are not required + PICO_RUNTIME_SKIP_INIT_SPIN_LOCKS_RESET=1 + PICO_RUNTIME_SKIP_INIT_PER_CORE_IRQ_PRIORITIES=1 + PICO_BOOTROM_LOCKING_ENABLED=0 + # Don't need any vtor irqs + PICO_MINIMAL_STORED_VECTOR_TABLE=1 + PICO_NO_RAM_VECTOR_TABLE=1 + ) + + # print memory usage + target_link_options(enc_bootloader PUBLIC -Wl,--print-memory-usage) + + pico_minimize_runtime(enc_bootloader) + + pico_set_binary_type(enc_bootloader no_flash) + pico_add_dis_output(enc_bootloader) +else() + project(enc_bootloader C CXX ASM) + message("Using precompiled enc_bootloader.elf") + if (USE_MBEDTLS) + 
configure_file(${CMAKE_CURRENT_LIST_DIR}/enc_bootloader_mbedtls.elf ${CMAKE_CURRENT_BINARY_DIR}/enc_bootloader.elf COPYONLY) + else() + configure_file(${CMAKE_CURRENT_LIST_DIR}/enc_bootloader.elf ${CMAKE_CURRENT_BINARY_DIR}/enc_bootloader.elf COPYONLY) + endif() + # Use manually specified variables + set(NULL ${CMAKE_MAKE_PROGRAM}) + set(NULL ${PICO_SDK_PATH}) + set(NULL ${PICO_DEBUG_INFO_IN_RELEASE}) +endif() diff --git a/enc_bootloader/aes.S b/enc_bootloader/aes.S new file mode 100644 index 00000000..e3ef4644 --- /dev/null +++ b/enc_bootloader/aes.S @@ -0,0 +1,1950 @@ +/* MEMORY LAYOUT ASSUMPTIONS + +The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see +the macro getchaffaddress. + +The stack must be located at the end of Y scratch RAM: see the memory +wiping at the end of ctr_crypt_s where memory between the start of Y +scratch RAM and the stack pointer is overwritten. +*/ + +.syntax unified +.cpu cortex-m33 +.thumb + +#include "config.h" +#include "hardware/platform_defs.h" +#include "hardware/regs/addressmap.h" +#include "hardware/regs/clocks.h" +#include "hardware/regs/sha256.h" +#include "hardware/regs/resets.h" +#include "hardware/regs/rosc.h" +#include "hardware/regs/trng.h" +#include "hardware/rcp.h" + +.global decrypt +.global chaff + +.extern lock_key + +@ RCP macros + +#define CTAG0 0x2a +#define CTAG1 0x2b +#define CTAG2 0x2c +#define CTAG3 0x2d +#define CTAG4 0x2e +#define CTAG5 0x30 +#define CTAG6 0x31 +#define CTAG7 0x32 +#define CTAG8 0x33 +#define CTAG9 0x34 +#define CTAG10 0x35 @ not used +#define CTAG11 0x36 @ not used +#define CTAG12 0x37 +#define CTAG13 0x38 +#define CTAG14 0x39 +#define CTAG15 0x3a +#define CTAG16 0x3b +#define CTAG17 0x3c +#define CTAG18 0x3d @ not used + +@ number of blocks from the TRNG processed to initialise rstate_sha +#define TRNG_BLOCKS 25 + +@ The lower jitterpriority is, the more the jitter +.macro SET_COUNT n,jitterpriority +.if RC_COUNT +.if RC_JITTER > \jitterpriority + rcp_count_set \n 
+.else + rcp_count_set_nodelay \n +.endif +.endif +.endm + +.macro CHK_COUNT n,jitterpriority +.if RC_COUNT +.if RC_JITTER > \jitterpriority + rcp_count_check \n +.else + rcp_count_check_nodelay \n +.endif +.endif +.endm + +.macro GET_CANARY rx,tag,jitterpriority +.if RC_CANARY +.if RC_JITTER > \jitterpriority + rcp_canary_get \rx,\tag +.else + rcp_canary_get_nodelay \rx,\tag +.endif +.endif +.endm + +.macro CHK_CANARY rx,tag,jitterpriority +.if RC_CANARY +.if RC_JITTER > \jitterpriority + rcp_canary_check \rx,\tag +.else + rcp_canary_check_nodelay \rx,\tag +.endif +.endif +.endm + +@ Clear internal stripe load registers, and r0-r3 +@ 0 <= offset <= 32 +.macro clear03 offset=0 + getchaffaddress r0,\offset + ldmia r0,{r0-r3} +.endm + +.macro clear03_preserve_r3 offset=0 + getchaffaddress r0,\offset + ldmia r0!,{r1-r2} + ldmia r0!,{r1-r2} +.endm + +.macro clear01 offset=0 + getchaffaddress r0,\offset + ldmia r0,{r0,r1} +.endm + +@ Put workspace in the second scratch area +@ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants, +@ otherwise they may end up silently replaced with 0 or 0xffffffff +.section .scratch_y.aes,"aw",%progbits + +workspace_start: + +@ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress +@ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000) +@ getchaffaddress is used by clear03 and clear01 and other sensitive cases which require the first load to be a random one +@ chaff has to be 0 mod 16 for other reasons +.macro getchaffaddress rx,offset=0 +@ ldr \rx,=(chaff+\offset) + mov \rx,#(0x1000+\offset) + movt \rx,#0x2008 +.endm +chaff: +.space 48 + +.balign 16 +rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words + @ see comment at init_key_4way for description of layout and meaning of rkey_s +.space 600 +rkey4way: @ scratch area for init_key_4way; could overlap this with 
other scratch space if need to save space +.space 128 +.if CT_BPERM +bperm_rand: @ 32 half words that define the oblivious permutation of blocks +.space 64 +.endif + +.balign 16 +permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) +perm16: +.space 16 +@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s +.balign 16 +fourway: @ Must be 0 mod 16 +shareA: @ 0 mod 16 +.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 +shareB: @ 4 mod 16 +.space 20 +shareC: @ 8 mod 16 +.space 4 +statevperm: @ 12 mod 16 +.space 4 @ vperm state rotation: only last two bits are operational; other bits random +RKshareC: @ Round key common share C; see comment at init_key_4way for explanation +.space 4 +RKshareCchange: @ Temporary used by ref_roundkey_shares_s +.space 4 +IV0: @ 2-way share of IV for block 0 +.space 36 @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16) + @ The gap at IV0[4] is to defeat unsharing by internal striped memory registers + @ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless + +@ Regardless of configuration, the code uses a single 256-entry LUT, +@ which is a simple S-box table. +@ The LUT is represented as two shares, lut_a and lut_b, +@ whose values must be EORed. Furthermore, the contents of each share are +@ scrambled according to a 4-byte "map". The map comprises two bytes that +@ are EORed into the addressing of the share, and two bytes that are +@ EORed into the data read back from the share. Performing a lookup +@ of a value x involves computing +@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ c₁ ^ lut_b[x ^ b₀ ^ b₁] ^ d₀ ^ d₁ +@ where a₀, a₁, c₀ and c₁ are the "map" of the lut_a share and +@ b₀, b₁, d₀ and d₁ are the "map" of the lut_b share. 
+@ In practice the result of a lookup is itself represented in two +@ shares, namely +@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀ and +@ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁ +.balign 16 +lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup) +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b +.space 4 +.space 4 @ align to 8 mod 16 +lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sbox knows how to mask the lookup) +.space 256 +lut_b_map: +.space 4 +.space 4 @ align to multiple of 8 + +.balign 16 
+rstate_all_start: @ Mark start of RNG data to allow selective memory wipe +rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero +.space 16 +jstate: @ 32-bit jitter state +.space 4 +rstate_lfsr: @ 32-bit LFSR random state and constant used to step it +.space 4 +.word 0x1d872b41 @ constant that defines a maximal-length LFSR +rstate_all_end: @ Mark end of RNG data to allow selective memory wipe + +.if CT_BPERM +.balign 16 +murmur3_constants: @ Five constants used in murmur3_32 hash +.word 0xcc9e2d51 +.word 0x1b873593 +.word 0xe6546b64 +.word 0x85ebca6b +.word 0xc2b2ae35 +.endif + +scratch_y_end: + +@ Initialisation code in main .text section +.section .text,"ax",%progbits + +@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments. +@ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some +@ random numbers. +@ Trashes r0-r6 +.balign 4 +init_rstate: + CHK_COUNT 24,6 + ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET + ldr r5,=SHA256_BASE + movs r1,#1 + str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] + ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] @ reads as 0 + movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit + str r1,[r5,#SHA256_CSR_OFFSET] + str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET] + movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving + @ time for previous SHA computation to complete +2: + movs r1,#0xff @ TRNG setup is inside loop in case it is skipped. 
+ str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET] @ disable checks and bypass decorrelators, to stream raw TRNG ROSC samples + str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started + str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD) + adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET + movs r2,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET +1: + ldr r1,[r4,r2] @ wait for 192 ROSC samples to fill EHR, should take constant time + cmp r1,#0 + bne 1b + subs r6,#1 @ done? + beq 3f + movs r1,#8 +1: + ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1) + str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block + subs r1,#1 + bne 1b + ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length + str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] + b.n 2b + +3: + CHK_COUNT 25,6 + str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 + str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] + adds r5,r5,#SHA256_SUM0_OFFSET +@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc) + ldmia r5,{r0-r3} @ load first 4 words of the 8 word SHA256 output + ldr r6,=rstate_sha +@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc), r6=rstate_sha + stmia r6,{r0-r3} + CHK_COUNT 26,6 + movs r0,#0 + strb r0,[r6] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" + +@ try to find a non-zero initialiser to create a non-degenerate LFSR random state + ldr r1,[r5,#16] @ SHA SUM4 + cbnz r1,1f @ is word 4 non-zero? then use it + ldr r1,[r5,#20] @ SHA SUM5 + cbnz r1,1f @ otherwise, is word 5 non-zero? 
use it + mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) +1: + str r1,[r6,#rstate_lfsr-rstate_sha] + +@ try to find a non-zero initialiser to create a non-degenerate ROSC random state + ldr r1,[r5,#24] @ SHA SUM6 + cbnz r1,1f @ is word 6 non-zero? then use it + ldr r1,[r5,#28] @ SHA SUM7 + cbnz r1,1f @ otherwise, is word 7 non-zero? use it + mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) +1: + ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE + str r1,[r2,#0] @ Initialise ROSC LFSR + CHK_COUNT 27,6 + +.if GEN_RAND_SHA +.if SH_JITTER + movs r2,#0 + str r2,[r6,#jstate-rstate_sha] +.endif +.endif + + CHK_COUNT 28,6 + bx r14 + +@ Put AES core code in first scratch area +.section .scratch_x.aes,"ax",%progbits + +.if GEN_RAND_SHA +@ we need SHA256_SUM0_OFFSET==8 (see note below) +.if SHA256_SUM0_OFFSET!=8 +.err +.endif + +@ Return single random word in r0 +@ Preserves r1-r13 +.balign 4 +gen_rand_sha: + push {r14} + GET_CANARY r14,CTAG1,2 + push {r1-r3,r14} +.if SH_JITTER + ldr r2,=rstate_sha + ldr r0,[r2,#jstate-rstate_sha] + movs r1,#1 + ands r3,r0,#3 + movs r3,r3,lsl#2 + movs r3,r1,lsl r3 @ 1<<(4*(r0&3)) + udiv r3,r3,r1 @ Takes constant + (r0&3) cycles + lsrs r0,r0,#2 + bne 1f + bl gen_rand_sha_nonpres + ldr r2,=rstate_sha +1: + str r0,[r2,#jstate-rstate_sha] +.endif + bl gen_rand_sha_nonpres + pop {r1-r3,r14} + CHK_CANARY r14,CTAG1,0 + pop {r15} + +@ Return single random word in r0 +@ Trashes r1-r3 +.balign 4 +gen_rand_sha_nonpres: + ldr r0,=SHA256_BASE + ldr r2,=rstate_sha + ldrb r1,[r2] @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers) + subs r3,r1,#4 @ decrement it to previous SUM register + ble 1f @ if the offset was 4 or less we have run out of SUM register values + ldr r0,[r0,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 + strb r3,[r2] @ save updated SUM register 
offset in bottom byte of rstate_sha[] + bx r14 +1: +@ [CK_JITTER code was here] + movs r3,#SHA256_SUM6_OFFSET+1 + strb r3,[r2] @ reset word counter: the +1 is compensated for later + movw r1,#(1<>30, vpermB=Bptr[4]>>30, and +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 +.balign 4 +.thumb_func +ref_roundkey_shares_s: + mov r11,#15 @ there are 15 expanded keys +ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds + ldr r4,=rkey_s + loadlfsr + steplfsr @ r0=change in RKshareC + ldr r2,=RKshareCchange + str r0,[r2] + ldr r3,=RKshareC + ldr r5,[r3] + eors r5,r5,r0 + str r5,[r3] + @ r0=lfsr_state, r1=lfsr_const, r4=roundkey_ptr, r11=roundcounter + +ref_roundkey_shares_s_loop: + ldmia r4!,{r5-r8,r10} @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA + + ldr r12,[r4,#16] @ r12 = X_B=vperm+rotations of rkey shareB + mov r2,r12,lsr#30 @ r2 = vpermB + sub r9,r2,r10,lsr#30 @ r9 = vpermB - vpermA (|junk) + mov r2,r9,lsl#3 @ r2 = 8*(vpermB - vpermA) mod 32 + mov r12,r12,ror r2 + usub8 r12,r10,r12 @ r12 = rotsA - (rotsB ror r2) + + @ r2,r3,r10=workspace, r0=lfsr_state, r1=lfsr_const, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=rotsA, r11=roundcounter, r12=rotdiff + steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] + + ldr r3,=RKshareCchange + ldr r3,[r3] + movs r2,#0 + 
usub8 r10,r2,r10 + ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2 + ror r2,r3,r10; mov r10,r10,ror#8; eors r6,r6,r2 + ror r2,r3,r10; mov r10,r10,ror#8; eors r7,r7,r2 + ror r2,r3,r10; eors r8,r8,r2 + + subs r4,r4,#20 + stmia r4,{r5-r8} + adds r4,r4,#40 + subs r11,r11,#1 + + bne ref_roundkey_shares_s_loop + ldr r2,=rstate_lfsr @ restore rstate_lfsr + savelfsr @ Save lfsr_state + clear03 24 +ref_roundkey_shares_s_exit: + bx r14 + +.balign 4 +.thumb_func +@ Rotates roundkey vperms and RK_ROR rotations by random amounts +@ Trashes r0-r10 +@ If i = word number 0..3, +@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then +@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 +ref_roundkey_hvperms_s: + movs r7,#30 +ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares + GET_CANARY r10,CTAG9,6 + push {r10,r14} + ldr r10,=rkey_s +ref_roundkey_hvperms_s_loop: + bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations + ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations + str r0,[r10,#16] + mov r8,r0,lsr#30 @ r8=new vperm low + sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk + mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32 + mov r0,r0,ror r8 + usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations) + movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2] + adds r10,r10,#20 + subs r7,r7,#1 + bne ref_roundkey_hvperms_s_loop + clear03 28 
+ref_roundkey_hvperms_s_exit: @ label exit point to be able to specify to analysis code + pop {r10,r14} + CHK_CANARY r10,CTAG9,6 + bx r14 + +.else + +@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC +@ Trashes r0-r11 +.balign 4 +.thumb_func +ref_roundkey_shares_s: + mov r11,#15 @ there are 15 expanded keys +ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds + GET_CANARY r4,CTAG8,6 + push {r4,r14} + ldr r4,=rkey_s + loadlfsr + steplfsr @ r0=change in RKshareC + ldr r3,=RKshareC + ldr r5,[r3] + eors r5,r5,r0 + str r5,[r3] + mov r10,r0 +ref_roundkey_shares_s_loop: + ldmia r4!,{r5-r9} @ r5-r8 = rkey shareA with vperm r9 + + @ clear03: would need to do this with, say r2,r3,r12 (reloading r2 later) + + ldr r3,[r4,#16] @ rkey shareB has a vperm of r3>>30 + movs r3,r3,lsr#30 + sub r9,r3,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk) + @ r3,r12=workspace, r0=lfsr_state, r1=lfsr_const, r2=rstate_lfsr, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=RKshareCchange, r11=roundcounter + + steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2] + + subs r4,r4,#20 + stmia r4,{r5-r8} + adds r4,r4,#40 + subs r11,r11,#1 + + @ clear03: would need to do this with, say r3,r5-r8 + + bne ref_roundkey_shares_s_loop + savelfsr + clear03 24 +ref_roundkey_shares_s_exit: + pop {r4,r14} + CHK_CANARY r4,CTAG8,6 + bx r14 + +.balign 4 +.thumb_func +@ Rotates roundkey vperms by random amounts +@ Trashes r0-r9 
+ref_roundkey_hvperms_s: + movs r7,#30 +ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares + GET_CANARY r0,CTAG9,6 + push {r0,r14} + bl gen_rand_lfsr_nonpres + ldr r1,=rkey_s +ref_roundkey_hvperms_s_loop: + cmp r7,#15 + bne 2f +@ Get a new random r0 after using 15 x 2 bits of the original one +@ Note that the junk bits (2-31) in the vperms are not adjusted independently, but that's no big loss, +@ and the gain is only calling gen_rand_lfsr twice instead of 30 times. + push {r1}; bl gen_rand_lfsr_nonpres; pop {r1} + 2: + ldmia r1,{r2-r5,r9} @ roundkey share A/B=r2-r5, vperm=r9 (including junk bits) + mov r8,r9,lsr#30 @ r8=old vperm (low) + add r6,r9,r0 @ r6=new vperm (high) | new junk + str r6,[r1,#16] + rsb r6,r8,r6,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk bits + ands r6,r6,#3; str r2,[r1,r6,lsl#2]; adds r6,r6,#1 + ands r6,r6,#3; str r3,[r1,r6,lsl#2]; adds r6,r6,#1 + ands r6,r6,#3; str r4,[r1,r6,lsl#2]; adds r6,r6,#1 + ands r6,r6,#3; str r5,[r1,r6,lsl#2] + adds r1,r1,#20 + movs r0,r0,ror#2 + subs r7,r7,#1 + bne ref_roundkey_hvperms_s_loop + clear03 28 +ref_roundkey_hvperms_s_exit: @ label exit point to be able to specify to analysis code + pop {r0,r14} + CHK_CANARY r0,CTAG9,6 + bx r14 + +.endif + +.ltorg + +.if ST_VPERM +.balign 4 +.thumb_func +@ Cycle share registers r4-r7, r8-r11 (r4->r5->r6->r7->r4 etc.) by an additional amount +@ given in the bottom two bits of R0 and update the rotation recorded at statevperm. +@ On entry R1 must point to statevperm. +@ Trashes r0-r3,r12 +@ Maintains r4=rorig(4+(-!r1)%4), r5=rorig(4+(1-!r1)%4), ... +@ r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ... +@ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise. 
+addstatevperm: + ldr r2,[r1] + adds r2,r2,r0 + str r2,[r1] + + ldr r1,=shareA + ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1 + ldmia r1,{r4-r7} + + getchaffaddress r12 @ Overwrite temporary storage with random numbers + ldmia r12!,{r2,r3} + stmia r1!,{r2,r3} + ldmia r12!,{r2,r3} + stmia r1!,{r2,r3} + + ldr r1,=shareB + ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r9, [r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1 + ldmia r1,{r8-r11} + + getchaffaddress r0,16 @ Overwrite temporary storage with random numbers + ldmia r0!,{r2,r3} + stmia r1!,{r2,r3} + ldmia r0!,{r2,r3} + stmia r1!,{r2,r3} + +addstatevperm_exit: @ label exit point to be able to specify to analysis code + bx r14 +.endif + +@ Conjugate lut_a, lut_b with (state) shareC +@ I.e., EOR the input and output with shareC. +@ We need to pick one input for each share A and B, and one output for ONE of the shares A and B +@ Arbitrarily choosing a0, b1 and d0 +.balign 4 +conjshareC: +.if ST_SHAREC + ldr r1,=shareC + ldr r0,[r1] @ Get shareC as a word (all bytes the same) + ldr r1,=lut_a @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs... + ldr r2,[r1,#0x100] + eors r2,r2,r0,lsr#24 + str r2,[r1,#0x100] + movs r0,r0,lsr#16 + ldr r1,=lut_b @ ... (continued) Here we're EORing share C into a0, b1 and d0. + ldr r2,[r1,#0x100] + eors r2,r2,r0,lsl#8 + str r2,[r1,#0x100] +.endif + bx r14 + +.balign 4 +.thumb_func +shift_rows_s: +@ First "rotate" the two most-significant bytes of the state by two registers +@ Trashes r0-r3 +@ Slightly faster (but not shorter?) 
with ubfx/bfi + eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; + lsrs r0,r0,#16 + lsls r0,r0,#16 + eors r4,r4,r0 + eors r6,r6,r0 + eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta; + lsrs r0,r0,#16 + lsls r0,r0,#16 + eors r5,r5,r0 + eors r7,r7,r0 +@ next "rotate" the two odd-significance bytes of the state by one register + eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00; + ands r1,r1,#0xff00ff00 + eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; + ands r0,r0,#0xff00ff00 + eors r4,r4,r0 + eors r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; + ands r0,r0,#0xff00ff00 + eors r5,r5,r0 + eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; + ands r0,r0,#0xff00ff00 + eors r6,r6,r0 + eors r7,r7,r1 @ state[3]^=tb; +@ repeat for other share, conjugated by ror#16 + clear01 @ barrier + eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0x0000ffff; state[0]^=ta; state[2]^=ta; + lsls r0,r0,#16 + lsrs r0,r0,#16 + eors r8,r8,r0 + eors r10,r10,r0 + eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0x0000ffff; state[1]^=ta; state[3]^=ta; + lsls r0,r0,#16 + lsrs r0,r0,#16 + eors r9,r9,r0 + eors r11,r11,r0 + eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00; + ands r1,r1,#0xff00ff00 + eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; + ands r0,r0,#0xff00ff00 + eors r8,r8,r0 + eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; + ands r0,r0,#0xff00ff00 + eors r9,r9,r0 + eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; + ands r0,r0,#0xff00ff00 + eors r10,r10,r0 + + eors r11,r11,r1 @ state[3]^=tb; + + clear01 @ barrier + bx r14 + +@ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1 +@ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b +.macro mixcol rx,rt,ru,r0x00,r0x1b + @ let rx=(a,b,c,d) + uadd8 \rt,\rx,\rx @ MSB of each byte into the GE flags + sel 
\ru,\r0x1b,\r0x00 @ get bytewise correction for bytewise field multiplication by 2 + eors \rt,\rt,\ru @ (2a,2b,2c,2d) + + eors \ru,\rt,\rx @ (3a,3b,3c,3d) + eors \rt,\rt,\rx,ror#24 @ (2a+b,2b+c,2c+d,2d+a) + eors \rt,\rt,\rx,ror#16 @ (2a+b+c,2b+c+d,2c+d+a,2d+a+b) + eors \rx,\rt,\ru,ror#8 @ (2a+b+c+3d,2b+c+d+3a,2c+d+a+3b,2d+a+b+3c) +.endm + +@ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1 +.macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b + uadd8 \rt,\rx,\rx @ field multiplication by 2 as above + sel \rw,\r0x1b,\r0x00 + eors \rt,\rt,\rw @ 2x + uadd8 \ru,\rt,\rt + sel \rw,\r0x1b,\r0x00 + eors \ru,\ru,\rw @ 4x + uadd8 \rv,\ru,\ru + sel \rw,\r0x1b,\r0x00 + eors \rv,\rv,\rw @ 8x + + eors \rx,\rx,\rv @ 9x + eors \rw,\rx,\rt @ 11x + eors \rw,\rw,\rx,ror#16 @ 11x ^ 9x ROL #16 + eors \rx,\rx,\ru @ 13x + eors \rw,\rw,\rx,ror#8 @ 11x ^ 9x ROL #16 ^ 13x ROL #24 + eors \rt,\rt,\ru @ 6x + eors \rt,\rt,\rv @ 14x + eors \rx,\rt,\rw,ror#8 @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24 +.endm + +.balign 4 +.thumb_func +@ Trashes r0-r3,r12 +mix_cols_s: + mov r2,#0x00000000 + mov r3,#0x1b1b1b1b + mixcol r4 ,r0,r1,r2,r3 @ apply mixcol to each state word + mixcol r5 ,r0,r1,r2,r3 + mixcol r6 ,r0,r1,r2,r3 + mixcol r7 ,r0,r1,r2,r3 + ldr r12,=chaff + ldmia r12!,{r0,r1} @ overwrite sensitive shareA-related quantities r0,r1 with random numbers + mixcol r8 ,r0,r1,r2,r3 + mixcol r9 ,r0,r1,r2,r3 + mixcol r10,r0,r1,r2,r3 + mixcol r11,r0,r1,r2,r3 + ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers + bx r14 + +@ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) +.macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 + ubfx \Rspare0,\Rtarg,#0, #8 + ubfx \Rspare1,\Rtarg,#8, #8 + ubfx \Rspare2,\Rtarg,#16, #8 + ubfx \Rspare3,\Rtarg,#24, #8 + + ldrb \Rspare0,[\Rtable,\Rspare0] + ldrb \Rspare1,[\Rtable,\Rspare1] + ldrb \Rspare2,[\Rtable,\Rspare2] + 
ldrb \Rspare3,[\Rtable,\Rspare3] + orr \Rspare0,\Rspare0,\Rspare1,lsl#8 + orr \Rspare2,\Rspare2,\Rspare3,lsl#8 + orr \Rtarg,\Rspare0,\Rspare2,lsl#16 +.endm + +@ map all bytes of the state through the split LUT, lut_a and lut_b +@ Trashes r0-r3,r12 +.balign 4 +.thumb_func +map_sbox_s: + GET_CANARY r12,CTAG12,3 + push {r12,r14} + + ldr r0,=shareA @ Write out state share A to memory +@ stmia r0,{r4-r7} @ Used to do a STM + getchaffaddress r1 + ldr r2,[r1] + str r4,[r0] @ Intersperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms, + str r2,[r1] @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired + str r5,[r0,#4] @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic. + str r2,[r1] @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1. + str r6,[r0,#8] @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but + str r2,[r1] @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic. + str r7,[r0,#12] + str r2,[r1] + + ldr r0,=shareB @ Write out state share B to memory + stmia r0,{r8-r11} @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with + + bl makeperm16 @ Rebuild random 16-way permutation. 
Maybe do this less frequently +@ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation + + bl gen_rand_sha_nonpres + mov r11,r0 + ldr r8,=lut_a + ldr r9,=lut_b + ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) + eors r3,r0,r0,lsr#8 @ R3 = a0^a1 | junk + uxtb r10,r3 + ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) + eors r1,r0,r1 + eors r2,r1,r1,lsr#8 + movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 + bfi r12,r2,#16,#8 @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 + + ldr r4,=perm16 + ldr r5,=shareA + ldr r6,=shareB + movs r1,#0;movs r2,#0;movs r3,#0 +@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 + movs r0,#15 +1: @ (Ordering instructions to minimise result delays) + ldrb r1,[r4,r0] @ r1 = perm[r0] + mov r11,r11,ror#11 @ Rotate random 32 bits to present a new low 8 bits + eors r7,r1,#2 @ r7 = perm[r0]^2 + ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]] + eor r11,r11,r2,ror#8 @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted) + ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2] + eor r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 + eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2] + ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]] + eor r2,r2,r12,lsr#16 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] + eor r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) + eor r3,r3,r11 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8) + strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand + ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]] + subs r0,r0,#1 + eor r3,r3,r11 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand + eor r3,r3,r12,lsr#8 @ r3 = 
lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8) + strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 + bpl 1b + clear03 8 @ barrier + + ldmia r6,{r8-r11} @ Read state share B back from memory + clear03 12 @ barrier + getchaffaddress r0,16 + bfi r0,r5,#0,#4 @ match chaff pointer (r0) to share A location (R5) mod 16 + @ldmia r5,{r4-r7} @ Read state share A back from memory + @clear03 16 @ barrier + ldr r4,[r5] @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s + ldr r1,[r0] + ldr r6,[r5,#8] + ldr r1,[r0,#8] + ldr r7,[r5,#12] + ldr r1,[r0,#12] + ldr r5,[r5,#4] @ Do r5 last because it's the address register + ldr r1,[r0,#4] + +@ Refresh state shares because luts only give imperfect share-by-value +@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox operation (though the R11 bytes are not fully independent) +@ loadlfsr +@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc +@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16 +@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16 +@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16 +@ savelfsr + + pop {r12,r14} + CHK_CANARY r12,CTAG12,5 + bx r14 + +.ltorg + +.balign 4 +.thumb_func +randomisechaff: +@ Randomise 48 bytes of chaff values (random load values) +@ Uses 12 bytes of permscratch +@ Trashes r0-3 + GET_CANARY r0,CTAG13,6 + push {r0,r14} + movs r0,#12 + ldr r1,=permscratch + bl makesmallperm @ Store the random words in a random order to make 2nd order attacks harder + movs r1,#11 +1: + push {r1} + bl gen_rand_sha_nonpres + pop {r1} + ldr r2,=permscratch + ldrb r2,[r2,r1] + getchaffaddress r3 + str r0,[r3,r2,lsl#2] + subs r1,r1,#1 + bpl 1b + pop {r0,r14} + CHK_CANARY r0,CTAG13,6 + bx r14 + +.balign 4 +refreshchaff_and_lfsr: +@ Update 48 
bytes of chaff values (random load values) using faster RNG than used for randomisechaff +@ Re-randomise LFSR with SHA +@ Uses 12 bytes of permscratch +@ Trashes r0-3,12 + GET_CANARY r0,CTAG14,6 + push {r0,r14} + +@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence + bl gen_rand_sha_nonpres + ldr r1,=rstate_lfsr + ldr r2,[r1] + adds r2,r2,r0 + beq 1f @ Don't update LFSR state to 0 + str r2,[r1] +1: + +@ Choose a random order to update chaff words to make 2nd order attacks harder + movs r0,#12 + ldr r1,=permscratch + bl makesmallperm + + movs r1,#11 +1: + push {r1} + bl gen_rand_lfsr_nonpres + pop {r1} + ldr r2,=permscratch + ldr r3,=chaff + ldrb r2,[r2,r1] + ldr r12,[r3,r2,lsl#2] + add r0,r0,r12 + str r0,[r3,r2,lsl#2] + subs r1,r1,#1 + bpl 1b + pop {r0,r14} + CHK_CANARY r0,CTAG14,6 + bx r14 + +.balign 4 +.thumb_func +@ Do sbox on the four bytes of the 4-way share r4-r7 +@ Trashes r0,r8-r12 +init_key_sbox: + GET_CANARY r12,CTAG15,6 + push {r1-r3,r12,r14} + bl gen_rand_sha_nonpres; mov r8,r0 + bl gen_rand_sha_nonpres; mov r9,r0 + bl gen_rand_sha_nonpres; mov r10,r0 + bl gen_rand_sha_nonpres; mov r11,r0 + ldr r0,=fourway @ Write out 4-way share to memory + stmia r0,{r8-r11} @ Save random values first to obscure saving of state + stmia r0,{r4-r7} + movs r4,#0 @ Clear r4-r7 so that they don't interact with makesmallperm + movs r5,#0 + movs r6,#0 + movs r7,#0 + + bl randomisechaff @ Randomise block of memory mainly used for obscuring loads + + movs r0,#4 + ldr r1,=permscratch + bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed + ldr r1,=permscratch @ Write out random addresses in advance to save two registers (reusing permscratch) + ldr r4,[r1] + ldr r0,=fourway + uxtab r5,r0,r4 + uxtab r6,r0,r4,ror#8 + uxtab r7,r0,r4,ror#16 + uxtab r8,r0,r4,ror#24 + stmia r1,{r5-r8} @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] + + bl gen_rand_sha @ Save some randomness for 
the resharing operation later + movs r7,r0 + bl gen_rand_sha + movs r8,r0 + + ldr r2,=lut_a + ldr r3,=lut_b + ldr r0,[r2,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) + eors r10,r0,r0,lsr#8 + uxtb r10,r10 @ R10 = a0^a1 + ldr r1,[r3,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) + eors r1,r0,r1 + eors r4,r1,r1,lsr#8 + uxtb r11,r4 @ R11 = a0^a1^b0^b1 + eor r10,r10,r11,lsl#8 @ R10 = a0^a1 | (a0^a1^b0^b1)<<8 + movs r12,r1,ror#16 @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24 + + ldr r1,=permscratch + ldr r11,=chaff +@ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk +1: + ands r5,r1,#12 + adds r5,r11,r5 @ Align chaff address to r1 + ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) + ldr r5,[r5] @ Random load to mask previous load + + ands r9,r6,#12 + add r9,r11,r9 @ r9 = chaff address aligned to (r6 bic 3) mod 16 + ldrb r4,[r6,#0] + ldr r14,[r9,#0] @ Random load to mask previous load + eor r4,r4,r10 + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ldrb r5,[r6,#4] + ldr r14,[r9,#4] @ Random load to mask previous load + eors r4,r4,r5 + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ldrb r5,[r6,#8] + ldr r14,[r9,#8] @ Random load to mask previous load + eors r4,r4,r5 + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ldrb r5,[r6,#12] + ldr r14,[r9,#12] @ Random load to mask previous load + eors r4,r4,r5 @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ands r14,r4,#255 + ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] + and r14,r4,#15 + add r14,r14,#32 + ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) + eors r5,r5,r12 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24 +@ split r5 into two shares and store at [r6,#0] and [r6,#4] + strb r7,[r6,#0] + eors r5,r5,r7 + strb r5,[r6,#4] + + mov 
r5,r10,lsr#8 @ r5=a0^a1^b0^b1 + ldr r14,[r11,#44] @ Need to eor into a random destination register + eors r14,r4,r5 @ r14 = unsharedbyte[perm[i]]^b0^b1 | junk<<8 + and r14,r14,#255 + + ldrb r5,[r3,r14] @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1] + and r14,r14,#15 + add r4,r11,#24 + ldrb r14,[r4,r14] @ Random load to mask previous load (r3==8 and r11==0 mod 16) + eor r5,r5,r12,ror#8 @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24 +@ split r5 into two shares and store at [r6,#8] and [r6,#12] + strb r8,[r6,#8] + eors r5,r5,r8 + strb r5,[r6,#12] + + movs r7,r7,ror#8 + movs r8,r8,ror#8 + + tst r1,#12 @ This does 4 loop iterations because permscratch is guaranteed to be 0 mod 16 + bne 1b + + ldr r0,=fourway + ldmia r0,{r4-r7} @ Load SBOXed values back into register r4-r7 + ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers + + pop {r1-r3,r12,r14} + CHK_CANARY r12,CTAG15,6 + bx r14 + +.balign 4 +.thumb_func +@ r1 = pointer to 4 x 4-way share (16 words); left unchanged +@ r3 = rkey_s+40*roundkeynumber; advanced by 40 +@ Trashes r8-r12 +@ If i = word number 0..3, +@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then +@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16 +storeroundkey: + GET_CANARY r8,CTAG16,6 + push {r2,r8,r14} + +@ eor two 4-way share components to make a component of a 2-way share +@ Note that we load from 4-way share at a random address then convert to 2-way share and +@ store at a fixed address, rather than the other way around, so that 2-way shares are obscured +@ by vperm (we don't know which 2-way share is being processed at a particular point in time). 
+@ And (if RK_ROR) we rotate first before EORing down to 2-way, so there is never an unrotated 2-way share + + bl gen_rand_sha @ Get r0 = vperm for shareA of the round key + str r0,[r3,#16] + mov r8,r0,lsr#30 + rsb r8,r8,#0 @ r8=-vperm +.if RK_ROR + movs r2,#0 + usub8 r2,r2,r0 @ r2=-hperms +.endif + mov r9,#4 +1: + and r8,r8,#3 + adds r0,r1,r8,lsl#4 + + ldmia r0,{r10,r11} +.if RK_ROR + mov r10,r10,ror r2 + mov r11,r11,ror r2 + movs r2,r2,ror#8 +.endif + eor r10,r10,r11 + str r10,[r3],#4 + add r8,r8,#1 + subs r9,r9,#1 + bne 1b + + adds r1,r1,#8 + adds r3,r3,#4 @ skip over vperm (already stored) + + bl gen_rand_sha @ Get r0 = vperm for shareB of the round key + str r0,[r3,#16] + mov r8,r0,lsr#30 + rsb r8,r8,#0 @ r8=-vperm +.if RK_ROR + movs r2,#0 + usub8 r2,r2,r0 @ r2=-hperms +.endif + mov r9,#4 + ldr r12,=RKshareC + ldr r12,[r12] +1: + and r8,r8,#3 + adds r0,r1,r8,lsl#4 + ldmia r0,{r10,r11} + eor r10,r10,r12 @ Mix in RKshareC into round key shareB +.if RK_ROR + mov r10,r10,ror r2 + mov r11,r11,ror r2 + movs r2,r2,ror#8 +.endif + mov r10,r10,ror#16 + mov r11,r11,ror#16 + eor r10,r10,r11 + str r10,[r3],#4 + add r8,r8,#1 + subs r9,r9,#1 + bne 1b + + subs r1,r1,#8 @ Restore r1 = (r1 on entry) + adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 + + pop {r2,r8,r14} + CHK_CANARY r8,CTAG16,6 + bx r14 + +.balign 4 +.thumb_func +init_key_4way: +@ On entry, r0 points to 4-way shared raw key data (64 bytes, 64 byte gap for FIB workaround, then other 64 bytes) +@ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7 +@ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K. +@ +@ On exit, rkeys_s, a 40*15=600-byte region, is filled as follows. +@ Each of the 15 round keys is represented as two 5-word regions rka[0..4] and rkb[0..4], +@ each of which consists of 4 words of round key followed by a word encoding vperm and rotation (RK_ROR) information. +@ In addition a common share word, RKshareC, is set randomly. 
+@ For a given round, rk[i] = the i^th word of the actual round key is given by: +@ vpermA=rka[4]>>30 +@ vpermB=rkb[4]>>30 +@ rka_unrot[i] = rka[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of rka[4]) +@ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16 +@ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC + + GET_CANARY r12,CTAG17,6 + push {r0-r12,r14} + +@ Transfer 4-way key into local workspace, rerandomising the shares + mov r5,r0 @ r5=4-way key input + bl randomisechaff + ldr r6,=rkey4way + movs r7,#8 + b 1f +2: + adds r5,#64 @ Skip 64 byte gap for FIB workaround + subs r7,r7,#1 +1: + ldmia r5!,{r1-r4} + bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0 + bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0 + bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0 + stmia r6!,{r1-r4} + cmp r7,#5 + beq 2b + subs r7,r7,#1 + bne 1b + +@ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for +@ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys. + bl gen_rand_sha_nonpres + ldr r12,=RKshareC + str r0,[r12] @ Make RKshareC random word + ldr r3,=rkey_s @ r3=rkey_s + ldr r1,=rkey4way @ r1=rkey4way + bl storeroundkey @ Store round key 0 and advance r3 by 40 + adds r1,r1,#64 + bl storeroundkey @ Store round key 1 and advance r3 by 40 + adds r1,r1,#48 + ldmia r1!,{r4-r7} @ r4-r7 = 4-way share of previous round key word + @ r1=rkey4way+128 on entry to main loop + movs r2,#0 @ r2=word counter (0-51), offset from word 8 + +@ Note that r1-r3 are not sensitive values, so it's safe to stack +@ them and conditionally branch on them. + +@ rkey4way = 8 x 4 consecutive 4-way share words as cyclic buffer of +@ Rounds 0,1 Rounds 2,3 Rounds 12,13 Round 14 +@ a0 b0 c0 d0 -> a8 b8 c8 d8 -> ... 
-> a48 b48 c48 d48 -> a56 b56 c56 d56 +@ a1 b1 c1 d1 -> a9 b9 c9 d9 a49 b49 c49 d49 a57 b57 c57 d57 +@ a2 b2 c2 d2 etc a50 b50 c50 d50 a58 b58 c58 d58 +@ a3 b3 c3 d3 a51 b51 c51 d51 a59 b59 c59 d59 +@ a4 b4 c4 d4 a52 b52 c52 d52 =============== +@ a5 b5 c5 d5 a53 b53 c53 d53 +@ a6 b6 c6 d6 a54 b54 c54 d54 +@ a7 b7 c7 d7 a55 b55 c55 d55 + +init_key_expandloop: +@ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8) +@ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words) +@ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4]) +@ r4-r7 = 4-way share of previous roundkey word + + tst r2,#7 + bne 1f + subs r1,r1,#128 @ Every 8th word, reset cyclic buffer pointer and do ROTWORD + movs r4,r4,ror#8 + movs r5,r5,ror#8 + movs r6,r6,ror#8 + movs r7,r7,ror#8 +1: + + tst r2,#3 + bne 1f + bl init_key_sbox @ Every 4th word, do SUBBYTES (sbox) on r4-r7 +1: + + tst r2,#7 + bne 1f + movs r0,r2,lsr#3 + mov r8,#1 + movs r8,r8,lsl r0 + eors r4,r4,r8 @ Every 8th word, add in round constant +1: + + ldmia r1,{r8-r11} @ eor with key from two rounds ago and advance r1 by 16 + eors r4,r4,r8 + eors r5,r5,r9 + eors r6,r6,r10 + eors r7,r7,r11 + stmia r1!,{r4-r7} + + add r2,r2,#1 + tst r2,#3 + bne 1f + subs r1,r1,#64 + bl storeroundkey @ Store round key 1+r2/4 and advance r3 by 40 + adds r1,r1,#64 +1: + + cmp r2,#52 + bne init_key_expandloop + + pop {r0-r12,r14} + CHK_CANARY r12,CTAG17,6 + bx r14 + +.ltorg + +@ Add the round key shares pointed to by r12 into the state shares +@ Trashes r0-r3 +.balign 4 +addrkey_s: + + ldr r0,=chaff @ guaranteed 0 mod 16 +.if ST_VPERM + ldr r3,=statevperm + ldr r3,[r3] @ r3=vperm state rotation in bottom two bits + ldr r2,[r0,#12] @ barrier load +.else + movs r3,#0 +.endif + bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 + ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits + ldr r2,[r0,#16] @ barrier load + + rsb 
r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot +@ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot +@ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr +.if RK_ROR + movs r0,r2,lsl#3 + movs r1,r1,ror r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; rors r0,r0,r1; eors r4,r4,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0 +.else + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r4,r4,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r5,r5,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r6,r6,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0 +.endif + clear03_preserve_r3 + add r12,r12,#20 + @ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr + + bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 + ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits + ldr r2,[r0,#16] @ barrier load + rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot + ldr r3,=RKshareC @ r3=common round key shareC + bfi r0,r3,#0,#4 + ldr r3,[r3] + ldr r0,[r0] @ barrier load + +@ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot +@ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr +.if RK_ROR + movs r0,r2,lsl#3 + movs r1,r1,ror r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; rors r0,r0,r1; eor r8,r8,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r9,r9,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors 
r0,r0,r1; eor r10,r10,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eor r11,r11,r0 +.else + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; eors r8,r8,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; eors r9,r9,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; eors r10,r10,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0 +.endif + clear03 + bx r14 + +.balign 4 +.thumb_func +@ de/encrypt data in place +@ r0,r1: pointers to IV share A and IV share B +@ r2: buf (cipher/plaintext buffer) +@ r3: n, number of blocks, n>0 +.if CT_BPERM +@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on the IV, +@ the key, and the block number. We can therefore process them in any order, and using a +@ random order helps to defeat attacks that work on the output of the AES, since an attacker +@ wouldn't know what plaintext or ciphertext corresponds to a particular instruction.
+.endif + +ctr_crypt_s: +@ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks + GET_CANARY r12,CTAG0,6 + push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets + + push {r0-r3} + + SET_COUNT 93,6 + +.if CT_BPERM +@ Initialise 32 random numbers (which fit in half-words) +@ r3=number of blocks + ldr r4,=bperm_rand + movs r5,#32 +1: + bl gen_rand_sha + umull r0,r2,r0,r3 @ Random number between 0 and n-1 (n=#blocks) + strh r2,[r4],#2 + subs r5,r5,#1 + bne 1b +.endif + + bl randomisechaff + +@ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0 +@ Not doing shareC or state vperm at this point + pop {r0} + ldmia r0,{r4-r7} @ r4-r7 = IVshareA + clear03 16 + pop {r1} + ldmia r1,{r8-r11} @ r8-r11 = IVshareB + clear03 32 + bl gen_rand_sha_nonpres; eors r4,r4,r0; movs r1,#0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc + bl gen_rand_sha_nonpres; eors r5,r5,r0; movs r1,#0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16 + bl gen_rand_sha_nonpres; eors r6,r6,r0; movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 + bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 + ldr r0,=IV0 + stmia r0,{r4-r7} + adds r0,r0,#20 + stmia r0,{r8-r11} +@ "Decommission" IV0 so that it doesn't get stacked + bl gen_rand_sha_nonpres; movs r4,r0 + bl gen_rand_sha_nonpres; movs r5,r0 + bl gen_rand_sha_nonpres; movs r6,r0 + bl gen_rand_sha_nonpres; movs r7,r0 + bl gen_rand_sha_nonpres; mov r8,r0 + bl gen_rand_sha_nonpres; mov r9,r0 + bl gen_rand_sha_nonpres; mov r10,r0 + bl gen_rand_sha_nonpres; mov r11,r0 + pop {r1,r2} +@ r1=cipher/plaintext buffer, r2=number of blocks + + movs r3,#0 + CHK_COUNT 93,6 + +ctr_crypt_mainloop: + SET_COUNT 80,6 +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter + +@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) + push 
{r1-r3} +@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret) + + tst r3,#(REFCHAFF_PERIOD-1) + bne 1f + bl refreshchaff_and_lfsr +1: + + ldr r3,[r13,#8] @ get block count off the stack + tst r3,#(REMAP_PERIOD-1) + bne 1f + bl remap @ shuffle the LUTs; this preserves R3 +1: + CHK_COUNT 80,6 + + tst r3,#(REFROUNDKEYSHARES_PERIOD-1) + bne 1f + bl ref_roundkey_shares_s @ refresh the round key shares +1: + + ldr r3,[r13,#8] @ get block count off the stack + tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1) + bne 1f + bl ref_roundkey_hvperms_s @ refresh the round key vperms +1: + + CHK_COUNT 81,6 + + pop {r1-r3} +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter + +@ Now calculate r12 = block number-to-be-deciphered from r3 = block counter +.if CT_BPERM +@ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7 + push {r1} + ldr r0,=murmur3_constants + ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants + ldr r0,=bperm_rand + movs r1,#31 + movs r4,r3 @ r4=i +1: + ldrh r5,[r0],#2 @ r5=k + subs r5,r5,r4 @ r5=k-i + ands r6,r2,r5,asr#31 @ r6=n*(k-i<0) + adds r5,r5,r6 @ r5=j=(k-i)%n + adds r6,r4,r5 @ r6=i+j + subs r7,r4,r5 @ r7=i-j + and r8,r7,r7,asr#31 @ r8=min(i-j,0) + sub r7,r7,r8,lsl#1 @ r7=|i-j| + mla r6,r6,r2,r7 @ r6=n(i+j)+|i-j|, encodes the unordered pair {i,j} + eors r6,r6,r1,lsl#27 @ mix with swap-or-not round counter to get different hash functions +@ Now do murmur3_32 hash of r6 + mul r6,r6,r9 + movs r6,r6,ror#17 + mul r6,r6,r10 + movs r6,r6,ror#19 + adds r6,r6,r6,lsl#2 + add r6,r6,r11 + eors r6,r6,#4 + eors r6,r6,r6,lsr#16 + mul r6,r6,r12 + eors r6,r6,r6,lsr#13 + mul r6,r6,r14 + eors r6,r6,r6,lsr#16 @ not actually used here +@ Now set i to j, conditional on the top bit of r6 + subs r7,r5,r4 @ r7=j-i + ands r7,r7,r6,asr#31 @ r7=(j-i)*(top bit of r6) + adds r4,r4,r7 @ r4=j if top bit of r6, else i + subs r1,r1,#1 + bpl 1b + pop {r1} + mov r12,r4 +.else + mov 
r12,r3 +.endif + CHK_COUNT 82,6 + +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) + push {r1-r3,r12} +@ r4-r11 = IV0, r12=block number + +processIV: @ non-target label to assist power analysis + ldr r8,=IV0 + ldmia r8,{r4-r7} @ load IV0_A + clear03 16 + add r8,r8,#20 + ldmia r8,{r8-r11} @ load IV0_B + clear03 32 + rev r0,r12 + eor r7,r7,r0 @ XOR in block number to IV0. IV(block n) = IV0 ^ n, cf standard CTR mode IV0 + n. + @ XOR (vs addition) is compatible with XOR-shares, so stealthier/simpler because don't have to unshare to work out IV(block n) +@ r4-r11 = IV for the current block + CHK_COUNT 83,6 +.if ST_SHAREC + bl gen_rand_sha_nonpres @ Create state share C; all bytes the same + ands r0,r0,#255 + orrs r0,r0,r0,lsl#8 + orrs r12,r0,r0,lsl#16 + ldr r1,=shareC + str r12,[r1] +.else + movs r12,#0 +.endif +@ r4-r11 = IV for the current block w/o shareC, r12=shareC +@ refresh state shares and mix in shareC + bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc + bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16 + bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16 + bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16 +.if ST_VPERM + bl gen_rand_sha_nonpres + ldr r1,=statevperm + movs r2,#0 + str r2,[r1] + bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG) +.endif + + CHK_COUNT 84,6 + bl conjshareC @ Add the effect of shareC to lut_a, lut_b + CHK_COUNT 85,6 +@ now perform the 15 encryption rounds on (key, state=IV+x) +@ here r4-r7, r8-r11: state + mov r2,#0 @ round counter +rounds_s_mainloop: + ldr r12,=rkey_s + add r12,r12,r2,lsl#5 @ pointer to key shares for this round + add r12,r12,r2,lsl#3 + push {r2} @ save round count + bl addrkey_s 
+ bl map_sbox_s + bl shift_rows_s +.if ST_VPERM + ldr r2,[r13] @ peek at stack to get round count + cmp r2,#NUMREFSTATEVPERM + bcs 1f + bl gen_rand_lfsr_nonpres + ldr r1,=statevperm + bl addstatevperm @ V shuffle of r4-r11 +1: +.endif + pop {r2} + adds r2,r2,#1 @ increment round counter + cmp r2,#14 + beq 2f @ break from loop? (last round has no mix_cols) + push {r2} + bl mix_cols_s + pop {r2} + b rounds_s_mainloop +2: + CHK_COUNT 86,6 + ldr r12,=rkey_s+14*40 @ final round key shares + bl addrkey_s + CHK_COUNT 87,6 + bl conjshareC @ Undo the effect of shareC from lut_a, lut_b + CHK_COUNT 88,6 +.if ST_VPERM +@ Undo the effects of vperm rotation recorded in statevperm + ldr r1,=statevperm + ldr r2,[r1] + rsbs r0,r2,#0 + bl addstatevperm +.endif + + pop {r1-r3,r12} + push {r3} +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered + +decryption_start: +@ Decrypt ciphertext using AES output in shares: r4-r11 +.if ST_SHAREC + ldr r0,=shareC + ldr r0,[r0] +.else + movs r0,#0 +.endif + ldr r14,=chaff +@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff + CHK_COUNT 89,6 + add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered + ldr r3,[r1] @ r3=ciphertext word + eors r3,r3,r4 @ r3=r3^shareA + ldr r4,[r14] @ barrier load + eor r3,r3,r8,ror#16 @ r3=r3^shareB + eors r3,r3,r0 @ r3=r3^shareC + str r3,[r1] @ plaintext word=r3 + ldr r3,[r1,#4] @ and similarly for words 1,2,3 of block... 
+ ldr r4,[r14,#4] + eors r3,r3,r5 + eor r3,r3,r9,ror#16 + eors r3,r3,r0 + str r3,[r1,#4] + ldr r3,[r1,#8] + ldr r4,[r14,#8] + eors r3,r3,r6 + eor r3,r3,r10,ror#16 + eors r3,r3,r0 + str r3,[r1,#8] + ldr r3,[r1,#12] + ldr r4,[r14,#12] + eors r3,r3,r7 + eor r3,r3,r11,ror#16 + eors r3,r3,r0 + str r3,[r1,#12] + + sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer + CHK_COUNT 90,6 + + pop {r3} @ Restore block counter +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter +decryption_end: + + adds r3,r3,#1 + cmp r3,r2 + CHK_COUNT 91,6 + bne ctr_crypt_mainloop + +#if WIPE_MEMORY +@ Wipe memory from workspace_start up to the stack pointer +@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals + ldr r4,=workspace_start + ldr r5,=rstate_all_start +1: + bl gen_rand_sha_nonpres + stmia r4!,{r0} + cmp r4,r5 + bcc 1b + ldr r4,=rstate_all_end + mov r5,r13 @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register +1: + bl gen_rand_sha_nonpres + stmia r4!,{r0} + cmp r4,r5 + bcc 1b + +@ Then fill everything with zeros so as not to leave behind clues about the RNG state + ldr r4,=workspace_start + movs r0,#0 + mov r5,r13 +1: + stmia r4!,{r0} + cmp r4,r5 + bcc 1b +#endif + +.if GEN_RAND_SHA + SET_COUNT 23,6 + bl reset_sha_trng @ clear out the SHA hardware +.endif + pop {r0-r12,r14} + CHK_CANARY r12,CTAG0,6 + bx r14 diff --git a/enc_bootloader/config.h b/enc_bootloader/config.h new file mode 100644 index 00000000..1573fbff --- /dev/null +++ b/enc_bootloader/config.h @@ -0,0 +1,94 @@ +#pragma once + +// These options (up to long /////////////// line) should be enabled because the security risk of not using them is too high +// or because the time cost is very low so you may as well have them. +// They can be set to 0 for analysis or testing purposes. 
+ +#ifndef GEN_RAND_SHA +#define GEN_RAND_SHA 1 // use SHA256 hardware to generate some random numbers +#endif + // Some RNG calls are hard coded to LFSR RNG, others to SHA RNG + // Setting GEN_RAND_SHA to 0 has the effect of redirecting the latter to LFSR RNG +#ifndef ST_SHAREC +#define ST_SHAREC 1 // This creates a partial extra share at almost no extra cost +#endif +#ifndef ST_VPERM +#define ST_VPERM 1 // insert random vertical permutations in state during de/encryption? +#endif +#ifndef CT_BPERM +#define CT_BPERM 1 // process blocks in a random order in counter mode? +#endif +#ifndef RK_ROR +#define RK_ROR 1 // store round key shares with random rotations within each word +#endif + +#ifndef WIPE_MEMORY +#define WIPE_MEMORY 1 // Wipe memory after decryption +#endif + +// The following options should be enabled to increase resistance to glitching attacks. + +#ifndef RC_CANARY +#define RC_CANARY 1 // use rcp_canary feature +#endif +#ifndef RC_COUNT +#define RC_COUNT 1 // use rcp_count feature +#endif + +// Although jitter/timing-variation may be circumventable in theory, in practice +// randomising the timing of operations can make side-channel attacks very much more +// effort to carry out. These can be disabled for analysis or testing purposes. +// It is advisable to use at least one form of jitter. + +// RC_JITTER is quite slow, and is probably the most predictable of the three, so it is disabled by default. +// (Leaving it as an option because it's just possible that the large delays it produces are advantageous in defeating certain side-channel attacks.) +#ifndef RC_JITTER +#define RC_JITTER 0 // 0-7. Higher = more jitter. Governs use of random-delay versions of RCP instructions.
+#endif + +#ifndef SH_JITTER +#define SH_JITTER 1 // Insert random delays, tagged onto SHA RNG +#endif + +#ifndef CK_JITTER +#define CK_JITTER 1 // Use the ROSC clock to make ARM timings unpredictable +#endif + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// The following options can be adjusted, affecting the performance/security tradeoff + +// Period = X means that the operation in question occurs every X blocks, so higher = more performance and lower security. +// No point in making them more than 16 or so, since the time taken by the subroutines would be negligible. +// These must be a power of 2. Timings as of commit 82d31652 +// +// Baseline time per 16-byte block = 14109 (with no jitter) cycles +#ifndef REFCHAFF_PERIOD +#define REFCHAFF_PERIOD 1 // Extra cost per 16-byte block = 474/REFCHAFF_PERIOD cycles +#endif +#ifndef REMAP_PERIOD +#define REMAP_PERIOD 4 // Extra cost per 16-byte block = 4148/REMAP_PERIOD cycles +#endif +#ifndef REFROUNDKEYSHARES_PERIOD +#define REFROUNDKEYSHARES_PERIOD 1 // Extra cost per 16-byte block = 1304/REFROUNDKEYSHARES_PERIOD cycles +#endif +#ifndef REFROUNDKEYHVPERMS_PERIOD +#define REFROUNDKEYHVPERMS_PERIOD 1 // Extra cost per 16-byte block = 1486/REFROUNDKEYHVPERMS_PERIOD cycles +#endif + +// Setting NUMREFSTATEVPERM to X means that state vperm refreshing happens on the first X AES rounds only, +// so lower = more performance and lower security. +// The rationale for doing it this way is that later rounds should be protected by CT_BPERM. +// NUMREFSTATEVPERM can be from 0 to 14.
+#ifndef NUMREFSTATEVPERM +#define NUMREFSTATEVPERM 7 // Extra cost per 16-byte block = 61*NUMREFSTATEVPERM cycles +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define MAX_NUM_BLOCKS 32768 + +#if SH_JITTER && !GEN_RAND_SHA +#error GEN_RAND_SHA must be set if you want to use SH_JITTER +#endif diff --git a/enc_bootloader/enc_bootloader.c b/enc_bootloader/enc_bootloader.c new file mode 100644 index 00000000..60b0f427 --- /dev/null +++ b/enc_bootloader/enc_bootloader.c @@ -0,0 +1,194 @@ +/** + * Copyright (c) 2023 Raspberry Pi (Trading) Ltd. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include +#include "pico/stdlib.h" +#include "boot/picobin.h" +#include "boot/picoboot.h" +#include "pico/bootrom.h" +#include "hardware/structs/otp.h" + +#include "hardware/structs/trng.h" +#include "hardware/structs/sha256.h" + +#include "pico/binary_info.h" + +#include "hardware/clocks.h" +#include "hardware/xosc.h" +#include "hardware/structs/rosc.h" + +#include "config.h" + +extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk); + +// These just have to be higher than the actual frequency, to prevent overclocking unused peripherals +#define ROSC_HZ 300*MHZ +#define OTHER_CLK_DIV 30 + +// Enable calibration of the ROSC using the XOSC - if disabled it runs with the fixed ~110MHz ROSC configuration +#define XOSC_CALIBRATION 0 + +#if CK_JITTER +void runtime_init_clocks(void) { + // Disable resus that may be enabled from previous software + clocks_hw->resus.ctrl = 0; + + bi_decl(bi_ptr_int32(0, 0, rosc_div, 2)); // default divider 2 + bi_decl(bi_ptr_int32(0, 0, rosc_drive, 0x7777)); // default drives of 0b111 (0x7) + + // Bump up ROSC speed to ~110MHz + rosc_hw->freqa = 0; // reset the drive strengths + rosc_hw->div = rosc_div | ROSC_DIV_VALUE_PASS; // set divider + // Increment the freqency range one step at a time - this is safe provided the 
current config is not TOOHIGH + // because ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH + static_assert(ROSC_CTRL_FREQ_RANGE_VALUE_LOW | ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM == ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); + static_assert(ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); + hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); + hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); + + // Enable rosc randomisation + rosc_hw->freqa = (ROSC_FREQA_PASSWD_VALUE_PASS << ROSC_FREQA_PASSWD_LSB) | + rosc_drive | ROSC_FREQA_DS1_RANDOM_BITS | ROSC_FREQA_DS0_RANDOM_BITS; // enable randomisation + + // Not used with FREQ_RANGE_VALUE_HIGH, but should still be set to the maximum drive + rosc_hw->freqb = (ROSC_FREQB_PASSWD_VALUE_PASS << ROSC_FREQB_PASSWD_LSB) | + ROSC_FREQB_DS7_LSB | ROSC_FREQB_DS6_LSB | ROSC_FREQB_DS5_LSB | ROSC_FREQB_DS4_LSB; + +#if XOSC_CALIBRATION + // Calibrate ROSC frequency if XOSC present - otherwise just configure + bi_decl(bi_ptr_int32(0, 0, xosc_hz, 12000000)); // xosc freq in Hz + bi_decl(bi_ptr_int32(0, 0, clk_khz, 150 * KHZ)); // maximum clk_sys freq in KHz + if (xosc_hz) { + xosc_init(); + // Switch away from ROSC to avoid overclocking + // CLK_REF = XOSC + clock_configure_int_divider(clk_ref, + CLOCKS_CLK_REF_CTRL_SRC_VALUE_XOSC_CLKSRC, + 0, + xosc_hz, + 1); + // CLK_SYS = CLK_REF + clock_configure_int_divider(clk_sys, + CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLK_REF, + CLOCKS_CLK_SYS_CTRL_AUXSRC_VALUE_ROSC_CLKSRC, // leave the aux source on ROSC to prevent glitches when we switch back to it + xosc_hz, + 1); + + // Go through configurations until you get below 150MHz + uint8_t div = 1; + uint32_t drives = 0x7777; + while (div < 4) { + rosc_hw->div = div | ROSC_DIV_VALUE_PASS; + rosc_hw->freqa = (ROSC_FREQA_PASSWD_VALUE_PASS << ROSC_FREQA_PASSWD_LSB) | + drives | + ROSC_FREQA_DS1_RANDOM_BITS | ROSC_FREQA_DS0_RANDOM_BITS; // 
enable randomisation + + // Wait for ROSC to be stable + while(!(rosc_hw->status & ROSC_STATUS_STABLE_BITS)) { + tight_loop_contents(); + } + + if (frequency_count_khz(CLOCKS_FC0_SRC_VALUE_ROSC_CLKSRC_PH) < clk_khz) { + break; + } + + if (!drives) div++; + drives = drives ? 0x0000 : 0x7777; + } + } +#endif // XOSC_CALIBRATION + // CLK SYS = ROSC directly, as it's running slowly enough + clock_configure_int_divider(clk_sys, + CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX, + CLOCKS_CLK_SYS_CTRL_AUXSRC_VALUE_ROSC_CLKSRC, + ROSC_HZ, // this doesn't have to be accurate + 1); + + // Configure other clocks - none of these need to be accurate + + // CLK_REF = ROSC / OTHER_CLK_DIV - this and other clocks aren't really used, so just need to be set to a low enough frequency + clock_configure_int_divider(clk_ref, + CLOCKS_CLK_REF_CTRL_SRC_VALUE_ROSC_CLKSRC_PH, + 0, + ROSC_HZ, + OTHER_CLK_DIV); + + // CLK USB (not used) + clock_configure_int_divider(clk_usb, + 0, // No GLMUX + CLOCKS_CLK_USB_CTRL_AUXSRC_VALUE_ROSC_CLKSRC_PH, + ROSC_HZ, + OTHER_CLK_DIV); + + // CLK ADC (not used) + clock_configure_int_divider(clk_adc, + 0, // No GLMUX + CLOCKS_CLK_ADC_CTRL_AUXSRC_VALUE_ROSC_CLKSRC_PH, + ROSC_HZ, + OTHER_CLK_DIV); + + // CLK PERI Used as reference clock for UART and SPI serial. (not used) + clock_configure_int_divider(clk_peri, + 0, + CLOCKS_CLK_PERI_CTRL_AUXSRC_VALUE_CLK_SYS, + ROSC_HZ, + OTHER_CLK_DIV); + + // CLK_HSTX Transmit bit clock for the HSTX peripheral. (not used) + clock_configure_int_divider(clk_hstx, + 0, + CLOCKS_CLK_HSTX_CTRL_AUXSRC_VALUE_CLK_SYS, + ROSC_HZ, + OTHER_CLK_DIV); +} +#endif + + +bi_decl(bi_ptr_int32(0, 0, otp_key_page, 30)); + +// The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins. +// That is a suitable point to lock the OTP area where key information is stored. 
+void lock_key() { + otp_hw->sw_lock[otp_key_page] = 0xf; + otp_hw->sw_lock[otp_key_page + 1] = 0xf; +} + + +int main() { + bi_decl(bi_ptr_int32(0, 0, data_start_addr, 0x20000000)); + bi_decl(bi_ptr_int32(0, 0, data_size, 0x78000)); + bi_decl(bi_ptr_string(0, 0, iv, "0123456789abcdef", 17)); + + // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors + uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE; + decrypt( + (uint8_t*)&(otp_data[otp_key_page * 0x40]), + (uint8_t*)&(otp_data[(otp_key_page + 2) * 0x40]), + (uint8_t*)iv, + (void*)data_start_addr, + data_size/16 + ); + + // Lock the IV salt + otp_hw->sw_lock[otp_key_page + 2] = 0xf; + + // Increase stack limit by 0x100 + pico_default_asm_volatile( + "mrs r0, msplim\n" + "subs r0, 0x100\n" + "msr msplim, r0" + :::"r0"); + + // Chain into decrypted image + rom_chain_image( + (uint8_t*)ROM_CHAIN_WORKSPACE, + 4 * 1024, + data_start_addr, + data_size + ); + + __breakpoint(); +} diff --git a/enc_bootloader/enc_bootloader.elf b/enc_bootloader/enc_bootloader.elf new file mode 100755 index 00000000..d7eb8e8a Binary files /dev/null and b/enc_bootloader/enc_bootloader.elf differ diff --git a/enc_bootloader/enc_bootloader_mbedtls.elf b/enc_bootloader/enc_bootloader_mbedtls.elf new file mode 100644 index 00000000..b3b2fbf5 Binary files /dev/null and b/enc_bootloader/enc_bootloader_mbedtls.elf differ diff --git a/enc_bootloader/mbedtls_aes.c b/enc_bootloader/mbedtls_aes.c new file mode 100644 index 00000000..e6c7ab30 --- /dev/null +++ b/enc_bootloader/mbedtls_aes.c @@ -0,0 +1,74 @@ +#include +#include "pico/stdlib.h" + +extern void lock_key(); + +int mb_aes_crypt_ctr_xor(mbedtls_aes_context *ctx, + size_t length, + unsigned char iv0[16], + unsigned char nonce_xor[16], + unsigned char stream_block[16], + const unsigned char *input, + unsigned char *output) +{ + int c; + int ret = 0; + size_t n = 0; + uint32_t counter = 0; + + assert(length == (uint32_t)length); + + while (length--) { + 
if (n == 0) { + for (int i = 16; i > 0; i--) { + nonce_xor[i-1] = iv0[i-1]; + if (i - (int)(16 - sizeof(counter)) > (int)0) { + nonce_xor[i-1] ^= (unsigned char)(counter >> ((16-i)*8)); + } + } + + ret = mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, nonce_xor, stream_block); + if (ret != 0) { + break; + } + counter++; + } + c = *input++; + *output++ = (unsigned char) (c ^ stream_block[n]); + + n = (n + 1) & 0x0F; + } + + return ret; +} + +void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk) { + mbedtls_aes_context aes; + + uint32_t aes_key[8]; + uint32_t* key4waywords = (uint32_t*)key4way; + // Key is stored as a 4-way share of each word, ie X[0] = A[0] ^ B[0] ^ C[0] ^ D[0], stored as A[0], B[0], C[0], D[0] + for (int i=0; i < count_of(aes_key); i++) { + int skip = (i/4)*16; // skip every other 16 words (64 bytes), due to the FIB workaround + aes_key[i] = key4waywords[i*4 + skip] + ^ key4waywords[i*4 + 1 + skip] + ^ key4waywords[i*4 + 2 + skip] + ^ key4waywords[i*4 + 3 + skip]; + } + + uint8_t iv[16]; + for (int i=0; i < sizeof(iv); i++) { + iv[i] = IV_OTPsalt[i] ^ IV_public[i]; + } + + int len = nblk * 16; + + mbedtls_aes_setkey_enc(&aes, (uint8_t*)aes_key, 256); + + lock_key(); + + uint8_t xor_working_block[16] = {0}; + uint8_t stream_block[16] = {0}; + size_t nc_off = 0; + mb_aes_crypt_ctr_xor(&aes, len, (uint8_t*)iv, xor_working_block, stream_block, (uint8_t*)buf, (uint8_t*)buf); +} diff --git a/enc_bootloader/mbedtls_config.h b/enc_bootloader/mbedtls_config.h new file mode 100644 index 00000000..87486cfc --- /dev/null +++ b/enc_bootloader/mbedtls_config.h @@ -0,0 +1,10 @@ +#ifndef _MBEDTLS_CONFIG_H +#define _MBEDTLS_CONFIG_H + +#define MBEDTLS_HAVE_ASM +#define MBEDTLS_AES_C +#define MBEDTLS_AES_ROM_TABLES +// #define MBEDTLS_AES_FEWER_TABLES +#define MBEDTLS_CIPHER_MODE_CTR + +#endif diff --git a/enc_bootloader/memmap_enc_bootloader.ld b/enc_bootloader/memmap_enc_bootloader.ld new file mode 100644 index 
00000000..6b08c5b3 --- /dev/null +++ b/enc_bootloader/memmap_enc_bootloader.ld @@ -0,0 +1,264 @@ +/* Based on GCC ARM embedded samples. + Defines the following symbols for use by code: + __exidx_start + __exidx_end + __etext + __data_start__ + __preinit_array_start + __preinit_array_end + __init_array_start + __init_array_end + __fini_array_start + __fini_array_end + __data_end__ + __bss_start__ + __bss_end__ + __end__ + end + __HeapLimit + __StackLimit + __StackTop + __stack (== StackTop) +*/ + +MEMORY +{ + RAM_START(rwx) : ORIGIN = 0x20080000, LENGTH = 0x44 + SCRATCH_X(rwx) : ORIGIN = 0x20080044, LENGTH = 0xFBC + SCRATCH_Y(rwx) : ORIGIN = 0x20081000, LENGTH = 0x800 + RAM(rwx) : ORIGIN = 0x20081800, LENGTH = 0x800 +} + +ENTRY(_entry_point) + +SECTIONS +{ + /* Note unlike RP2040, we start the image with a vector table even for + NO_FLASH builds. On Arm, the bootrom expects a VT at the start of the + image by default; on RISC-V, the default is to enter the image at its + lowest address, so an IMAGEDEF item is required to specify the + nondefault entry point. */ + + .start_text : { + __logical_binary_start = .; + /* Vectors require 512-byte alignment on v8-M when >48 IRQs are used, + so we would waste RAM if the vector table were not at the + start. */ + KEEP (*(.vectors)) + KEEP (*(.binary_info_header)) + __binary_info_header_end = .; + KEEP (*(.embedded_block)) + __embedded_block_end = .; + } > RAM_START + + .text : { + __reset_start = .; + KEEP (*(.reset)) + __reset_end = .; + *(.time_critical*) + *(.text*) + . = ALIGN(4); + *(.init) + *(.fini) + /* Pull all c'tors into .text */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + /* Followed by destructors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.eh_frame*) + } > RAM + + .rodata : { + . = ALIGN(4); + *(.rodata*) + *(.srodata*) + . 
= ALIGN(4); + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.flashdata*))) + . = ALIGN(4); + } > RAM + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > RAM + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > RAM + __exidx_end = .; + + /* Machine inspectable binary information */ + . = ALIGN(4); + __binary_info_start = .; + .binary_info : + { + KEEP(*(.binary_info.keep.*)) + *(.binary_info.*) + } > RAM + __binary_info_end = .; + . = ALIGN(4); + + .data : { + __data_start__ = .; + *(vtable) + *(.data*) + *(.sdata*) + + . = ALIGN(4); + *(.after_data.*) + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__mutex_array_start = .); + KEEP(*(SORT(.mutex_array.*))) + KEEP(*(.mutex_array)) + PROVIDE_HIDDEN (__mutex_array_end = .); + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(SORT(.preinit_array.*))) + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + *(SORT(.fini_array.*)) + *(.fini_array) + PROVIDE_HIDDEN (__fini_array_end = .); + + *(.jcr) + . = ALIGN(4); + } > RAM + + .tdata : { + . = ALIGN(4); + *(.tdata .tdata.* .gnu.linkonce.td.*) + /* All data end */ + __tdata_end = .; + } > RAM + PROVIDE(__data_end__ = .); + + .uninitialized_data (NOLOAD): { + . = ALIGN(4); + *(.uninitialized_data*) + } > RAM + /* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */ + __etext = LOADADDR(.data); + + .tbss (NOLOAD) : { + . = ALIGN(4); + __bss_start__ = .; + __tls_base = .; + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + + __tls_end = .; + } > RAM + + .bss (NOLOAD) : { + . 
= ALIGN(4); + __tbss_end = .; + + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*))) + *(COMMON) + PROVIDE(__global_pointer$ = . + 2K); + *(.sbss*) + . = ALIGN(4); + __bss_end__ = .; + } > RAM + + .heap (NOLOAD): + { + __end__ = .; + end = __end__; + KEEP(*(.heap*)) + } > RAM + /* historically on GCC sbrk was growing past __HeapLimit to __StackLimit, however + to be more compatible, we now set __HeapLimit explicitly to where the end of the heap is */ + __HeapLimit = ORIGIN(RAM) + LENGTH(RAM); + + /* Start and end symbols must be word-aligned */ + .scratch_x : { + __scratch_x_start__ = .; + *(.scratch_x.*) + . = ALIGN(4); + __scratch_x_end__ = .; + } > SCRATCH_X + __scratch_x_source__ = LOADADDR(.scratch_x); + + .scratch_y : { + __scratch_y_start__ = .; + *(.scratch_y.*) + . = ALIGN(4); + __scratch_y_end__ = .; + } > SCRATCH_Y + __scratch_y_source__ = LOADADDR(.scratch_y); + + /* .stack*_dummy section doesn't contains any symbols. It is only + * used for linker to calculate size of stack sections, and assign + * values to stack symbols later + * + * stack1 section may be empty/missing if platform_launch_core1 is not used */ + + /* by default we put core 0 stack at the end of scratch Y, so that if core 1 + * stack is not used then all of SCRATCH_X is free. 
+ */ + .stack1_dummy (NOLOAD): + { + *(.stack1*) + } > SCRATCH_X + .stack_dummy (NOLOAD): + { + KEEP(*(.stack*)) + } > SCRATCH_Y + + /* stack limit is poorly named, but historically is maximum heap ptr */ + __StackLimit = ORIGIN(RAM) + LENGTH(RAM); + __StackOneTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X); + __StackTop = ORIGIN(SCRATCH_Y) + LENGTH(SCRATCH_Y); + __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy); + __StackBottom = __StackTop - SIZEOF(.stack_dummy); + PROVIDE(__stack = __StackTop); + + /* picolibc and LLVM */ + PROVIDE (__heap_start = __end__); + PROVIDE (__heap_end = __HeapLimit); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __tls_size_align = (__tls_size + __tls_align - 1) & ~(__tls_align - 1)); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + + /* llvm-libc */ + PROVIDE (_end = __end__); + PROVIDE (__llvm_libc_heap_limit = __HeapLimit); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed") + + ASSERT( __binary_info_header_end - __logical_binary_start <= 1024, "Binary info must be in first 1024 bytes of the binary") + ASSERT( __embedded_block_end - __logical_binary_start <= 4096, "Embedded block must be in first 4096 bytes of the binary") + + /* todo assert on extra code */ + ASSERT(chaff==0x20081000, "Chaff array must be located at 0x20081000") + + /* Provide symbols for picotool */ + __enc_bootloader_start = ORIGIN(RAM_START); + __enc_bootloader_end = ORIGIN(RAM) + LENGTH(RAM); +} diff --git a/enc_bootloader/memmap_mbedtls.ld b/enc_bootloader/memmap_mbedtls.ld new file mode 100644 index 00000000..261e336b --- /dev/null +++ b/enc_bootloader/memmap_mbedtls.ld @@ -0,0 +1,263 @@ +/* Based on GCC ARM embedded samples. 
+ Defines the following symbols for use by code: + __exidx_start + __exidx_end + __etext + __data_start__ + __preinit_array_start + __preinit_array_end + __init_array_start + __init_array_end + __fini_array_start + __fini_array_end + __data_end__ + __bss_start__ + __bss_end__ + __end__ + end + __HeapLimit + __StackLimit + __StackTop + __stack (== StackTop) +*/ + +MEMORY +{ + RAM(rwx) : ORIGIN = 0x13ffc000, LENGTH = 16k + SCRATCH_X(rwx) : ORIGIN = 0x20080000, LENGTH = 4k + SCRATCH_Y(rwx) : ORIGIN = 0x20081000, LENGTH = 4k +} + +ENTRY(_entry_point) + +SECTIONS +{ + /* Note unlike RP2040, we start the image with a vector table even for + NO_FLASH builds. On Arm, the bootrom expects a VT at the start of the + image by default; on RISC-V, the default is to enter the image at its + lowest address, so an IMAGEDEF item is required to specify the + nondefault entry point. */ + + .start_text : { + __logical_binary_start = .; + /* Vectors require 512-byte alignment on v8-M when >48 IRQs are used, + so we would waste RAM if the vector table were not at the + start. */ + KEEP (*(.vectors)) + KEEP (*(.binary_info_header)) + __binary_info_header_end = .; + KEEP (*(.embedded_block)) + __embedded_block_end = .; + } > RAM + + .text : { + __reset_start = .; + KEEP (*(.reset)) + __reset_end = .; + *(.time_critical*) + *(.text*) + . = ALIGN(4); + *(.init) + *(.fini) + /* Pull all c'tors into .text */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + /* Followed by destructors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.eh_frame*) + } > RAM + + .rodata : { + . = ALIGN(4); + *(.rodata*) + *(.srodata*) + . = ALIGN(4); + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.flashdata*))) + . 
= ALIGN(4); + } > RAM + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > RAM + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > RAM + __exidx_end = .; + + /* Machine inspectable binary information */ + . = ALIGN(4); + __binary_info_start = .; + .binary_info : + { + KEEP(*(.binary_info.keep.*)) + *(.binary_info.*) + } > RAM + __binary_info_end = .; + . = ALIGN(4); + + .data : { + __data_start__ = .; + *(vtable) + *(.data*) + *(.sdata*) + + . = ALIGN(4); + *(.after_data.*) + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__mutex_array_start = .); + KEEP(*(SORT(.mutex_array.*))) + KEEP(*(.mutex_array)) + PROVIDE_HIDDEN (__mutex_array_end = .); + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(SORT(.preinit_array.*))) + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + *(SORT(.fini_array.*)) + *(.fini_array) + PROVIDE_HIDDEN (__fini_array_end = .); + + *(.jcr) + . = ALIGN(4); + } > RAM + + .tdata : { + . = ALIGN(4); + *(.tdata .tdata.* .gnu.linkonce.td.*) + /* All data end */ + __tdata_end = .; + } > RAM + PROVIDE(__data_end__ = .); + + .uninitialized_data (NOLOAD): { + . = ALIGN(4); + *(.uninitialized_data*) + } > RAM + /* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */ + __etext = LOADADDR(.data); + + .tbss (NOLOAD) : { + . = ALIGN(4); + __bss_start__ = .; + __tls_base = .; + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + + __tls_end = .; + } > RAM + + .bss (NOLOAD) : { + . = ALIGN(4); + __tbss_end = .; + + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*))) + *(COMMON) + PROVIDE(__global_pointer$ = . + 2K); + *(.sbss*) + . 
= ALIGN(4); + __bss_end__ = .; + } > RAM + + .heap (NOLOAD): + { + __end__ = .; + end = __end__; + KEEP(*(.heap*)) + } > RAM + /* historically on GCC sbrk was growing past __HeapLimit to __StackLimit, however + to be more compatible, we now set __HeapLimit explicitly to where the end of the heap is */ + __HeapLimit = ORIGIN(RAM) + LENGTH(RAM); + + /* Start and end symbols must be word-aligned */ + .scratch_x : { + __scratch_x_start__ = .; + *(.scratch_x.*) + . = ALIGN(4); + __scratch_x_end__ = .; + } > SCRATCH_X + __scratch_x_source__ = LOADADDR(.scratch_x); + + .scratch_y : { + __scratch_y_start__ = .; + *(.scratch_y.*) + . = ALIGN(4); + __scratch_y_end__ = .; + } > SCRATCH_Y + __scratch_y_source__ = LOADADDR(.scratch_y); + + /* .stack*_dummy section doesn't contains any symbols. It is only + * used for linker to calculate size of stack sections, and assign + * values to stack symbols later + * + * stack1 section may be empty/missing if platform_launch_core1 is not used */ + + /* by default we put core 0 stack at the end of scratch Y, so that if core 1 + * stack is not used then all of SCRATCH_X is free. 
+ */ + .stack1_dummy (NOLOAD): + { + *(.stack1*) + } > SCRATCH_X + .stack_dummy (NOLOAD): + { + KEEP(*(.stack*)) + } > SCRATCH_Y + + /* stack limit is poorly named, but historically is maximum heap ptr */ + __StackLimit = ORIGIN(RAM) + LENGTH(RAM); + __StackOneTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X); + __StackTop = ORIGIN(SCRATCH_Y) + LENGTH(SCRATCH_Y); + __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy); + __StackBottom = __StackTop - SIZEOF(.stack_dummy); + PROVIDE(__stack = __StackTop); + + /* picolibc and LLVM */ + PROVIDE (__heap_start = __end__); + PROVIDE (__heap_end = __HeapLimit); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __tls_size_align = (__tls_size + __tls_align - 1) & ~(__tls_align - 1)); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + + /* llvm-libc */ + PROVIDE (_end = __end__); + PROVIDE (__llvm_libc_heap_limit = __HeapLimit); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed") + + ASSERT( __binary_info_header_end - __logical_binary_start <= 1024, "Binary info must be in first 1024 bytes of the binary") + ASSERT( __embedded_block_end - __logical_binary_start <= 4096, "Embedded block must be in first 4096 bytes of the binary") + + /* todo assert on extra code */ + + /* Provide symbols for picotool */ + __enc_bootloader_start = ORIGIN(RAM); + __enc_bootloader_end = ORIGIN(RAM) + LENGTH(RAM); +} + diff --git a/get_enc_bootloader.cpp b/get_enc_bootloader.cpp new file mode 100644 index 00000000..bac91d50 --- /dev/null +++ b/get_enc_bootloader.cpp @@ -0,0 +1,52 @@ + +#include +#include +#include +#include +#include + +#include "get_enc_bootloader.h" +#include "enc_bootloader_elf.h" +#if HAS_MBEDTLS +#include "enc_bootloader_mbedtls_elf.h" +#endif + +#include "data_locs.h" + +#include "whereami++.h" + + +std::shared_ptr get_enc_bootloader(bool use_mbedtls) { + // search same directory as executable + whereami::whereami_path_t 
executablePath = whereami::getExecutablePath(); + std::string local_loc = executablePath.dirname() + "/"; + if (std::find(data_locs.begin(), data_locs.end(), local_loc) == data_locs.end()) { + data_locs.insert(data_locs.begin(), local_loc); + } + + for (auto loc : data_locs) { + std::string filename = loc + + "enc_bootloader" + + (use_mbedtls ? "_mbedtls" : "") + + ".elf"; + std::ifstream i(filename); + if (i.good()) { + printf("Picking file %s\n", filename.c_str()); + auto file = std::make_shared(filename, std::ios::in|std::ios::binary); + return file; + } + } + + // fall back to embedded enc_bootloader.elf file + printf("Could not find enc_bootloader%s.elf file - using embedded binary\n", use_mbedtls ? "_mbedtls" : ""); + auto tmp = std::make_shared(); +#if HAS_MBEDTLS + if (use_mbedtls) { + tmp->write(reinterpret_cast(enc_bootloader_mbedtls_elf), enc_bootloader_mbedtls_elf_SIZE); + } else +#endif + { + tmp->write(reinterpret_cast(enc_bootloader_elf), enc_bootloader_elf_SIZE); + } + return tmp; +} diff --git a/get_enc_bootloader.h b/get_enc_bootloader.h new file mode 100644 index 00000000..07684be7 --- /dev/null +++ b/get_enc_bootloader.h @@ -0,0 +1,12 @@ +/* + * Copyright (c) 2020 Raspberry Pi (Trading) Ltd. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#pragma once + +#include +#include + +std::shared_ptr get_enc_bootloader(bool use_mbedtls = false); diff --git a/xip_ram_perms.cpp b/get_xip_ram_perms.cpp similarity index 97% rename from xip_ram_perms.cpp rename to get_xip_ram_perms.cpp index 1f6260d2..b940bb65 100644 --- a/xip_ram_perms.cpp +++ b/get_xip_ram_perms.cpp @@ -5,7 +5,7 @@ #include #include -#include "xip_ram_perms.h" +#include "get_xip_ram_perms.h" #include "xip_ram_perms_elf.h" #include "data_locs.h" diff --git a/xip_ram_perms.h b/get_xip_ram_perms.h similarity index 100% rename from xip_ram_perms.h rename to get_xip_ram_perms.h diff --git a/lib/include/mbedtls_config.h b/lib/include/mbedtls_config.h index 0410687b..aed82466 100644 --- a/lib/include/mbedtls_config.h +++ b/lib/include/mbedtls_config.h @@ -1 +1 @@ -#include "picotool_mbedtls_config.h" \ No newline at end of file +#include "picotool_mbedtls_config.h" diff --git a/main.cpp b/main.cpp index 7b7fd48d..6450dd88 100644 --- a/main.cpp +++ b/main.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #if !defined(__APPLE__) && !defined(__FreeBSD__) #include #endif @@ -33,10 +34,11 @@ #include "boot/uf2.h" #include "boot/picobin.h" +#include "get_enc_bootloader.h" #if HAS_LIBUSB #include "picoboot_connection_cxx.h" #include "rp2350.rom.h" - #include "xip_ram_perms.h" + #include "get_xip_ram_perms.h" #else #include "picoboot_connection.h" #endif @@ -80,6 +82,7 @@ static __forceinline int __builtin_ctz(unsigned x) { #define OTP_PAGE_COUNT 64 #define OTP_PAGE_ROWS 64 #define OTP_ROW_COUNT (OTP_PAGE_COUNT * OTP_PAGE_ROWS) +#define OTP_SPECIAL_PAGES 3 using std::string; using std::vector; @@ -245,6 +248,41 @@ template struct range_map { } } + void insert_overwrite(const range& r, T t) { + if (r.to != r.from) { + assert(r.to > r.from); + // insert overlapping entry, and overwrite any it overlaps + + // avoid modifying m while iterating through it + vector to_erase; + vector>> to_add; + + auto 
f = m.upper_bound(r.from); // first element that starts after r.from + if (f != m.begin()) f--; // back up, to catch element that starts on or before r.from + for(; f != m.end() && f->first < r.to; f++) { // loop till we can't possibly overlap + range r2(f->first, f->second.first); + T r2off = f->second.second; + if (r2.intersects(r)) { + // remove existing r2 + to_erase.push_back(r2.from); + if (r2.from < r.from) { + // add r2 which ends at start of r + to_add.push_back(std::make_pair(r2.from, std::make_pair(r.from, r2off))); + } + if (r2.to > r.to) { + // add r2 which starts at end of r + to_add.push_back(std::make_pair(r.to, std::make_pair(r2.to, r2off + (r.to - r2.from)))); + } + } + } + for (auto k : to_erase) m.erase(k); + for (auto v : to_add) m.insert(v); + + // finally, add the new entry + m.insert(std::make_pair(r.from, std::make_pair(r.to, t))); + } + } + pair get(uint32_t p) { auto f = m.upper_bound(p); if (f == m.end()) { @@ -397,8 +435,8 @@ struct multi_cmd : public cmd { }; struct _settings { - std::array filenames; - std::array file_types; + std::array filenames; + std::array file_types; uint32_t binary_start = FLASH_START; int bus=-1; int address=-1; @@ -439,6 +477,7 @@ struct _settings { std::vector selectors; uint32_t row = 0; std::vector extra_files; + bool dump_pages = false; } otp; struct { @@ -471,12 +510,21 @@ struct _settings { bool hash = false; bool sign = false; bool clear_sram = false; + bool set_tbyb = false; uint16_t major_version = 0; uint16_t minor_version = 0; uint16_t rollback_version = 0; std::vector rollback_rows; } seal; + struct { + bool embed = false; + bool otp_key_page_set = false; + bool fast_rosc = false; + bool use_mbedtls = false; + uint16_t otp_key_page = 29; + } encrypt; + struct { uint32_t align = 0x1000; } link; @@ -560,6 +608,13 @@ auto device_selection = named_file_types_x(types, i)\ ) +#define named_untyped_file_selection_x(name, i)\ +(\ + value(name).with_exclusion_filter([](const string &value) {\ + return 
value.find_first_of('-') == 0;\ + }).set(settings.filenames[i]) % "The file name"\ +) + #define optional_file_selection_x(name, i)\ (\ value(name).with_exclusion_filter([](const string &value) {\ @@ -576,6 +631,13 @@ auto device_selection = named_file_types_x(types, i)\ ).min(0).doc_non_optional(true) +#define optional_untyped_file_selection_x(name, i)\ +(\ + value(name).with_exclusion_filter([](const string &value) {\ + return value.find_first_of('-') == 0;\ + }).set(settings.filenames[i]).min(0) % "The file name"\ +).min(0).doc_non_optional(true) + #define option_file_selection_x(option, i)\ (\ option & value("filename").with_exclusion_filter([](const string &value) {\ @@ -785,6 +847,13 @@ struct encrypt_command : public cmd { return ( option("--quiet").set(settings.quiet) % "Don't print any output" + option("--verbose").set(settings.verbose) % "Print verbose output" + + option("--embed").set(settings.encrypt.embed) % "Embed bootloader in output file" + + option("--fast-rosc").set(settings.encrypt.fast_rosc) % "Use ~180MHz ROSC configuration for embedded bootloader" + + option("--use-mbedtls").set(settings.encrypt.use_mbedtls) % "Use MbedTLS implementation of embedded bootloader (faster but less secure)" + + ( + option("--otp-key-page").set(settings.encrypt.otp_key_page_set) % "Specify the OTP page storing the AES key (IV salt is stored on the next page)" & + integer("page").set(settings.encrypt.otp_key_page) % "OTP page (default 30)" + ).force_expand_help(true) + ( option("--hash").set(settings.seal.hash) % "Hash the encrypted file" + option("--sign").set(settings.seal.sign) % "Sign the encrypted file" @@ -795,8 +864,10 @@ struct encrypt_command : public cmd { hex("offset").set(settings.offset) % "Load offset (memory address; default 0x10000000)" ).force_expand_help(true) % "BIN file options" + named_file_selection_x("outfile", 1) % "File to save to" + - named_typed_file_selection_x("aes_key", 2, "bin") % "AES Key" + - 
optional_typed_file_selection_x("signing_key", 3, "pem") % "Signing Key file" + named_untyped_file_selection_x("aes_key", 2) % "AES Key Share or AES Key" + + named_untyped_file_selection_x("iv_salt", 3) % "IV Salt" + + optional_untyped_file_selection_x("signing_key", 4) % "Signing Key file (.pem)" + + optional_untyped_file_selection_x("otp", 5) % "JSON file to save OTP to (will edit existing file if it exists)" ); } @@ -825,8 +896,8 @@ struct seal_command : public cmd { hex("offset").set(settings.offset) % "Load offset (memory address; default 0x10000000)" ).force_expand_help(true) % "BIN file options" + named_file_selection_x("outfile", 1) % "File to save to" + - optional_typed_file_selection_x("key", 2, "pem") % "Key file" + - optional_typed_file_selection_x("otp", 3, "json") % "File to save OTP to (will edit existing file if it exists)" + + optional_untyped_file_selection_x("key", 2) % "Key file (.pem)" + + optional_untyped_file_selection_x("otp", 3) % "JSON file to save OTP to (will edit existing file if it exists)" + ( option("--major") & integer("major").set(settings.seal.major_version) @@ -898,7 +969,7 @@ struct partition_create_command : public cmd { return ( option("--quiet").set(settings.quiet) % "Don't print any output" + option("--verbose").set(settings.verbose) % "Print verbose output" + - named_typed_file_selection_x("infile", 0, "json") % "partition table JSON" + + named_untyped_file_selection_x("infile", 0) % "partition table JSON" + (named_file_selection_x("outfile", 1) % "output file" + ( (option('o', "--offset").set(settings.offset_set) % "Specify the load address for UF2 file output" & @@ -1029,7 +1100,8 @@ struct otp_dump_command : public cmd { return ( ( option('r', "--raw").set(settings.otp.raw) % "Get raw 24-bit values. 
This is the default" + - option('e', "--ecc").set(settings.otp.ecc) % "Use error correction" + option('e', "--ecc").set(settings.otp.ecc) % "Use error correction" + + option('p', "--pages").set(settings.otp.dump_pages) % "Index by page number & row number" ).min(0).doc_non_optional(true) % "Row/field options" + ( device_selection % "Target device selection" @@ -1113,12 +1185,12 @@ struct otp_permissions_command : public cmd { group get_cli() override { return ( - named_typed_file_selection_x("filename", 0, "json") % "File to load permissions from" + + named_untyped_file_selection_x("filename", 0) % "JSON file to load permissions from" + (option("--led") & integer("pin").set(settings.otp.led_pin)) % "LED Pin to flash; default 25" + ( option("--hash").set(settings.seal.hash) % "Hash the executable" + option("--sign").set(settings.seal.sign) % "Sign the executable" + - optional_typed_file_selection_x("key", 2, "pem") % "Key file" + optional_untyped_file_selection_x("key", 2) % "Key file (.pem)" ).min(0).doc_non_optional(true) % "Signing Configuration" + device_selection % "Target device selection" ); @@ -1140,7 +1212,7 @@ struct otp_white_label_command : public cmd { ( (option('s', "--start_row") & integer("row").set(settings.otp.row)) % "Start row for white label struct (default 0x100) (note use 0x for hex)" ).min(0).doc_non_optional(true) % "Row options" + - named_typed_file_selection_x("filename", 0, "json") % "File with white labelling values" + + named_untyped_file_selection_x("filename", 0) % "JSON file with white labelling values" + device_selection % "Target device selection" ); } @@ -1243,8 +1315,8 @@ struct coprodis_command : public cmd { return ( option("--quiet").set(settings.quiet) % "Don't print any output" + option("--verbose").set(settings.verbose) % "Print verbose output" + - named_file_selection_x("infile", 0) % "Input DIS" + - named_file_selection_x("outfile", 1) % "Output DIS" + named_untyped_file_selection_x("infile", 0) % "Input DIS" + + 
named_untyped_file_selection_x("outfile", 1) % "Output DIS" ); } @@ -2653,6 +2725,27 @@ uint32_t guess_flash_size(memory_access &access) { return size * 2; } +// returns true if string is a hex string, and fills array with the values +bool string_to_hex_array(const string& str, uint8_t *array, size_t size, const string& error_msg) { + + if (!str.empty() && str.find("0x") == 0) { + // Hex string instead of file + if (str.size() != size*2 + 2) { + fail(ERROR_ARGS, "%s hex string must be %d characters long (the supplied string is %d characters)", error_msg.c_str(), size*2, str.size() - 2); + } + for (size_t i=0; i < size; i++) { + auto value = "0x" + str.substr(2 + i*2, 2); + auto ret = integer::parse_string(value, array[i]); + if (!ret.empty()) { + fail(ERROR_ARGS, "Invalid hex string: %s %s", value.c_str(), ret.c_str()); + } + } + return true; + } + + return false; +} + std::shared_ptr get_file_idx(ios::openmode mode, uint8_t idx) { auto filename = settings.filenames[idx]; auto file = std::make_shared(filename, mode); @@ -2789,7 +2882,12 @@ void build_rmap_load_map(std::shared_ptrload_map, range_mapentries.size(); i++) { auto e = load_map->entries[i]; if (e.storage_address != 0) { - rmap.insert(range(e.runtime_address, e.runtime_address + e.size), e.storage_address); + try { + rmap.insert(range(e.runtime_address, e.runtime_address + e.size), e.storage_address); + } catch (command_failure&) { + // Overlapping memory ranges are permitted in a load_map, so overwrite overlapping range + rmap.insert_overwrite(range(e.runtime_address, e.runtime_address + e.size), e.storage_address); + } } } } @@ -4788,104 +4886,48 @@ bool load_command::execute(device_map &devices) { } #endif -#if HAS_MBEDTLS -bool encrypt_command::execute(device_map &devices) { - bool isElf = false; - bool isBin = false; - if (get_file_type() == filetype::elf) { - isElf = true; - } else if (get_file_type() == filetype::bin) { - isBin = true; - } else { - fail(ERROR_ARGS, "Can only sign ELFs or BINs"); - } 
- - if (get_file_type_idx(1) != get_file_type()) { - fail(ERROR_ARGS, "Can only sign to same file type"); - } - - if (get_file_type_idx(2) != filetype::bin) { - fail(ERROR_ARGS, "Can only read AES key from BIN file"); - } - - if (settings.seal.sign && settings.filenames[3].empty()) { - fail(ERROR_ARGS, "missing key file for signing after encryption"); - } - - if (!settings.filenames[3].empty() && get_file_type_idx(3) != filetype::pem) { - fail(ERROR_ARGS, "Can only read pem keys"); - } - - - auto aes_file = get_file_idx(ios::in|ios::binary, 2); - - private_t aes_key; - aes_file->read((char*)aes_key.bytes, sizeof(aes_key.bytes)); +static uint32_t even_parity(uint32_t input) { + return __builtin_popcount(input) & 1; +} - private_t private_key = {}; - public_t public_key = {}; - - if (settings.seal.sign) read_keys(settings.filenames[3], &public_key, &private_key); - - if (isElf) { - elf_file source_file(settings.verbose); - elf_file *elf = &source_file; - elf->read_file(get_file(ios::in|ios::binary)); - // Remove any holes in the ELF file, as these cause issues when encrypting - elf->remove_sh_holes(); - - std::unique_ptr first_block = find_first_block(elf); - if (!first_block) { - fail(ERROR_FORMAT, "No first block found"); - } - elf->editable = false; - block new_block = place_new_block(elf, first_block); - elf->editable = true; - - encrypt(elf, &new_block, aes_key, public_key, private_key, settings.seal.hash, settings.seal.sign); - - auto out = get_file_idx(ios::out|ios::binary, 1); - elf->write(out); - out->close(); - } else if (isBin) { - auto binfile = get_file_memory_access(0); - auto rmap = binfile.get_rmap(); - auto ranges = rmap.ranges(); - assert(ranges.size() == 1); - auto bin_start = ranges[0].from; - auto bin_size = ranges[0].len(); - - vector bin = binfile.read_vector(bin_start, bin_size, false); - - std::unique_ptr first_block = find_first_block(bin, bin_start); - if (!first_block) { - fail(ERROR_FORMAT, "No first block found"); - } - auto bin_cp = 
bin; - block new_block = place_new_block(bin_cp, bin_start, first_block); - - auto enc_data = encrypt(bin, bin_start, bin_start, &new_block, aes_key, public_key, private_key, settings.seal.hash, settings.seal.sign); - - auto out = get_file_idx(ios::out|ios::binary, 1); - out->write((const char *)enc_data.data(), enc_data.size()); - out->close(); - } else { - fail(ERROR_ARGS, "Must be ELF or BIN"); - } - - return false; +// In: 16-bit unsigned integer. Out: 22-bit unsigned integer. +uint32_t __noinline otp_calculate_ecc(uint16_t x) { + // Source: db_shf40_ap_ab.pdf, page 25, "TABLE 9: PARITY BIT GENERATION MAP + // FOR 16 BIT USER DATA (X24 SHF MACROCELL)" + // https://drive.google.com/drive/u/1/folders/1jgU3tZt2BDeGkWUFhi6KZAlaYUpGrFaG + uint32_t p0 = even_parity(x & 0b1010110101011011); + uint32_t p1 = even_parity(x & 0b0011011001101101); + uint32_t p2 = even_parity(x & 0b1100011110001110); + uint32_t p3 = even_parity(x & 0b0000011111110000); + uint32_t p4 = even_parity(x & 0b1111100000000000); + uint32_t p5 = even_parity(x) ^ p0 ^ p1 ^ p2 ^ p3 ^ p4; + uint32_t p = p0 | (p1 << 1) | (p2 << 2) | (p3 << 3) | (p4 << 4) | (p5 << 5); + return x | (p << 16); } + #if HAS_MBEDTLS void sign_guts_elf(elf_file* elf, private_t private_key, public_t public_key) { std::unique_ptr first_block = find_first_block(elf); if (!first_block) { - fail(ERROR_FORMAT, "No first block found"); + // Throw a clearer error for RP2040 binaries with no block loop + auto family_id = get_family_id(0); + if (family_id == RP2040_FAMILY_ID) { + fail(ERROR_FORMAT, "No metadata block found when sealing RP2040 binary - either use RP2350, or set PICO_CRT0_INCLUDE_PICOBIN_BLOCK=1"); + } else { + fail(ERROR_FORMAT, "No metadata block found"); + } } block new_block = place_new_block(elf, first_block); + if (settings.seal.set_tbyb) { + // Set the TBYB bit on the image_type_item + std::shared_ptr image_type = new_block.get_item(); + image_type->flags |= PICOBIN_IMAGE_TYPE_EXE_TBYB_BITS; + } + if 
(settings.seal.major_version || settings.seal.minor_version || settings.seal.rollback_version) { std::shared_ptr version = new_block.get_item(); if (version != nullptr) { @@ -4927,9 +4969,12 @@ void sign_guts_elf(elf_file* elf, private_t private_key, public_t public_key) { } } } - auto segment = elf->segment_from_physical_address(vtor_loc); + auto segment = elf->segment_from_virtual_address(vtor_loc); + if (segment == nullptr) { + fail(ERROR_NOT_POSSIBLE, "The ELF file does not contain the vector table location %x", vtor_loc); + } auto content = elf->content(*segment); - auto offset = vtor_loc - segment->physical_address(); + auto offset = vtor_loc - segment->virtual_address(); uint32_t ep; memcpy(&ep, content.data() + offset + 4, sizeof(ep)); uint32_t sp; @@ -4952,7 +4997,13 @@ vector sign_guts_bin(iostream_memory_access in, private_t private_key, std::unique_ptr first_block = find_first_block(bin, bin_start); if (!first_block) { - fail(ERROR_FORMAT, "No first block found"); + // Throw a clearer error for RP2040 binaries with no block loop + auto family_id = get_family_id(0); + if (family_id == RP2040_FAMILY_ID) { + fail(ERROR_FORMAT, "No metadata block found when sealing RP2040 binary - either use RP2350, or set PICO_CRT0_INCLUDE_PICOBIN_BLOCK"); + } else { + fail(ERROR_FORMAT, "No metadata block found"); + } } block new_block = place_new_block(bin, bin_start, first_block); @@ -5006,7 +5057,408 @@ vector sign_guts_bin(iostream_memory_access in, private_t private_key, return sig_data; } -#endif + +bool encrypt_command::execute(device_map &devices) { + bool isElf = false; + bool isBin = false; + + bool keyFromFile = true; + bool keyIsShare = false; + bool ivFromFile = true; + + aes_key_t aes_key; + aes_key_share_t aes_key_share; + std::vector iv_salt; + iv_salt.resize(16); + + if (get_file_type() == filetype::elf) { + isElf = true; + } else if (get_file_type() == filetype::bin) { + if (settings.encrypt.embed) { + fail(ERROR_ARGS, "Can only embed decrypting 
bootloader into ELFs"); + } + isBin = true; + } else { + fail(ERROR_ARGS, "Can only sign ELFs or BINs"); + } + + if (get_file_type_idx(1) != get_file_type()) { + fail(ERROR_ARGS, "Can only sign to same file type"); + } + + if (string_to_hex_array(settings.filenames[2], aes_key.bytes, sizeof(aes_key.bytes), "AES key")) { + keyFromFile = false; + } else if (get_file_type_idx(2) != filetype::bin) { + fail(ERROR_ARGS, "Can only read AES key or AES key share from BIN file"); + } + + if (string_to_hex_array(settings.filenames[3], iv_salt.data(), iv_salt.size(), "IV OTP salt")) { + ivFromFile = false; + } else if (get_file_type_idx(3) != filetype::bin) { + if (get_file_type_idx(3) == filetype::pem) { + // picotool encrypt <=2.1.1 would take PEM key file in the location of the IV OTP salt + fail(ERROR_ARGS, "This picotool version (%s) is not compatible with SDK versions <=2.1.1 - you must manually build & install picotool version 2.1.1 to use those SDK versions with encryption", PICOTOOL_VERSION); + } + fail(ERROR_ARGS, "Can only read IV OTP salt from BIN file"); + } + + if (settings.seal.sign && settings.filenames[4].empty()) { + fail(ERROR_ARGS, "missing key file for signing after encryption"); + } + + if (!settings.filenames[4].empty() && get_file_type_idx(4) != filetype::pem) { + fail(ERROR_ARGS, "Can only read pem keys"); + } + + if (keyFromFile) { + auto aes_file = get_file_idx(ios::in|ios::binary, 2); + aes_file->exceptions(std::iostream::failbit | std::iostream::badbit); + aes_file->seekg(0, std::ios::end); + auto aes_key_file_size = aes_file->tellg(); + if (aes_key_file_size == 32) { + keyIsShare = false; + aes_file->seekg(0, std::ios::beg); + aes_file->read((char*)aes_key.bytes, sizeof(aes_key.bytes)); + } else if (aes_key_file_size == 128) { + keyIsShare = true; + aes_file->seekg(0, std::ios::beg); + aes_file->read((char*)aes_key_share.bytes, sizeof(aes_key_share.bytes)); + } else { + fail(ERROR_INCOMPATIBLE, "The AES key file must be a 128 byte key share, or a 
32 byte key (the supplied file is %d bytes)", aes_key_file_size); + } + } + + if (!keyIsShare) { + // Generate a random key share from 256-bit key + std::random_device rand{}; + assert(rand.max() - rand.min() >= 256); + for(int i=0; i < 8; i++) { + for (int j=0; j < 12; j++) { + aes_key_share.bytes[i*16 + j] = rand(); + } + aes_key_share.words[i*4 + 3] = aes_key.words[i] + ^ aes_key_share.words[i*4] + ^ aes_key_share.words[i*4 + 1] + ^ aes_key_share.words[i*4 + 2]; + } + } + + // Key is stored as a 4-way share of each word, ie X[0] = A[0] ^ B[0] ^ C[0] ^ D[0], stored as A[0], B[0], C[0], D[0] + for (int i=0; i < count_of(aes_key.words); i++) { + aes_key.words[i] = aes_key_share.words[i*4] + ^ aes_key_share.words[i*4 + 1] + ^ aes_key_share.words[i*4 + 2] + ^ aes_key_share.words[i*4 + 3]; + } + + private_t private_key = {}; + public_t public_key = {}; + + if (settings.seal.sign) read_keys(settings.filenames[4], &public_key, &private_key); + + // Read IV Salt + if (ivFromFile) { + auto iv_salt_file = get_file_idx(ios::in|ios::binary, 3); + iv_salt_file->exceptions(std::iostream::failbit | std::iostream::badbit); + iv_salt_file->seekg(0, std::ios::end); + if (iv_salt_file->tellg() != 16) { + fail(ERROR_INCOMPATIBLE, "The IV OTP salt must be a 16 byte file (the supplied file is %d bytes)", iv_salt_file->tellg()); + } + iv_salt_file->seekg(0, std::ios::beg); + iv_salt_file->read((char*)iv_salt.data(), iv_salt.size()); + } + + if (isElf) { + elf_file source_file(settings.verbose); + elf_file *elf = &source_file; + elf->read_file(get_file(ios::in|ios::binary)); + // Remove any holes in the ELF file, as these cause issues when encrypting + elf->remove_sh_holes(); + + std::unique_ptr first_block = find_first_block(elf); + if (!first_block) { + fail(ERROR_FORMAT, "No first block found"); + } + elf->editable = false; + block new_block = place_new_block(elf, first_block); + elf->editable = true; + + // Delete existing load_map, as it will be invalid after encryption + 
std::shared_ptr load_map = new_block.get_item(); + if (load_map != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), load_map), new_block.items.end()); + } + + if (settings.encrypt.embed) { + std::vector iv_data; + std::vector enc_data; + uint32_t data_start_address = SRAM_START; + encrypt_guts(elf, &new_block, aes_key, iv_data, enc_data); + + // Salt IV + assert(iv_data.size() == iv_salt.size()); + for (int i=0; i < iv_data.size(); i++) { + iv_data[i] ^= iv_salt[i]; + } + auto tmp = std::make_shared(); + auto file = get_enc_bootloader(settings.encrypt.use_mbedtls); + *tmp << file->rdbuf(); + + auto program = get_iostream_memory_access(tmp, filetype::elf, true); + program.set_model(rp2350); + + // data_start_addr + settings.config.key = "data_start_addr"; + settings.config.value = hex_string(data_start_address); + config_guts(program); + // data_size + settings.config.key = "data_size"; + settings.config.value = hex_string(enc_data.size()); + config_guts(program); + // iv + { + string s((char*)iv_data.data(), iv_data.size()); + settings.config.key = "iv"; + settings.config.value = s; + config_guts(program); + } + // otp_key_page + if (settings.encrypt.otp_key_page_set) { + settings.config.key = "otp_key_page"; + settings.config.value = hex_string(settings.encrypt.otp_key_page); + config_guts(program); + } + + // fast rosc + if (settings.encrypt.fast_rosc) { + settings.config.key = "rosc_div"; + settings.config.value = "0x1"; + config_guts(program); + settings.config.key = "rosc_drive"; + settings.config.value = "0x0000"; + config_guts(program); + } + + elf_file source_file(settings.verbose); + elf_file *enc_elf = &source_file; + enc_elf->read_file(tmp); + + // Bootloader size + auto bootloader_start = enc_elf->get_symbol("__enc_bootloader_start"); + auto bootloader_end = enc_elf->get_symbol("__enc_bootloader_end"); + uint32_t bootloader_size = bootloader_end - bootloader_start; + + // Move bootloader down in physical 
space to start of SRAM (which will be start of flash once packaged) + enc_elf->move_all(data_start_address - bootloader_start); + + // Add encrypted blob + enc_elf->append_segment(data_start_address, data_start_address + bootloader_size, enc_data.size(), ".enc_data"); + auto data_section = enc_elf->get_section(".enc_data"); + assert(data_section); + assert(data_section->virtual_address() == data_start_address); + + if (data_section->size < enc_data.size()) { + fail(ERROR_UNKNOWN, "Block is too big for elf section\n"); + } + + DEBUG_LOG("Adding enc_data len %d\n", (int)enc_data.size()); + for (auto x : enc_data) DEBUG_LOG("%02x", x); + DEBUG_LOG("\n"); + + enc_elf->content(*data_section, enc_data); + + // Get the version from the encrypted binary + std::shared_ptr version = new_block.get_item(); + if (version != nullptr) { + settings.seal.major_version = version->major; + settings.seal.minor_version = version->minor; + settings.seal.rollback_version = version->rollback; + for (auto row : version->otp_rows) { + settings.seal.rollback_rows.push_back(row); + } + } + + // Get the TBYB from the encrypted binary + std::shared_ptr image_type = new_block.get_item(); + if (image_type->tbyb()) { + settings.seal.set_tbyb = true; + } + + // Sign the final thing + settings.seal.clear_sram = true; + sign_guts_elf(enc_elf, private_key, public_key); + + auto out = get_file_idx(ios::out|ios::binary, 1); + enc_elf->write(out); + out->close(); + } else { + encrypt(elf, &new_block, aes_key, public_key, private_key, iv_salt, settings.seal.hash, settings.seal.sign); + auto out = get_file_idx(ios::out|ios::binary, 1); + elf->write(out); + out->close(); + } + } else if (isBin) { + auto binfile = get_file_memory_access(0); + auto rmap = binfile.get_rmap(); + auto ranges = rmap.ranges(); + assert(ranges.size() == 1); + auto bin_start = ranges[0].from; + auto bin_size = ranges[0].len(); + + vector bin = binfile.read_vector(bin_start, bin_size, false); + + std::unique_ptr first_block = 
find_first_block(bin, bin_start); + if (!first_block) { + fail(ERROR_FORMAT, "No first block found"); + } + auto bin_cp = bin; + block new_block = place_new_block(bin_cp, bin_start, first_block); + + // Delete existing load_map, as it will be invalid after encryption + std::shared_ptr load_map = new_block.get_item(); + if (load_map != nullptr) { + new_block.items.erase(std::remove(new_block.items.begin(), new_block.items.end(), load_map), new_block.items.end()); + } + + auto enc_data = encrypt(bin, bin_start, bin_start, &new_block, aes_key, public_key, private_key, iv_salt, settings.seal.hash, settings.seal.sign); + + auto out = get_file_idx(ios::out|ios::binary, 1); + out->write((const char *)enc_data.data(), enc_data.size()); + out->close(); + } else { + fail(ERROR_ARGS, "Must be ELF or BIN"); + } + + if (!settings.filenames[5].empty()) { + if (get_file_type_idx(5) != filetype::json) { + fail(ERROR_ARGS, "Can only output OTP json"); + } + auto check_json_file = std::ifstream(settings.filenames[5]); + json otp_json; + if (check_json_file.good()) { + otp_json = json::parse(check_json_file); + DEBUG_LOG("Appending to existing otp json\n"); + check_json_file.close(); + } + auto json_out = get_file_idx(ios::out, 5); + + #define FIB_WORKAROUND 1 + #if FIB_WORKAROUND + // Make inverse pages to work around OTP FIB attack + vector page0_data; + page0_data.resize(64); + vector page1_data; + page1_data.resize(64); + vector page2_data; + page2_data.resize(iv_salt.size()); + + // Inverse pages need to be raw, to invert the ECC bits too + vector page0_inverse; + page0_inverse.resize(page0_data.size()*2); + vector page1_inverse; + page1_inverse.resize(page1_data.size()*2); + vector page2_inverse; + page2_inverse.resize(page2_data.size()*2); + + memcpy(page0_data.data(), aes_key_share.bytes, 64); + memcpy(page1_data.data(), aes_key_share.bytes + 64, 64); + memcpy(page2_data.data(), iv_salt.data(), iv_salt.size()); + + // The bits in rows 32-63 must be the inverse of the bits in 
rows 0-31 + for (int i = 0; i < page0_data.size(); i += 2) { + page0_inverse[i*2] = ~page0_data[i]; + page0_inverse[i*2+1] = ~page0_data[i+1]; + page0_inverse[i*2+2] = ~otp_calculate_ecc(*(uint16_t*)&page0_data[i]) >> 16; + } + for (int i = 0; i < page1_data.size(); i += 2) { + page1_inverse[i*2] = ~page1_data[i]; + page1_inverse[i*2+1] = ~page1_data[i+1]; + page1_inverse[i*2+2] = ~otp_calculate_ecc(*(uint16_t*)&page1_data[i]) >> 16; + } + for (int i = 0; i < page2_data.size(); i += 2) { + page2_inverse[i*2] = ~page2_data[i]; + page2_inverse[i*2+1] = ~page2_data[i+1]; + page2_inverse[i*2+2] = ~otp_calculate_ecc(*(uint16_t*)&page2_data[i]) >> 16; + } + + // Add otp AES key pages + for (int i = 0; i < page0_data.size(); i++) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page << ":0"; + otp_json[ss.str()]["ecc"] = true; + otp_json[ss.str()]["value"][i] = page0_data[i]; + } + for (int i = 0; i < page1_data.size(); i++) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page + 1 << ":0"; + otp_json[ss.str()]["ecc"] = true; + otp_json[ss.str()]["value"][i] = page1_data[i]; + } + + // Add otp IV salt page + for (int i = 0; i < page2_data.size(); i++) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page + 2 << ":0"; + otp_json[ss.str()]["ecc"] = true; + otp_json[ss.str()]["value"][i] = page2_data[i]; + } + + // Add inverse pages + for (int i = 0; i < page0_inverse.size(); i++) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page << ":32"; + otp_json[ss.str()]["ecc"] = false; + otp_json[ss.str()]["value"][i] = page0_inverse[i]; + } + for (int i = 0; i < page1_inverse.size(); i++) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page + 1 << ":32"; + otp_json[ss.str()]["ecc"] = false; + otp_json[ss.str()]["value"][i] = page1_inverse[i]; + } + for (int i = 0; i < page2_inverse.size(); i++) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page + 2 << ":32"; + otp_json[ss.str()]["ecc"] = false; + 
otp_json[ss.str()]["value"][i] = page2_inverse[i]; + } + #else + // Add otp AES key page + for (int i = 0; i < 128; ++i) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page << ":0"; + otp_json[ss.str()]["ecc"] = true; + otp_json[ss.str()]["value"][i] = aes_key_share.bytes[i]; + } + + // Add otp IV salt page + for (int i = 0; i < iv_salt.size(); ++i) { + std::stringstream ss; + ss << settings.encrypt.otp_key_page + 1 << ":0"; + otp_json[ss.str()]["ecc"] = true; + otp_json[ss.str()]["value"][i] = iv_salt[i]; + } + #endif + + // Add page locks to prevent BL and NS access, and only allow S reads + { + std::stringstream ss; + ss << "PAGE" << settings.encrypt.otp_key_page << "_LOCK1"; + otp_json[ss.str()] = "0x3d3d3d"; + ss.str(string()); + ss << "PAGE" << settings.encrypt.otp_key_page + 1 << "_LOCK1"; + otp_json[ss.str()] = "0x3d3d3d"; + ss.str(string()); + ss << "PAGE" << settings.encrypt.otp_key_page + 2 << "_LOCK1"; + otp_json[ss.str()] = "0x3d3d3d"; + } + + *json_out << std::setw(4) << otp_json << std::endl; + json_out->close(); + } + + return false; +} bool seal_command::execute(device_map &devices) { bool isElf = false; @@ -5649,26 +6101,6 @@ std::map, otp_match> filter_otp(std::vector return matches; } -// todo we could make this popcount at the cost of having this not be Armv6m or adding the popcount instruction to varmulet for bootrom -static uint32_t even_parity(uint32_t input) { - return __builtin_popcount(input) & 1; -} - -// In: 16-bit unsigned integer. Out: 22-bit unsigned integer. 
-uint32_t __noinline otp_calculate_ecc(uint16_t x) { - // Source: db_shf40_ap_ab.pdf, page 25, "TABLE 9: PARITY BIT GENERATION MAP - // FOR 16 BIT USER DATA (X24 SHF MACROCELL)" - // https://drive.google.com/drive/u/1/folders/1jgU3tZt2BDeGkWUFhi6KZAlaYUpGrFaG - uint32_t p0 = even_parity(x & 0b1010110101011011); - uint32_t p1 = even_parity(x & 0b0011011001101101); - uint32_t p2 = even_parity(x & 0b1100011110001110); - uint32_t p3 = even_parity(x & 0b0000011111110000); - uint32_t p4 = even_parity(x & 0b1111100000000000); - uint32_t p5 = even_parity(x) ^ p0 ^ p1 ^ p2 ^ p3 ^ p4; - uint32_t p = p0 | (p1 << 1) | (p2 << 2) | (p3 << 3) | (p4 << 4) | (p5 << 5); - return x | (p << 16); -} - #if HAS_LIBUSB static void hack_init_otp_regs(picoboot::connection& con) { // build map of OTP regs by offset @@ -5701,7 +6133,7 @@ bool otp_get_command::execute(device_map &devices) { if (m.reg_row / OTP_PAGE_ROWS != last_page) { // todo pre-check page lock struct picoboot_otp_cmd otp_cmd; - if (m.reg_row / OTP_PAGE_ROWS >= 62) { + if (m.reg_row / OTP_PAGE_ROWS >= OTP_PAGE_COUNT - OTP_SPECIAL_PAGES) { // Read individual rows for lock words otp_cmd.wRow = m.reg_row; otp_cmd.wRowCount = 1; @@ -5855,20 +6287,59 @@ bool otp_dump_command::execute(device_map &devices) { auto con = get_single_rp2350_bootsel_device_connection(devices, false); // todo pre-check page lock struct picoboot_otp_cmd otp_cmd; - otp_cmd.wRow = 0; - otp_cmd.wRowCount = OTP_ROW_COUNT; otp_cmd.bEcc = settings.otp.ecc && !settings.otp.raw; vector raw_buffer; - raw_buffer.resize(otp_cmd.wRowCount * (otp_cmd.bEcc ? 2 : 4)); + uint8_t row_size = otp_cmd.bEcc ? 
2 : 4; + raw_buffer.resize(OTP_ROW_COUNT * row_size); picoboot_memory_access raw_access(con); - con.otp_read(&otp_cmd, raw_buffer.data(), raw_buffer.size()); + std::map page_errors; + std::map row_errors; + + // Read most pages by page, as permissions are per page + otp_cmd.wRowCount = OTP_PAGE_ROWS; + for (int i=0; i < OTP_PAGE_COUNT - OTP_SPECIAL_PAGES; i++) { + otp_cmd.wRow = i * OTP_PAGE_ROWS; + try { + con.otp_read(&otp_cmd, raw_buffer.data() + i*(raw_buffer.size() / OTP_PAGE_COUNT), raw_buffer.size() / OTP_PAGE_COUNT); + } catch (picoboot::command_failure& e) { + if (e.get_code() == PICOBOOT_NOT_PERMITTED) { + page_errors[i] = e.what(); + } else { + throw e; + } + } + } + + // Read special pages by row, as permissions are special + otp_cmd.wRowCount = 1; + for (int i=(OTP_PAGE_COUNT - OTP_SPECIAL_PAGES) * OTP_PAGE_ROWS; i < OTP_PAGE_COUNT * OTP_PAGE_ROWS; i++) { + otp_cmd.wRow = i; + try { + con.otp_read(&otp_cmd, raw_buffer.data() + i * row_size, row_size); + } catch (picoboot::command_failure& e) { + if (e.get_code() == PICOBOOT_NOT_PERMITTED) { + row_errors[i] = e.what(); + } else { + throw e; + } + } + } + fos.first_column(0); char buf[256]; for(int i=0;i