Skip to content

Commit 7259e7f

Browse files
committed
Implement multithreaded file extraction with configurable core usage
Enhance extraction process for large files (e.g., Games)
1 parent 6c6dd3c commit 7259e7f

File tree

2 files changed

+165
-23
lines changed

2 files changed

+165
-23
lines changed

include/PyInstArchive.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,9 @@ class PyInstArchive {
5959
void parseTOC();
6060
void timeExtractionProcess(const std::string& outputDir);
6161
void displayInfo();
62-
void decompressData(const std::vector<char>& compressedData, std::vector<char>& decompressedData);
63-
64-
// New methods for multithreading
65-
void MultiThreaedFileExtract(const std::vector<CTOCEntry>& tocEntries, const std::string& outputDir);
62+
void MultiThreadedFileExtract(const std::vector<CTOCEntry>& tocEntries, const std::string& outputDir, size_t numThreads);
6663
void decompressAndExtractFile(const CTOCEntry& tocEntry, const std::string& outputDir, std::mutex& mtx, std::mutex& printMtx);
67-
64+
const std::vector<CTOCEntry>& getTOCList() const;
6865
private:
6966
std::mutex mtx; // Protects file pointer access
7067
std::mutex printMtx; // Protects console output

src/Pyinstaller.cpp

Lines changed: 163 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <future>
1212
#include <iomanip>
1313
#include <chrono>
14+
#include <Windows.h>
1415

1516
#include "../include/PyInstArchive.h"
1617
#include "../include/zlib.h"
@@ -61,6 +62,22 @@ void PyInstArchive::close() {
6162
}
6263
}
6364

65+
/**
66+
* @brief Retrieves the list of Table of Contents (TOC) entries from the PyInstaller archive.
67+
*
68+
* This method returns a constant reference to the vector containing the TOC entries.
69+
* The TOC entries represent individual files within the PyInstaller archive, including their
70+
* positions, compressed sizes, uncompressed sizes, compression flags, data types, and names.
71+
*
72+
* @return A constant reference to a vector of CTOCEntry objects representing the TOC entries.
73+
*
74+
* @note The vector returned by this method is read-only, ensuring the TOC entries cannot be modified
75+
* directly through the returned reference. To modify the TOC entries, use appropriate member functions.
76+
*/
77+
const std::vector<CTOCEntry>& PyInstArchive::getTOCList() const {
78+
return tocList;
79+
}
80+
6481
/**
6582
* @brief Checks if the file is a valid PyInstaller archive.
6683
*
@@ -147,6 +164,59 @@ uint32_t swapBytes(uint32_t value) {
147164
((value << 24) & 0xFF000000);
148165
}
149166

167+
/**
168+
* @brief Retrieves the number of physical CPU cores on the system.
169+
*
170+
* This function uses the Windows API to obtain information about the system's logical processors
171+
* and their relationship to physical CPU cores. It first determines the required buffer size for
172+
* the processor information, allocates the buffer, and then retrieves the information.
173+
*
174+
* The function iterates through the retrieved data to count the number of physical cores and
175+
* returns this count. If an error occurs at any stage, the function outputs an error message and
176+
* returns a default value of 1.
177+
*
178+
* @return The number of physical CPU cores on the system. If an error occurs, returns 1.
179+
*
180+
* @note This function is platform-specific and intended for use on Windows systems.
181+
* @note The function uses `malloc` for buffer allocation and `free` for deallocation.
182+
*/
183+
size_t getPhysicalCoreCount() {
184+
DWORD length = 0;
185+
// Initial call to get buffer size
186+
GetLogicalProcessorInformation(nullptr, &length);
187+
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
188+
std::cerr << "[!] Error: Unable to determine buffer size for processor information.\n";
189+
return 1; // Default to 1 if unable to determine
190+
}
191+
192+
// Allocate buffer for processor information
193+
SYSTEM_LOGICAL_PROCESSOR_INFORMATION* buffer = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION*>(malloc(length));
194+
if (buffer == nullptr) {
195+
std::cerr << "[!] Error: Memory allocation failed.\n";
196+
return 1;
197+
}
198+
199+
// Retrieve processor information
200+
if (!GetLogicalProcessorInformation(buffer, &length)) {
201+
std::cerr << "[!] Error: Unable to get logical processor information.\n";
202+
free(buffer);
203+
return 1;
204+
}
205+
206+
DWORD processorCoreCount = 0;
207+
DWORD count = length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
208+
209+
// Count the number of physical cores
210+
for (DWORD i = 0; i < count; ++i) {
211+
if (buffer[i].Relationship == RelationProcessorCore) {
212+
processorCoreCount++;
213+
}
214+
}
215+
216+
free(buffer);
217+
return static_cast<size_t>(processorCoreCount);
218+
}
219+
150220
/**
151221
* @brief Extracts and parses CArchive information from the PyInstaller file.
152222
*
@@ -308,9 +378,13 @@ void PyInstArchive::displayInfo() {
308378
* @param outputDir The directory where the extracted files will be saved.
309379
*/
310380
void PyInstArchive::timeExtractionProcess(const std::string& outputDir) {
381+
// Determine the number of physical cores to use as threads
382+
size_t numThreads = getPhysicalCoreCount();
383+
311384
auto start = std::chrono::high_resolution_clock::now();
312385

313-
MultiThreaedFileExtract(tocList, outputDir);
386+
// Call MultiThreadedFileExtract with the required arguments
387+
MultiThreadedFileExtract(tocList, outputDir, numThreads);
314388

315389
auto end = std::chrono::high_resolution_clock::now();
316390
std::chrono::duration<double> elapsed = end - start;
@@ -334,12 +408,30 @@ void PyInstArchive::timeExtractionProcess(const std::string& outputDir) {
334408
* @note The mutexes `mtx` and `printMtx` are used within the tasks to ensure thread-safe operations.
335409
* @note The ThreadPool destructor ensures all tasks are completed before the program continues.
336410
*/
337-
void PyInstArchive::MultiThreaedFileExtract(const std::vector<CTOCEntry>& tocEntries, const std::string& outputDir) {
338-
size_t numThreads = std::thread::hardware_concurrency();
339-
if (numThreads == 0) numThreads = 4; // Fallback if hardware_concurrency can't determine
411+
void PyInstArchive::MultiThreadedFileExtract(const std::vector<CTOCEntry>& tocEntries, const std::string& outputDir, size_t numThreads) {
412+
size_t maxCores = getPhysicalCoreCount(); // Function to get number of physical cores
413+
414+
// Validate user-specified number of threads
415+
if (numThreads == 0) {
416+
numThreads = maxCores;
417+
std::cout << "[*] Using all available physical cores: " << numThreads << "\n";
418+
}
419+
else {
420+
if (numThreads > maxCores) {
421+
std::cout << "[!] Specified number of cores (" << numThreads << ") exceeds available physical cores (" << maxCores << "). Using maximum available cores.\n";
422+
numThreads = maxCores;
423+
}
424+
else {
425+
std::cout << "[*] Using user-specified number of cores: " << numThreads << "\n";
426+
}
427+
}
428+
429+
if (numThreads == 0) numThreads = 1; // Ensure at least one thread
340430

431+
// Initialize ThreadPool with the specified number of threads
341432
ThreadPool pool(numThreads);
342433

434+
// Enqueue tasks
343435
for (const auto& tocEntry : tocEntries) {
344436
pool.enqueue([this, &tocEntry, &outputDir] {
345437
this->decompressAndExtractFile(tocEntry, outputDir, mtx, printMtx);
@@ -429,29 +521,81 @@ void PyInstArchive::decompressAndExtractFile(const CTOCEntry& tocEntry, const st
429521
}
430522

431523
/**
432-
* @brief Parses command-line arguments for interacting with a PyInstaller archive.
524+
* @brief Parses command-line arguments and initiates the archive processing.
433525
*
434-
* This method processes the command-line arguments, checks if the required parameters
435-
* are provided, and then opens the specified PyInstaller archive. It can either display
436-
* information about the archive or extract its files to the specified output directory.
526+
* This function handles the parsing of command-line arguments to determine the appropriate
527+
* operation to perform on the PyInstaller archive. It supports specifying the number of cores
528+
* to use for extraction, the command to execute (either to display information or to extract files),
529+
* the path to the archive, and the optional output directory.
530+
*
531+
* Supported arguments:
532+
* - `-cores N`: Specifies the number of cores to use for the extraction process. If not provided or set to 0, all available physical cores are used.
533+
* - `-i`: Command to display information about the archive (filenames, sizes).
534+
* - `-u`: Command to extract files from the archive.
535+
* - `<archive_path>`: The path to the PyInstaller archive file.
536+
* - `[output_dir]`: Optional output directory where the extracted files will be saved. Defaults to "unpacked".
537+
*
538+
* Example usage:
539+
* - `unpack.exe -cores 4 -u archive_file.exe output_dir`
540+
* - `unpack.exe -i archive_file.exe`
437541
*
438542
* @param argc The number of command-line arguments.
439543
* @param argv The array of command-line arguments.
440-
*
441-
* @note The command must be either "-i" to display archive information or "-u" to extract files.
442-
* The archive path is required, and an optional output directory can be specified.
443-
* @note If the output directory does not exist, it will be created automatically.
444-
* @note Errors are logged if any arguments are invalid or if the archive cannot be processed.
445544
*/
446545
void parseArgs(int argc, char* argv[]) {
546+
// Default values
547+
int numCores = 0; // 0 indicates 'use all available physical cores'
548+
std::string command;
549+
std::string archivePath;
550+
std::string outputDir = "unpacked"; // Default output directory
551+
int argIndex = 1;
552+
553+
// Check if there are enough arguments
447554
if (argc < 3) {
448-
std::cerr << "[!] Usage: " << argv[0] << " [-i | -u] <archive_path> [output_dir]" << std::endl;
555+
std::cerr << "[!] Usage: " << argv[0] << " [-cores N] [-i | -u] <archive_path> [output_dir]" << std::endl;
449556
exit(1);
450557
}
451558

452-
std::string command = argv[1]; // Command (-i or -u)
453-
std::string archivePath = argv[2]; // Archive file path
454-
std::string outputDir = (argc > 3) ? argv[3] : "unpacked"; // Output directory (default to "output")
559+
// Parse arguments
560+
while (argIndex < argc) {
561+
std::string arg = argv[argIndex];
562+
563+
if (arg == "-cores") {
564+
// Handle the -cores argument
565+
argIndex++;
566+
if (argIndex >= argc) {
567+
std::cerr << "[!] Error: Expected number after -cores" << std::endl;
568+
exit(1);
569+
}
570+
numCores = atoi(argv[argIndex]);
571+
if (numCores <= 0) {
572+
std::cerr << "[!] Invalid number of cores specified. Using all available physical cores." << std::endl;
573+
numCores = 0;
574+
}
575+
argIndex++;
576+
}
577+
else if (arg == "-i" || arg == "-u") {
578+
// Handle the command (-i or -u)
579+
command = arg;
580+
argIndex++;
581+
}
582+
else if (archivePath.empty()) {
583+
// First argument that's not an option is the archive path
584+
archivePath = arg;
585+
argIndex++;
586+
}
587+
else {
588+
// Optional output directory
589+
outputDir = arg;
590+
argIndex++;
591+
}
592+
}
593+
594+
// Validate required arguments
595+
if (command.empty() || archivePath.empty()) {
596+
std::cerr << "[!] Usage: " << argv[0] << " [-cores N] [-i | -u] <archive_path> [output_dir]" << std::endl;
597+
exit(1);
598+
}
455599

456600
// Check if the output directory exists, create it if it doesn't
457601
if (!std::filesystem::exists(outputDir)) {
@@ -479,7 +623,8 @@ void parseArgs(int argc, char* argv[]) {
479623
archive.displayInfo(); // Display information about the archive (filenames, sizes)
480624
}
481625
else if (command == "-u") {
482-
archive.timeExtractionProcess(outputDir); // Extract files to the specified directory
626+
archive.parseTOC(); // Parse the Table of Contents before extraction
627+
archive.MultiThreadedFileExtract(archive.getTOCList(), outputDir, static_cast<size_t>(numCores));
483628
}
484629
else {
485630
std::cerr << "[!] Unknown command: " << command << std::endl;

0 commit comments

Comments
 (0)