Skip to content

Commit 6c6dd3c

Browse files
committed
Implement multithreaded file extraction using ThreadPool
update
1 parent 25b766d commit 6c6dd3c

File tree

7 files changed

+254
-65
lines changed

7 files changed

+254
-65
lines changed

PyInstaller-C++.vcxproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,13 @@
2020
</ItemGroup>
2121
<ItemGroup>
2222
<ClInclude Include="include\PyInstArchive.h" />
23+
<ClInclude Include="include\ThreadPool.h" />
2324
<ClInclude Include="include\zconf.h" />
2425
<ClInclude Include="include\zlib.h" />
2526
</ItemGroup>
2627
<ItemGroup>
2728
<ClCompile Include="src\Pyinstaller.cpp" />
29+
<ClCompile Include="src\ThreadPool.cpp" />
2830
</ItemGroup>
2931
<PropertyGroup Label="Globals">
3032
<VCProjectVersion>17.0</VCProjectVersion>

PyInstaller-C++.vcxproj.filters

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,16 @@
2424
<ClInclude Include="include\zlib.h">
2525
<Filter>Header Files</Filter>
2626
</ClInclude>
27+
<ClInclude Include="include\ThreadPool.h">
28+
<Filter>Header Files</Filter>
29+
</ClInclude>
2730
</ItemGroup>
2831
<ItemGroup>
2932
<ClCompile Include="src\Pyinstaller.cpp">
3033
<Filter>Source Files</Filter>
3134
</ClCompile>
35+
<ClCompile Include="src\ThreadPool.cpp">
36+
<Filter>Source Files</Filter>
37+
</ClCompile>
3238
</ItemGroup>
3339
</Project>

PyInstaller-C++.vcxproj.user

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<PropertyGroup />
4+
</Project>

include/PyInstArchive.h

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,33 +6,42 @@
66
#include <vector>
77
#include <string>
88
#include <cstring>
9-
#include <cstdint>
9+
#include <cstdint>
1010
#include <mutex>
11+
#include <thread>
12+
#include <functional>
13+
#include <condition_variable>
14+
#include <queue>
15+
#include <filesystem>
16+
17+
#include "../include/ThreadPool.h"
18+
1119

1220
/**
 * @brief One entry of the archive's Table of Contents (TOC).
 *
 * Plain value type: the fields are public and are filled in once by the
 * constructor. The accessors exist so callers do not have to reach into the
 * raw fields.
 */
struct CTOCEntry {
    uint32_t position;          // Position of the entry (used as a seek offset from the start of the archive stream)
    uint32_t cmprsdDataSize;    // Compressed data size
    uint32_t uncmprsdDataSize;  // Uncompressed data size
    uint8_t cmprsFlag;          // Compression flag (non-zero means the payload is compressed)
    char typeCmprsData;         // Type of compressed data
    std::string name;           // Name of the entry

    /**
     * @brief Builds an entry from its raw TOC fields.
     *
     * @param pos          Position of the entry.
     * @param cmprsdSize   Compressed data size.
     * @param uncmprsdSize Uncompressed data size.
     * @param flag         Compression flag; non-zero marks the entry as compressed.
     * @param type         Type of compressed data.
     * @param n            Name of the entry (copied).
     */
    CTOCEntry(uint32_t pos, uint32_t cmprsdSize, uint32_t uncmprsdSize, uint8_t flag, char type, const std::string& n)
        : position(pos),
          cmprsdDataSize(cmprsdSize),
          uncmprsdDataSize(uncmprsdSize),
          cmprsFlag(flag),
          typeCmprsData(type),
          name(n) {
    }

    // Getters for entry details

    /// @return Position of the entry.
    [[nodiscard]] uint32_t getPosition() const {
        return position;
    }

    /// @return Size of the stored (possibly compressed) payload.
    [[nodiscard]] uint32_t getCompressedDataSize() const {
        return cmprsdDataSize;
    }

    /// @return Size of the payload after decompression, as recorded in the entry.
    [[nodiscard]] uint32_t getUncompressedDataSize() const {
        return uncmprsdDataSize;
    }

    /// @return Name of the entry.
    [[nodiscard]] const std::string& getName() const {
        return name;
    }

    /// @return true when the compression flag is non-zero.
    [[nodiscard]] bool isCompressed() const {
        return cmprsFlag != 0;
    }
};
3847

@@ -49,28 +58,32 @@ class PyInstArchive {
4958
bool getCArchiveInfo();
5059
void parseTOC();
5160
void timeExtractionProcess(const std::string& outputDir);
52-
void decompressAndExtractFile(const CTOCEntry& tocEntry, const std::string& outputDir);
5361
void displayInfo();
5462
void decompressData(const std::vector<char>& compressedData, std::vector<char>& decompressedData);
5563

64+
// New methods for multithreading
65+
void MultiThreaedFileExtract(const std::vector<CTOCEntry>& tocEntries, const std::string& outputDir);
66+
void decompressAndExtractFile(const CTOCEntry& tocEntry, const std::string& outputDir, std::mutex& mtx, std::mutex& printMtx);
67+
5668
private:
57-
std::mutex mtx;
58-
std::mutex printMtx; // Mutex for synchronizing print statements
59-
std::string filePath; // Path to the archive file
60-
std::ifstream fPtr; // File stream for reading the archive
61-
uint64_t fileSize; // Size of the file
62-
uint64_t cookiePos; // Position of the cookie
63-
uint64_t overlayPos; // Position of the overlay
64-
uint64_t overlaySize; // Size of the overlay
65-
uint64_t tableOfContentsPos; // Position of the TOC
66-
uint64_t tableOfContentsSize; // Size of the TOC
67-
uint8_t pyinstVer; // PyInstaller version
68-
uint8_t pymaj; // Python major version
69-
uint8_t pymin; // Python minor version
70-
std::vector<CTOCEntry> tocList; // List of TOC entries
71-
uint32_t lengthofPackage; // Length of the package
72-
uint32_t toc; // Table of contents
73-
uint32_t tocLen; // Length of the table of contents
69+
std::mutex mtx; // Protects file pointer access
70+
std::mutex printMtx; // Protects console output
71+
72+
std::string filePath; // Path to the archive file
73+
std::ifstream fPtr; // File stream for reading the archive
74+
uint64_t fileSize; // Size of the file
75+
uint64_t cookiePos; // Position of the cookie
76+
uint64_t overlayPos; // Position of the overlay
77+
uint64_t overlaySize; // Size of the overlay
78+
uint64_t tableOfContentsPos; // Position of the TOC
79+
uint64_t tableOfContentsSize; // Size of the TOC
80+
uint8_t pyinstVer; // PyInstaller version
81+
uint8_t pymaj; // Python major version
82+
uint8_t pymin; // Python minor version
83+
std::vector<CTOCEntry> tocList; // List of TOC entries
84+
uint32_t lengthofPackage; // Length of the package
85+
uint32_t toc; // Table of contents
86+
uint32_t tocLen; // Length of the table of contents
7487

7588
// Constants for PyInstaller cookie sizes
7689
static const uint8_t PYINST20_COOKIE_SIZE = 24;

include/ThreadPool.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#ifndef THREADPOOL_H
#define THREADPOOL_H

// Include everything this header names so it compiles regardless of what the
// including file pulled in first.
#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

/**
 * @brief Pool of worker threads fed from a queue of void() tasks.
 *
 * Only the interface is declared here; the member functions are defined out
 * of line. NOTE(review): the behaviour of the constructor, destructor and
 * enqueue() (thread start-up, draining the queue on destruction) lives in the
 * implementation file and should be confirmed there.
 */
class ThreadPool {
public:
    /**
     * @brief Creates the pool.
     * @param numThreads Requested number of worker threads.
     *
     * `explicit` so that an integer is never silently converted into a pool.
     */
    explicit ThreadPool(size_t numThreads);
    ~ThreadPool();

    // The pool owns threads and a mutex, so it cannot be copied; spell that
    // out instead of relying on the implicitly deleted operations.
    ThreadPool(const ThreadPool&) = delete;
    ThreadPool& operator=(const ThreadPool&) = delete;

    /**
     * @brief Queues a task for execution by the pool.
     * @param task Callable taking no arguments and returning nothing.
     */
    void enqueue(std::function<void()> task);

private:
    // Worker threads
    std::vector<std::thread> workers;
    // Task queue
    std::queue<std::function<void()>> tasks;

    // Synchronization primitives
    std::mutex queueMutex;
    std::condition_variable condition;
    // Shutdown flag; given a defined value even before the constructor body runs.
    bool stop = false;
};

#endif // THREADPOOL_H

src/Pyinstaller.cpp

Lines changed: 75 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,14 @@
99
#include <queue>
1010
#include <condition_variable>
1111
#include <future>
12+
#include <iomanip>
13+
#include <chrono>
1214

1315
#include "../include/PyInstArchive.h"
1416
#include "../include/zlib.h"
1517

18+
19+
1620
/**
1721
* @brief The magic string used to identify PyInstaller archives.
1822
*
@@ -304,89 +308,123 @@ void PyInstArchive::displayInfo() {
304308
* @param outputDir The directory where the extracted files will be saved.
305309
*/
306310
void PyInstArchive::timeExtractionProcess(const std::string& outputDir) {
    // Measure with steady_clock: it is monotonic, so the reported duration
    // cannot be distorted (or go negative) if the system clock is adjusted
    // while extraction runs. high_resolution_clock gives no such guarantee.
    const auto start = std::chrono::steady_clock::now();

    // Extract every entry of the parsed TOC into outputDir.
    MultiThreaedFileExtract(tocList, outputDir);

    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;

    std::cout << "[*] Extraction completed in " << elapsed.count() << " seconds.\n";
}
325320

326321
/**
327-
* @brief Decompresses and extracts a file from the PyInstaller archive to the specified output directory.
322+
* @brief Decompresses and extracts all files from the PyInstaller archive using multithreading.
323+
*
324+
* The `MultiThreaedFileExtract` method initializes a thread pool and enqueues tasks to decompress
325+
* and extract each file specified in the Table of Contents (TOC) entries. It leverages multithreading
326+
* to improve extraction performance by processing multiple files concurrently.
328327
*
329-
* This function reads the compressed data of the file from the archive, decompresses it if necessary,
330-
* and writes the resulting data to a file in the specified output directory. The file extraction process
331-
* is thread-safe, utilizing mutexes to ensure proper synchronization of file reading and console output.
328+
* @param tocEntries A vector of TOC entries representing the files to extract from the archive.
329+
* @param outputDir The directory where the extracted files will be saved.
332330
*
333-
* @param tocEntry The Table of Contents (TOC) entry that contains metadata about the file to be extracted.
331+
* @note The function creates a thread pool with a number of threads equal to the hardware concurrency.
332+
* If the hardware concurrency cannot be determined, it defaults to 4 threads.
333+
* @note Each TOC entry is processed by a separate task that calls `decompressAndExtractFile`.
334+
* @note The mutexes `mtx` and `printMtx` are used within the tasks to ensure thread-safe operations.
335+
* @note The ThreadPool destructor ensures all tasks are completed before the program continues.
336+
*/
337+
void PyInstArchive::MultiThreaedFileExtract(const std::vector<CTOCEntry>& tocEntries, const std::string& outputDir) {
338+
size_t numThreads = std::thread::hardware_concurrency();
339+
if (numThreads == 0) numThreads = 4; // Fallback if hardware_concurrency can't determine
340+
341+
ThreadPool pool(numThreads);
342+
343+
for (const auto& tocEntry : tocEntries) {
344+
pool.enqueue([this, &tocEntry, &outputDir] {
345+
this->decompressAndExtractFile(tocEntry, outputDir, mtx, printMtx);
346+
});
347+
}
348+
}
349+
350+
/**
351+
* @brief Decompresses and extracts a single file from the PyInstaller archive.
352+
*
353+
* This method handles the decompression and extraction of a single file specified by the
354+
* Table of Contents (TOC) entry. It reads the compressed data from the archive file,
355+
* decompresses it if necessary, and writes the output to the specified directory,
356+
* preserving the file structure. Thread safety is ensured through mutex locks for file
357+
* access and console output, allowing concurrent execution in a multithreaded environment.
358+
*
359+
* @param tocEntry The Table of Contents entry representing the file to extract.
334360
* @param outputDir The directory where the extracted file will be saved.
361+
* @param mtx Mutex to synchronize access to the file stream `fPtr` for reading.
362+
* @param printMtx Mutex to synchronize console output to prevent message interleaving.
363+
*
364+
* @note The function checks if the data is compressed and handles decompression using zlib.
365+
* @note Any errors during reading, decompression, or writing are logged to the console.
366+
* @note The function assumes that the output directory exists or can be created.
367+
* @note This method is designed to be thread-safe and can be called concurrently by multiple threads.
335368
*/
336-
void PyInstArchive::decompressAndExtractFile(const CTOCEntry& tocEntry, const std::string& outputDir) {
369+
void PyInstArchive::decompressAndExtractFile(const CTOCEntry& tocEntry, const std::string& outputDir, std::mutex& mtx, std::mutex& printMtx) {
337370
std::vector<char> compressedData;
371+
372+
// Read Compressed Data with File Lock
338373
{
339374
std::lock_guard<std::mutex> lock(mtx);
340375
fPtr.seekg(tocEntry.position, std::ios::beg);
341376
compressedData.resize(tocEntry.getCompressedDataSize());
342377
fPtr.read(compressedData.data(), tocEntry.getCompressedDataSize());
343378
}
344379

345-
// Decompress data
380+
// Decompress Data
346381
std::vector<char> decompressedData;
347382
if (tocEntry.isCompressed()) {
348383
decompressedData.resize(tocEntry.uncmprsdDataSize);
349384

350385
z_stream strm = {};
351-
strm.avail_in = tocEntry.getCompressedDataSize();
386+
strm.avail_in = static_cast<uInt>(tocEntry.getCompressedDataSize());
352387
strm.next_in = reinterpret_cast<Bytef*>(compressedData.data());
353-
strm.avail_out = tocEntry.uncmprsdDataSize;
388+
strm.avail_out = static_cast<uInt>(tocEntry.uncmprsdDataSize);
354389
strm.next_out = reinterpret_cast<Bytef*>(decompressedData.data());
355390

356391
if (inflateInit(&strm) != Z_OK) {
357-
std::cerr << "[!] Error: Could not initialize zlib for decompression" << std::endl;
392+
std::lock_guard<std::mutex> lock(printMtx);
393+
std::cerr << "[!] Error: Could not initialize zlib for decompression\n";
358394
return;
359395
}
360396

361397
int result = inflate(&strm, Z_FINISH);
362398
inflateEnd(&strm);
363399

364400
if (result != Z_STREAM_END) {
365-
std::cerr << "[!] Error: Decompression failed for " << tocEntry.getName() << std::endl;
401+
std::lock_guard<std::mutex> lock(printMtx);
402+
std::cerr << "[!] Error: Decompression failed for " << tocEntry.getName() << "\n";
366403
return;
367404
}
368405
}
369406
else {
370-
decompressedData = compressedData;
407+
decompressedData = std::move(compressedData);
371408
}
372409

373-
// Extract file
410+
// Extract File
374411
std::filesystem::path outputFilePath = std::filesystem::path(outputDir) / tocEntry.getName();
375412
std::filesystem::create_directories(outputFilePath.parent_path());
376413

377-
std::ofstream outFile(outputFilePath, std::ios::binary);
378-
if (!outFile.is_open()) {
379-
std::cerr << "[!] Error: Could not open output file " << outputFilePath << std::endl;
380-
return;
414+
{
415+
std::ofstream outFile(outputFilePath, std::ios::binary);
416+
if (!outFile.is_open()) {
417+
std::lock_guard<std::mutex> lock(printMtx);
418+
std::cerr << "[!] Error: Could not open output file " << outputFilePath << "\n";
419+
return;
420+
}
421+
outFile.write(decompressedData.data(), decompressedData.size());
381422
}
382423

383-
outFile.write(decompressedData.data(), decompressedData.size());
384-
outFile.close();
385-
386-
// Synchronize print statements
424+
// Log Extraction Success
387425
{
388-
std::lock_guard<std::mutex> printLock(printMtx);
389-
std::cout << "[+] Extracted: " << tocEntry.getName() << " (" << decompressedData.size() << " bytes)" << std::endl;
426+
std::lock_guard<std::mutex> lock(printMtx);
427+
std::cout << "[+] Extracted: " << tocEntry.getName() << " (" << decompressedData.size() << " bytes)\n";
390428
}
391429
}
392430

0 commit comments

Comments
 (0)