Skip to content

Commit e572148

Browse files
authored
VSI archive (/vsizip/, /vsiar/) performance improvements with large number of files (OSGeo#12939)
Traversal of large archives (many entries, especially in a large hierarchy) can be slow. This change massively improves the performance of random lookups by path within an archive, as well as of traversal using VSIReadDirRecursive. The main change is to avoid a full traversal of the VSIArchiveContent::entries array on each path lookup. I achieved this by building an index that maps each directory to the indices of its children in the entries list, and using that index to speed up lookups by path. This also massively improves the performance of VSIReadDirRecursive, since it works by calling ReadDirEx for each subdirectory (which previously meant looking at all entries on every call). ReadDirEx now uses the directory index to jump immediately to the children of the requested directory in the entries list, without visiting other entries. On my laptop, VSIReadDirRecursive on a zip file containing 600 directories, each containing 600 files, takes 728 seconds with previous GDAL master and 4.2 seconds after this change.
1 parent f9bc129 commit e572148

File tree

2 files changed

+73
-40
lines changed

2 files changed

+73
-40
lines changed

port/cpl_vsi_virtual.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,9 @@ typedef struct
581581
GIntBig nModifiedTime;
582582
} VSIArchiveEntry;
583583

584+
// Store list of child indices for each directory
585+
using DirectoryChildren = std::vector<int>;
586+
584587
class VSIArchiveContent
585588
{
586589
public:
@@ -589,7 +592,10 @@ class VSIArchiveContent
589592
int nEntries = 0;
590593
VSIArchiveEntry *entries = nullptr;
591594

595+
std::map<std::string, DirectoryChildren> dirIndex{};
596+
592597
VSIArchiveContent() = default;
598+
593599
~VSIArchiveContent();
594600

595601
private:

port/cpl_vsil_abstract_archive.cpp

Lines changed: 67 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "cpl_port.h"
1414
#include "cpl_vsi_virtual.h"
1515

16+
#include <algorithm>
1617
#include <cstring>
1718
#include <fcntl.h>
1819
#include <ctime>
@@ -128,6 +129,21 @@ static CPLString GetStrippedFilename(const CPLString &osFileName, bool &bIsDir)
128129
return osRet;
129130
}
130131

132+
/************************************************************************/
133+
/* BuildDirectoryIndex() */
134+
/************************************************************************/
135+
136+
// Rebuild content->dirIndex from scratch: for every entry of the archive,
// record its position in content->entries under the key returned by
// CPLGetPathSafe() for the entry's file name (i.e. its parent path).
// Any previously stored index is discarded first.
static void BuildDirectoryIndex(VSIArchiveContent *content)
{
    auto &index = content->dirIndex;
    index.clear();

    const int entryCount = content->nEntries;
    for (int entryIdx = 0; entryIdx < entryCount; ++entryIdx)
    {
        // operator[] creates an empty child list the first time a given
        // parent path is encountered.
        const std::string parentPath =
            CPLGetPathSafe(content->entries[entryIdx].fileName);
        index[parentPath].push_back(entryIdx);
    }
}
146+
131147
/************************************************************************/
132148
/* GetContentOfArchive() */
133149
/************************************************************************/
@@ -267,6 +283,9 @@ VSIArchiveFilesystemHandler::GetContentOfArchive(const char *archiveFilename,
267283
if (bMustClose)
268284
delete (poReader);
269285

286+
// Build directory index for fast lookups
287+
BuildDirectoryIndex(content);
288+
270289
return content;
271290
}
272291

@@ -284,13 +303,32 @@ int VSIArchiveFilesystemHandler::FindFileInArchive(
284303
const VSIArchiveContent *content = GetContentOfArchive(archiveFilename);
285304
if (content)
286305
{
287-
for (int i = 0; i < content->nEntries; i++)
306+
std::string parentDir;
307+
const char *lastSlash = strrchr(fileInArchiveName, '/');
308+
if (!lastSlash)
309+
lastSlash = strrchr(fileInArchiveName, '\\');
310+
311+
if (lastSlash)
312+
{
313+
parentDir =
314+
std::string(fileInArchiveName, lastSlash - fileInArchiveName);
315+
}
316+
// else: file is in root directory (empty parentDir)
317+
318+
// Use directory index to search within parent directory's children
319+
auto dirIter = content->dirIndex.find(parentDir);
320+
if (dirIter != content->dirIndex.end())
288321
{
289-
if (strcmp(fileInArchiveName, content->entries[i].fileName) == 0)
322+
const std::vector<int> &childIndices = dirIter->second;
323+
for (int childIdx : childIndices)
290324
{
291-
if (archiveEntry)
292-
*archiveEntry = &content->entries[i];
293-
return TRUE;
325+
if (strcmp(content->entries[childIdx].fileName,
326+
fileInArchiveName) == 0)
327+
{
328+
if (archiveEntry)
329+
*archiveEntry = &content->entries[childIdx];
330+
return TRUE;
331+
}
294332
}
295333
}
296334
}
@@ -776,48 +814,37 @@ char **VSIArchiveFilesystemHandler::ReadDirEx(const char *pszDirname,
776814
#ifdef DEBUG_VERBOSE
777815
CPLDebug("VSIArchive", "Read dir %s", pszDirname);
778816
#endif
779-
for (int i = 0; i < content->nEntries; i++)
817+
818+
std::string searchDir = lenInArchiveSubDir > 0
819+
? std::string(osInArchiveSubDir)
820+
: std::string("");
821+
822+
// Use directory index to find the list of children for this directory
823+
auto dirIter = content->dirIndex.find(searchDir);
824+
if (dirIter == content->dirIndex.end())
780825
{
781-
const char *fileName = content->entries[i].fileName;
782-
/* Only list entries at the same level of inArchiveSubDir */
783-
if (lenInArchiveSubDir != 0 &&
784-
strncmp(fileName, osInArchiveSubDir, lenInArchiveSubDir) == 0 &&
785-
IsEitherSlash(fileName[lenInArchiveSubDir]) &&
786-
fileName[lenInArchiveSubDir + 1] != 0)
787-
{
788-
const char *slash = strchr(fileName + lenInArchiveSubDir + 1, '/');
789-
if (slash == nullptr)
790-
slash = strchr(fileName + lenInArchiveSubDir + 1, '\\');
791-
if (slash == nullptr || slash[1] == 0)
792-
{
793-
char *tmpFileName = CPLStrdup(fileName);
794-
if (slash != nullptr)
795-
{
796-
tmpFileName[strlen(tmpFileName) - 1] = 0;
797-
}
798-
#ifdef DEBUG_VERBOSE
799-
CPLDebug("VSIArchive", "Add %s as in directory %s",
800-
tmpFileName + lenInArchiveSubDir + 1, pszDirname);
801-
#endif
802-
oDir.AddString(tmpFileName + lenInArchiveSubDir + 1);
803-
CPLFree(tmpFileName);
804-
}
805-
}
806-
else if (lenInArchiveSubDir == 0 && strchr(fileName, '/') == nullptr &&
807-
strchr(fileName, '\\') == nullptr)
826+
// Directory not found in index - no children
827+
CPLFree(archiveFilename);
828+
return oDir.StealList();
829+
}
830+
const std::vector<int> &childIndices = dirIter->second;
831+
832+
// Scan the children of this directory
833+
for (int childIdx : childIndices)
834+
{
835+
const char *fileName = content->entries[childIdx].fileName;
836+
837+
const char *baseName = fileName;
838+
if (lenInArchiveSubDir != 0)
808839
{
809-
// Only list toplevel files and directories.
810-
#ifdef DEBUG_VERBOSE
811-
CPLDebug("VSIArchive", "Add %s as in directory %s", fileName,
812-
pszDirname);
813-
#endif
814-
oDir.AddString(fileName);
840+
// Skip the directory prefix and slash to get just the child name
841+
baseName = fileName + lenInArchiveSubDir + 1;
815842
}
843+
oDir.AddStringDirectly(CPLStrdup(baseName));
816844

817845
if (nMaxFiles > 0 && oDir.Count() > nMaxFiles)
818846
break;
819847
}
820-
821848
CPLFree(archiveFilename);
822849
return oDir.StealList();
823850
}

0 commit comments

Comments
 (0)