Skip to content

Commit ad11297

Browse files
committed
gdal vector partition: add --scheme=hive|flat and --pattern arguments
1 parent 84ed507 commit ad11297

File tree

5 files changed

+393
-65
lines changed

5 files changed

+393
-65
lines changed

apps/gdalalg_vector_partition.cpp

Lines changed: 233 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ constexpr int DIRECTORY_CREATION_MODE = 0755;
2828

2929
constexpr const char *NULL_MARKER = "__HIVE_DEFAULT_PARTITION__";
3030

31+
constexpr const char *DEFAULT_PATTERN_HIVE = "part_%010d";
32+
constexpr const char *DEFAULT_PATTERN_FLAT = "{LAYER_NAME}_{FIELD_VALUE}_%010d";
33+
34+
constexpr char DIGIT_ZERO = '0';
35+
3136
/************************************************************************/
3237
/* GetConstructorOptions() */
3338
/************************************************************************/
@@ -79,6 +84,74 @@ GDALVectorPartitionAlgorithm::GDALVectorPartitionAlgorithm(bool standaloneStep)
7984

8085
AddArg("field", 0, _("Field(s) on which to partition"), &m_fields)
8186
.SetRequired();
87+
AddArg("scheme", 0, _("Partitioning scheme"), &m_scheme)
88+
.SetChoices(SCHEME_HIVE, SCHEME_FLAT)
89+
.SetDefault(m_scheme);
90+
AddArg("pattern", 0,
91+
_("Filename pattern ('part_%010d' for scheme=hive, "
92+
"'{LAYER_NAME}_{FIELD_VALUE}_%010d' for scheme=flat)"),
93+
&m_pattern)
94+
.SetMinCharCount(1)
95+
.AddValidationAction(
96+
[this]()
97+
{
98+
if (!m_pattern.empty())
99+
{
100+
const auto nPercentPos = m_pattern.find('%');
101+
if (nPercentPos == std::string::npos)
102+
{
103+
ReportError(CE_Failure, CPLE_IllegalArg, "%s",
104+
"Missing '%' character in pattern");
105+
return false;
106+
}
107+
if (nPercentPos + 1 < m_pattern.size() &&
108+
m_pattern.find('%', nPercentPos + 1) !=
109+
std::string::npos)
110+
{
111+
ReportError(
112+
CE_Failure, CPLE_IllegalArg, "%s",
113+
"A single '%' character is expected in pattern");
114+
return false;
115+
}
116+
bool percentFound = false;
117+
for (size_t i = nPercentPos + 1; i < m_pattern.size(); ++i)
118+
{
119+
if (m_pattern[i] >= DIGIT_ZERO && m_pattern[i] <= '9')
120+
{
121+
// ok
122+
}
123+
else if (m_pattern[i] == 'd')
124+
{
125+
percentFound = true;
126+
break;
127+
}
128+
else
129+
{
130+
break;
131+
}
132+
}
133+
if (!percentFound)
134+
{
135+
ReportError(
136+
CE_Failure, CPLE_IllegalArg, "%s",
137+
"pattern value must include a single "
138+
"'%[0]?[1-9]?[0]?d' part number specification");
139+
return false;
140+
}
141+
m_partDigitCount =
142+
atoi(m_pattern.c_str() + nPercentPos + 1);
143+
if (m_partDigitCount > 10)
144+
{
145+
ReportError(CE_Failure, CPLE_IllegalArg,
146+
"Number of digits in part number "
147+
"specifiation should be in [1,10] range");
148+
return false;
149+
}
150+
m_partDigitLeadingZeroes =
151+
m_pattern[nPercentPos + 1] == DIGIT_ZERO;
152+
}
153+
return true;
154+
});
82155
AddArg("feature-limit", 0, _("Maximum number of features per file"),
83156
&m_featureLimit)
84157
.SetMinValueExcluded(0);
@@ -266,8 +339,6 @@ static size_t GetEstimatedFeatureSize(
266339

267340
constexpr int MIN_FILE_SIZE = 65536;
268341

269-
constexpr const char *FILENAME_PREFIX = "part_";
270-
271342
namespace
272343
{
273344
struct Layer
@@ -292,8 +363,10 @@ struct Layer
292363
static bool GetCurrentOutputLayer(
293364
GDALAlgorithm *const alg, const OGRFeatureDefn *const poSrcFeatureDefn,
294365
OGRLayer *const poSrcLayer, const std::string &osKey,
295-
const std::string &osLayerDir, const int featureLimit,
296-
const GIntBig maxFileSize, const bool omitPartitionedFields,
366+
const std::string &osLayerDir, const std::string &osScheme,
367+
const std::string &osPatternIn, bool partDigitLeadingZeroes,
368+
size_t partDigitCount, const int featureLimit, const GIntBig maxFileSize,
369+
const bool omitPartitionedFields,
297370
const std::vector<bool> &abPartitionedFields, const char *pszExtension,
298371
GDALDriver *const poOutDriver, const CPLStringList &datasetCreationOptions,
299372
const CPLStringList &layerCreationOptions,
@@ -303,6 +376,12 @@ static bool GetCurrentOutputLayer(
303376
lru11::Cache<std::string, std::shared_ptr<Layer>> &oCacheOutputLayer,
304377
std::shared_ptr<Layer> &outputLayer)
305378
{
379+
const std::string osPattern =
380+
!osPatternIn.empty() ? osPatternIn
381+
: osScheme == GDALVectorPartitionAlgorithm::SCHEME_HIVE
382+
? DEFAULT_PATTERN_HIVE
383+
: DEFAULT_PATTERN_FLAT;
384+
306385
bool bLimitReached = false;
307386
bool bOpenOrCreateNewFile = true;
308387
if (oCacheOutputLayer.tryGet(osKey, outputLayer))
@@ -335,13 +414,69 @@ static bool GetCurrentOutputLayer(
335414
outputLayer->bUseTransactions = bUseTransactions;
336415
}
337416

338-
const auto GetBasenameFromCounter = [](int nCounter)
339-
{ return CPLSPrintf("%s%010d", FILENAME_PREFIX, nCounter); };
417+
const auto SubstituteVariables = [&osKey, poSrcLayer](const std::string &s)
418+
{
419+
CPLString ret(s);
420+
ret.replaceAll("{LAYER_NAME}",
421+
PercentEncode(poSrcLayer->GetDescription()));
422+
423+
if (ret.find("{FIELD_VALUE}") != std::string::npos)
424+
{
425+
std::string fieldValue;
426+
const CPLStringList aosTokens(
427+
CSLTokenizeString2(osKey.c_str(), "/", 0));
428+
for (int i = 0; i < aosTokens.size(); ++i)
429+
{
430+
const CPLStringList aosFieldNameValue(
431+
CSLTokenizeString2(aosTokens[i], "=", 0));
432+
if (!fieldValue.empty())
433+
fieldValue += '_';
434+
fieldValue +=
435+
aosFieldNameValue.size() == 2
436+
? (strcmp(aosFieldNameValue[1], NULL_MARKER) == 0
437+
? std::string("__NULL__")
438+
: aosFieldNameValue[1])
439+
: std::string("__EMPTY__");
440+
}
441+
ret.replaceAll("{FIELD_VALUE}", fieldValue);
442+
}
443+
return ret;
444+
};
445+
446+
const auto nPercentPos = osPattern.find('%');
447+
CPLAssert(nPercentPos !=
448+
std::string::npos); // checked by validation action
449+
const std::string osPatternPrefix =
450+
SubstituteVariables(osPattern.substr(0, nPercentPos));
451+
const auto nAfterDPos = osPattern.find('d', nPercentPos + 1) + 1;
452+
const std::string osPatternSuffix =
453+
nAfterDPos < osPattern.size()
454+
? SubstituteVariables(osPattern.substr(nAfterDPos))
455+
: std::string();
456+
457+
const auto GetBasenameFromCounter = [partDigitCount, partDigitLeadingZeroes,
458+
&osPatternPrefix,
459+
&osPatternSuffix](int nCounter)
460+
{
461+
const std::string sCounter(CPLSPrintf("%d", nCounter));
462+
std::string s(osPatternPrefix);
463+
if (sCounter.size() < partDigitCount)
464+
{
465+
s += std::string(partDigitCount - sCounter.size(),
466+
partDigitLeadingZeroes ? DIGIT_ZERO : ' ');
467+
}
468+
s += sCounter;
469+
s += osPatternSuffix;
470+
return s;
471+
};
340472

341473
if (bOpenOrCreateNewFile)
342474
{
343-
const std::string osDatasetDir =
344-
CPLFormFilenameSafe(osLayerDir.c_str(), osKey.c_str(), nullptr);
475+
std::string osDatasetDir =
476+
osScheme == GDALVectorPartitionAlgorithm::SCHEME_HIVE
477+
? CPLFormFilenameSafe(osLayerDir.c_str(), osKey.c_str(),
478+
nullptr)
479+
: osLayerDir;
345480
outputLayer->nFeatureCount = 0;
346481

347482
bool bCreateNewFile = true;
@@ -367,31 +502,36 @@ static bool GetCurrentOutputLayer(
367502
}
368503

369504
int nMaxCounter = 0;
370-
auto psDir = VSIOpenDir(osDatasetDir.c_str(), 0, nullptr);
505+
std::unique_ptr<VSIDIR, decltype(&VSICloseDir)> psDir(
506+
VSIOpenDir(osDatasetDir.c_str(), 0, nullptr), VSICloseDir);
371507
if (psDir)
372508
{
373-
while (const auto *psEntry = VSIGetNextDirEntry(psDir))
509+
while (const auto *psEntry = VSIGetNextDirEntry(psDir.get()))
374510
{
375-
if (cpl::starts_with(std::string_view(psEntry->pszName),
376-
FILENAME_PREFIX))
511+
const std::string osName(
512+
CPLGetBasenameSafe(psEntry->pszName));
513+
if (cpl::starts_with(osName, osPatternPrefix) &&
514+
cpl::ends_with(osName, osPatternSuffix))
377515
{
378516
nMaxCounter = std::max(
379517
nMaxCounter,
380-
atoi(CPLGetBasenameSafe(psEntry->pszName +
381-
strlen(FILENAME_PREFIX))
518+
atoi(osName
519+
.substr(osPatternPrefix.size(),
520+
osName.size() -
521+
osPatternPrefix.size() -
522+
osPatternSuffix.size())
382523
.c_str()));
383524
}
384525
}
385-
VSICloseDir(psDir);
386526
}
387527

388528
if (nMaxCounter > 0)
389529
{
390530
outputLayer->nFileCounter = nMaxCounter;
391531

392532
const std::string osFilename = CPLFormFilenameSafe(
393-
osDatasetDir.c_str(), GetBasenameFromCounter(nMaxCounter),
394-
pszExtension);
533+
osDatasetDir.c_str(),
534+
GetBasenameFromCounter(nMaxCounter).c_str(), pszExtension);
395535
auto poDS = std::unique_ptr<GDALDataset>(GDALDataset::Open(
396536
osFilename.c_str(),
397537
GDAL_OF_VECTOR | GDAL_OF_UPDATE | GDAL_OF_VERBOSE_ERROR));
@@ -497,7 +637,7 @@ static bool GetCurrentOutputLayer(
497637

498638
const std::string osFilename = CPLFormFilenameSafe(
499639
osDatasetDir.c_str(),
500-
GetBasenameFromCounter(outputLayer->nFileCounter),
640+
GetBasenameFromCounter(outputLayer->nFileCounter).c_str(),
501641
pszExtension);
502642
outputLayer->poDS.reset(
503643
poOutDriver->Create(osFilename.c_str(), 0, 0, 0, GDT_Unknown,
@@ -630,7 +770,8 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
630770
return false;
631771
}
632772

633-
if (EQUAL(poOutDriver->GetDescription(), "PARQUET"))
773+
if (EQUAL(poOutDriver->GetDescription(), "PARQUET") &&
774+
m_scheme == SCHEME_HIVE)
634775
{
635776
// Required for Parquet Hive partitioning
636777
m_omitPartitionedFields = true;
@@ -686,41 +827,76 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
686827

687828
// Do a sanity check to verify that this looks like a directory
688829
// generated by partition
689-
auto psDir = VSIOpenDir(m_output.c_str(), -1, nullptr);
690-
if (psDir)
830+
831+
if (m_scheme == SCHEME_HIVE)
691832
{
692-
while (const auto *psEntry = VSIGetNextDirEntry(psDir))
833+
std::unique_ptr<VSIDIR, decltype(&VSICloseDir)> psDir(
834+
VSIOpenDir(m_output.c_str(), -1, nullptr), VSICloseDir);
835+
if (psDir)
693836
{
694-
emptyDir = false;
695-
if (VSI_ISDIR(psEntry->nMode))
837+
while (const auto *psEntry =
838+
VSIGetNextDirEntry(psDir.get()))
696839
{
697-
std::string_view v(psEntry->pszName);
698-
if (std::count_if(v.begin(), v.end(),
699-
[](char c) {
700-
return c == '/' || c == '\\';
701-
}) == 1)
840+
emptyDir = false;
841+
if (VSI_ISDIR(psEntry->nMode))
702842
{
703-
const auto nPosDirSep = v.find_first_of("/\\");
704-
const auto nPosEqual = v.find('=', nPosDirSep);
705-
if (nPosEqual != std::string::npos)
843+
std::string_view v(psEntry->pszName);
844+
if (std::count_if(v.begin(), v.end(),
845+
[](char c) {
846+
return c == '/' || c == '\\';
847+
}) == 1)
706848
{
707-
hasDirLevel1WithEqual = true;
708-
break;
849+
const auto nPosDirSep = v.find_first_of("/\\");
850+
const auto nPosEqual = v.find('=', nPosDirSep);
851+
if (nPosEqual != std::string::npos)
852+
{
853+
hasDirLevel1WithEqual = true;
854+
break;
855+
}
709856
}
710857
}
711858
}
712859
}
713-
VSICloseDir(psDir);
714-
}
715860

716-
if (!hasDirLevel1WithEqual && !emptyDir)
861+
if (!hasDirLevel1WithEqual && !emptyDir)
862+
{
863+
ReportError(
864+
CE_Failure, CPLE_AppDefined,
865+
"Rejecting removing '%s' as it does not look like "
866+
"a directory generated by this utility. If you are "
867+
"sure, remove it manually and re-run",
868+
m_output.c_str());
869+
return false;
870+
}
871+
}
872+
else
717873
{
718-
ReportError(CE_Failure, CPLE_AppDefined,
719-
"Rejecting removing '%s' as it does not look like "
720-
"a directory generated by this utility. If you are "
721-
"sure, remove it manually and re-run",
722-
m_output.c_str());
723-
return false;
874+
bool hasSubDir = false;
875+
std::unique_ptr<VSIDIR, decltype(&VSICloseDir)> psDir(
876+
VSIOpenDir(m_output.c_str(), 0, nullptr), VSICloseDir);
877+
if (psDir)
878+
{
879+
while (const auto *psEntry =
880+
VSIGetNextDirEntry(psDir.get()))
881+
{
882+
if (VSI_ISDIR(psEntry->nMode))
883+
{
884+
hasSubDir = true;
885+
break;
886+
}
887+
}
888+
}
889+
890+
if (hasSubDir)
891+
{
892+
ReportError(
893+
CE_Failure, CPLE_AppDefined,
894+
"Rejecting removing '%s' as it does not look like "
895+
"a directory generated by this utility. If you are "
896+
"sure, remove it manually and re-run",
897+
m_output.c_str());
898+
return false;
899+
}
724900
}
725901

726902
if (VSIRmdirRecursive(m_output.c_str()) != 0)
@@ -750,10 +926,15 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
750926

751927
for (OGRLayer *poSrcLayer : poSrcDS->GetLayers())
752928
{
753-
const std::string osLayerDir = CPLFormFilenameSafe(
754-
m_output.c_str(),
755-
PercentEncode(poSrcLayer->GetDescription()).c_str(), nullptr);
756-
if (VSIStatL(osLayerDir.c_str(), &sStat) != 0)
929+
const std::string osLayerDir =
930+
m_scheme == SCHEME_HIVE
931+
? CPLFormFilenameSafe(
932+
m_output.c_str(),
933+
PercentEncode(poSrcLayer->GetDescription()).c_str(),
934+
nullptr)
935+
: m_output;
936+
if (m_scheme == SCHEME_HIVE &&
937+
VSIStatL(osLayerDir.c_str(), &sStat) != 0)
757938
{
758939
if (VSIMkdir(osLayerDir.c_str(), DIRECTORY_CREATION_MODE) != 0)
759940
{
@@ -983,9 +1164,11 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
9831164

9841165
if (!GetCurrentOutputLayer(
9851166
this, poSrcFeatureDefn, poSrcLayer, osKey, osLayerDir,
986-
m_featureLimit, m_maxFileSize, m_omitPartitionedFields,
987-
abPartitionedFields, pszExtension, poOutDriver,
988-
datasetCreationOptions, layerCreationOptions,
1167+
m_scheme, m_pattern, m_partDigitLeadingZeroes,
1168+
m_partDigitCount, m_featureLimit, m_maxFileSize,
1169+
m_omitPartitionedFields, abPartitionedFields,
1170+
pszExtension, poOutDriver, datasetCreationOptions,
1171+
layerCreationOptions,
9891172
poFeatureDefnWithoutPartitionedFields.get(),
9901173
poFeature->GetGeometryRef()
9911174
? nSpatialIndexPerFeatureConstant

0 commit comments

Comments
 (0)