@@ -28,6 +28,11 @@ constexpr int DIRECTORY_CREATION_MODE = 0755;
2828
2929constexpr const char *NULL_MARKER = " __HIVE_DEFAULT_PARTITION__" ;
3030
31+ constexpr const char *DEFAULT_PATTERN_HIVE = " part_%010d" ;
32+ constexpr const char *DEFAULT_PATTERN_FLAT = " {LAYER_NAME}_{FIELD_VALUE}_%010d" ;
33+
34+ constexpr char DIGIT_ZERO = ' 0' ;
35+
3136/* ***********************************************************************/
3237/* GetConstructorOptions() */
3338/* ***********************************************************************/
@@ -79,6 +84,74 @@ GDALVectorPartitionAlgorithm::GDALVectorPartitionAlgorithm(bool standaloneStep)
7984
8085 AddArg (" field" , 0 , _ (" Field(s) on which to partition" ), &m_fields)
8186 .SetRequired ();
87+ AddArg (" scheme" , 0 , _ (" Partitioning scheme" ), &m_scheme)
88+ .SetChoices (SCHEME_HIVE, SCHEME_FLAT)
89+ .SetDefault (m_scheme);
90+ AddArg (" pattern" , 0 ,
91+ _ (" Filename pattern ('part_%010d' for scheme=hive, "
92+ " '{LAYER_NAME}_{FIELD_VALUE}_%010d' for scheme=flat)" ),
93+ &m_pattern)
94+ .SetMinCharCount (1 )
95+ .AddValidationAction (
96+ [this ]()
97+ {
98+ if (!m_pattern.empty ())
99+ {
100+ const auto nPercentPos = m_pattern.find (' %' );
101+ if (nPercentPos == std::string::npos)
102+ {
103+ ReportError (CE_Failure, CPLE_IllegalArg, " %s" ,
104+ " Missing '%' character in pattern" );
105+ return false ;
106+ }
107+ if (nPercentPos + 1 < m_pattern.size () &&
108+ m_pattern.find (' %' , nPercentPos + 1 ) !=
109+ std::string::npos)
110+ {
111+ ReportError (
112+ CE_Failure, CPLE_IllegalArg, " %s" ,
113+ " A single '%' character is expected in pattern" );
114+ return false ;
115+ }
116+ bool percentFound = false ;
117+ for (size_t i = nPercentPos + 1 ; i < m_pattern.size (); ++i)
118+ {
119+ if (m_pattern[i] >= DIGIT_ZERO && m_pattern[i] <= ' 9' )
120+ {
121+ // ok
122+ }
123+ else if (m_pattern[i] == ' d' )
124+ {
125+ percentFound = true ;
126+ break ;
127+ }
128+ else
129+ {
130+ break ;
131+ }
132+ }
133+ if (!percentFound)
134+ {
135+ ReportError (
136+ CE_Failure, CPLE_IllegalArg, " %s" ,
137+ " pattern value must include a single "
138+ " '%[0]?[1-9]?[0]?d' part number specification" );
139+ return false ;
140+ }
141+ m_partDigitCount =
142+ atoi (m_pattern.c_str () + nPercentPos + 1 );
143+ if (m_partDigitCount > 10 )
144+ {
145+ ReportError (CE_Failure, CPLE_IllegalArg,
146+ " Number of digits in part number "
147+ " specifiation should be in [1,10] range" );
148+ return false ;
149+ }
150+ m_partDigitLeadingZeroes =
151+ m_pattern[nPercentPos + 1 ] == DIGIT_ZERO;
152+ }
153+ return true ;
154+ });
82155 AddArg (" feature-limit" , 0 , _ (" Maximum number of features per file" ),
83156 &m_featureLimit)
84157 .SetMinValueExcluded (0 );
@@ -266,8 +339,6 @@ static size_t GetEstimatedFeatureSize(
266339
267340constexpr int MIN_FILE_SIZE = 65536 ;
268341
269- constexpr const char *FILENAME_PREFIX = " part_" ;
270-
271342namespace
272343{
273344struct Layer
@@ -292,8 +363,10 @@ struct Layer
292363static bool GetCurrentOutputLayer (
293364 GDALAlgorithm *const alg, const OGRFeatureDefn *const poSrcFeatureDefn,
294365 OGRLayer *const poSrcLayer, const std::string &osKey,
295- const std::string &osLayerDir, const int featureLimit,
296- const GIntBig maxFileSize, const bool omitPartitionedFields,
366+ const std::string &osLayerDir, const std::string &osScheme,
367+ const std::string &osPatternIn, bool partDigitLeadingZeroes,
368+ size_t partDigitCount, const int featureLimit, const GIntBig maxFileSize,
369+ const bool omitPartitionedFields,
297370 const std::vector<bool > &abPartitionedFields, const char *pszExtension,
298371 GDALDriver *const poOutDriver, const CPLStringList &datasetCreationOptions,
299372 const CPLStringList &layerCreationOptions,
@@ -303,6 +376,12 @@ static bool GetCurrentOutputLayer(
303376 lru11::Cache<std::string, std::shared_ptr<Layer>> &oCacheOutputLayer,
304377 std::shared_ptr<Layer> &outputLayer)
305378{
379+ const std::string osPattern =
380+ !osPatternIn.empty () ? osPatternIn
381+ : osScheme == GDALVectorPartitionAlgorithm::SCHEME_HIVE
382+ ? DEFAULT_PATTERN_HIVE
383+ : DEFAULT_PATTERN_FLAT;
384+
306385 bool bLimitReached = false ;
307386 bool bOpenOrCreateNewFile = true ;
308387 if (oCacheOutputLayer.tryGet (osKey, outputLayer))
@@ -335,13 +414,69 @@ static bool GetCurrentOutputLayer(
335414 outputLayer->bUseTransactions = bUseTransactions;
336415 }
337416
338- const auto GetBasenameFromCounter = [](int nCounter)
339- { return CPLSPrintf (" %s%010d" , FILENAME_PREFIX, nCounter); };
417+ const auto SubstituteVariables = [&osKey, poSrcLayer](const std::string &s)
418+ {
419+ CPLString ret (s);
420+ ret.replaceAll (" {LAYER_NAME}" ,
421+ PercentEncode (poSrcLayer->GetDescription ()));
422+
423+ if (ret.find (" {FIELD_VALUE}" ) != std::string::npos)
424+ {
425+ std::string fieldValue;
426+ const CPLStringList aosTokens (
427+ CSLTokenizeString2 (osKey.c_str (), " /" , 0 ));
428+ for (int i = 0 ; i < aosTokens.size (); ++i)
429+ {
430+ const CPLStringList aosFieldNameValue (
431+ CSLTokenizeString2 (aosTokens[i], " =" , 0 ));
432+ if (!fieldValue.empty ())
433+ fieldValue += ' _' ;
434+ fieldValue +=
435+ aosFieldNameValue.size () == 2
436+ ? (strcmp (aosFieldNameValue[1 ], NULL_MARKER) == 0
437+ ? std::string (" __NULL__" )
438+ : aosFieldNameValue[1 ])
439+ : std::string (" __EMPTY__" );
440+ }
441+ ret.replaceAll (" {FIELD_VALUE}" , fieldValue);
442+ }
443+ return ret;
444+ };
445+
446+ const auto nPercentPos = osPattern.find (' %' );
447+ CPLAssert (nPercentPos !=
448+ std::string::npos); // checked by validation action
449+ const std::string osPatternPrefix =
450+ SubstituteVariables (osPattern.substr (0 , nPercentPos));
451+ const auto nAfterDPos = osPattern.find (' d' , nPercentPos + 1 ) + 1 ;
452+ const std::string osPatternSuffix =
453+ nAfterDPos < osPattern.size ()
454+ ? SubstituteVariables (osPattern.substr (nAfterDPos))
455+ : std::string ();
456+
457+ const auto GetBasenameFromCounter = [partDigitCount, partDigitLeadingZeroes,
458+ &osPatternPrefix,
459+ &osPatternSuffix](int nCounter)
460+ {
461+ const std::string sCounter (CPLSPrintf (" %d" , nCounter));
462+ std::string s (osPatternPrefix);
463+ if (sCounter .size () < partDigitCount)
464+ {
465+ s += std::string (partDigitCount - sCounter .size (),
466+ partDigitLeadingZeroes ? DIGIT_ZERO : ' ' );
467+ }
468+ s += sCounter ;
469+ s += osPatternSuffix;
470+ return s;
471+ };
340472
341473 if (bOpenOrCreateNewFile)
342474 {
343- const std::string osDatasetDir =
344- CPLFormFilenameSafe (osLayerDir.c_str (), osKey.c_str (), nullptr );
475+ std::string osDatasetDir =
476+ osScheme == GDALVectorPartitionAlgorithm::SCHEME_HIVE
477+ ? CPLFormFilenameSafe (osLayerDir.c_str (), osKey.c_str (),
478+ nullptr )
479+ : osLayerDir;
345480 outputLayer->nFeatureCount = 0 ;
346481
347482 bool bCreateNewFile = true ;
@@ -367,31 +502,36 @@ static bool GetCurrentOutputLayer(
367502 }
368503
369504 int nMaxCounter = 0 ;
370- auto psDir = VSIOpenDir (osDatasetDir.c_str (), 0 , nullptr );
505+ std::unique_ptr<VSIDIR, decltype (&VSICloseDir)> psDir (
506+ VSIOpenDir (osDatasetDir.c_str (), 0 , nullptr ), VSICloseDir);
371507 if (psDir)
372508 {
373- while (const auto *psEntry = VSIGetNextDirEntry (psDir))
509+ while (const auto *psEntry = VSIGetNextDirEntry (psDir. get () ))
374510 {
375- if (cpl::starts_with (std::string_view (psEntry->pszName ),
376- FILENAME_PREFIX))
511+ const std::string osName (
512+ CPLGetBasenameSafe (psEntry->pszName ));
513+ if (cpl::starts_with (osName, osPatternPrefix) &&
514+ cpl::ends_with (osName, osPatternSuffix))
377515 {
378516 nMaxCounter = std::max (
379517 nMaxCounter,
380- atoi (CPLGetBasenameSafe (psEntry->pszName +
381- strlen (FILENAME_PREFIX))
518+ atoi (osName
519+ .substr (osPatternPrefix.size (),
520+ osName.size () -
521+ osPatternPrefix.size () -
522+ osPatternSuffix.size ())
382523 .c_str ()));
383524 }
384525 }
385- VSICloseDir (psDir);
386526 }
387527
388528 if (nMaxCounter > 0 )
389529 {
390530 outputLayer->nFileCounter = nMaxCounter;
391531
392532 const std::string osFilename = CPLFormFilenameSafe (
393- osDatasetDir.c_str (), GetBasenameFromCounter (nMaxCounter),
394- pszExtension);
533+ osDatasetDir.c_str (),
534+ GetBasenameFromCounter (nMaxCounter). c_str (), pszExtension);
395535 auto poDS = std::unique_ptr<GDALDataset>(GDALDataset::Open (
396536 osFilename.c_str (),
397537 GDAL_OF_VECTOR | GDAL_OF_UPDATE | GDAL_OF_VERBOSE_ERROR));
@@ -497,7 +637,7 @@ static bool GetCurrentOutputLayer(
497637
498638 const std::string osFilename = CPLFormFilenameSafe (
499639 osDatasetDir.c_str (),
500- GetBasenameFromCounter (outputLayer->nFileCounter ),
640+ GetBasenameFromCounter (outputLayer->nFileCounter ). c_str () ,
501641 pszExtension);
502642 outputLayer->poDS .reset (
503643 poOutDriver->Create (osFilename.c_str (), 0 , 0 , 0 , GDT_Unknown,
@@ -630,7 +770,8 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
630770 return false ;
631771 }
632772
633- if (EQUAL (poOutDriver->GetDescription (), " PARQUET" ))
773+ if (EQUAL (poOutDriver->GetDescription (), " PARQUET" ) &&
774+ m_scheme == SCHEME_HIVE)
634775 {
635776 // Required for Parquet Hive partitioning
636777 m_omitPartitionedFields = true ;
@@ -686,41 +827,76 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
686827
687828 // Do a sanity check to verify that this looks like a directory
688829 // generated by partition
689- auto psDir = VSIOpenDir (m_output. c_str (), - 1 , nullptr );
690- if (psDir )
830+
831+ if (m_scheme == SCHEME_HIVE )
691832 {
692- while (const auto *psEntry = VSIGetNextDirEntry (psDir))
833+ std::unique_ptr<VSIDIR, decltype (&VSICloseDir)> psDir (
834+ VSIOpenDir (m_output.c_str (), -1 , nullptr ), VSICloseDir);
835+ if (psDir)
693836 {
694- emptyDir = false ;
695- if ( VSI_ISDIR (psEntry-> nMode ))
837+ while ( const auto *psEntry =
838+ VSIGetNextDirEntry (psDir. get () ))
696839 {
697- std::string_view v (psEntry->pszName );
698- if (std::count_if (v.begin (), v.end (),
699- [](char c) {
700- return c == ' /' || c == ' \\ ' ;
701- }) == 1 )
840+ emptyDir = false ;
841+ if (VSI_ISDIR (psEntry->nMode ))
702842 {
703- const auto nPosDirSep = v.find_first_of (" /\\ " );
704- const auto nPosEqual = v.find (' =' , nPosDirSep);
705- if (nPosEqual != std::string::npos)
843+ std::string_view v (psEntry->pszName );
844+ if (std::count_if (v.begin (), v.end (),
845+ [](char c) {
846+ return c == ' /' || c == ' \\ ' ;
847+ }) == 1 )
706848 {
707- hasDirLevel1WithEqual = true ;
708- break ;
849+ const auto nPosDirSep = v.find_first_of (" /\\ " );
850+ const auto nPosEqual = v.find (' =' , nPosDirSep);
851+ if (nPosEqual != std::string::npos)
852+ {
853+ hasDirLevel1WithEqual = true ;
854+ break ;
855+ }
709856 }
710857 }
711858 }
712859 }
713- VSICloseDir (psDir);
714- }
715860
716- if (!hasDirLevel1WithEqual && !emptyDir)
861+ if (!hasDirLevel1WithEqual && !emptyDir)
862+ {
863+ ReportError (
864+ CE_Failure, CPLE_AppDefined,
865+ " Rejecting removing '%s' as it does not look like "
866+ " a directory generated by this utility. If you are "
867+ " sure, remove it manually and re-run" ,
868+ m_output.c_str ());
869+ return false ;
870+ }
871+ }
872+ else
717873 {
718- ReportError (CE_Failure, CPLE_AppDefined,
719- " Rejecting removing '%s' as it does not look like "
720- " a directory generated by this utility. If you are "
721- " sure, remove it manually and re-run" ,
722- m_output.c_str ());
723- return false ;
874+ bool hasSubDir = false ;
875+ std::unique_ptr<VSIDIR, decltype (&VSICloseDir)> psDir (
876+ VSIOpenDir (m_output.c_str (), 0 , nullptr ), VSICloseDir);
877+ if (psDir)
878+ {
879+ while (const auto *psEntry =
880+ VSIGetNextDirEntry (psDir.get ()))
881+ {
882+ if (VSI_ISDIR (psEntry->nMode ))
883+ {
884+ hasSubDir = true ;
885+ break ;
886+ }
887+ }
888+ }
889+
890+ if (hasSubDir)
891+ {
892+ ReportError (
893+ CE_Failure, CPLE_AppDefined,
894+ " Rejecting removing '%s' as it does not look like "
895+ " a directory generated by this utility. If you are "
896+ " sure, remove it manually and re-run" ,
897+ m_output.c_str ());
898+ return false ;
899+ }
724900 }
725901
726902 if (VSIRmdirRecursive (m_output.c_str ()) != 0 )
@@ -750,10 +926,15 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
750926
751927 for (OGRLayer *poSrcLayer : poSrcDS->GetLayers ())
752928 {
753- const std::string osLayerDir = CPLFormFilenameSafe (
754- m_output.c_str (),
755- PercentEncode (poSrcLayer->GetDescription ()).c_str (), nullptr );
756- if (VSIStatL (osLayerDir.c_str (), &sStat ) != 0 )
929+ const std::string osLayerDir =
930+ m_scheme == SCHEME_HIVE
931+ ? CPLFormFilenameSafe (
932+ m_output.c_str (),
933+ PercentEncode (poSrcLayer->GetDescription ()).c_str (),
934+ nullptr )
935+ : m_output;
936+ if (m_scheme == SCHEME_HIVE &&
937+ VSIStatL (osLayerDir.c_str (), &sStat ) != 0 )
757938 {
758939 if (VSIMkdir (osLayerDir.c_str (), DIRECTORY_CREATION_MODE) != 0 )
759940 {
@@ -983,9 +1164,11 @@ bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
9831164
9841165 if (!GetCurrentOutputLayer (
9851166 this , poSrcFeatureDefn, poSrcLayer, osKey, osLayerDir,
986- m_featureLimit, m_maxFileSize, m_omitPartitionedFields,
987- abPartitionedFields, pszExtension, poOutDriver,
988- datasetCreationOptions, layerCreationOptions,
1167+ m_scheme, m_pattern, m_partDigitLeadingZeroes,
1168+ m_partDigitCount, m_featureLimit, m_maxFileSize,
1169+ m_omitPartitionedFields, abPartitionedFields,
1170+ pszExtension, poOutDriver, datasetCreationOptions,
1171+ layerCreationOptions,
9891172 poFeatureDefnWithoutPartitionedFields.get (),
9901173 poFeature->GetGeometryRef ()
9911174 ? nSpatialIndexPerFeatureConstant
0 commit comments