--extract -z: force redundant schema removal for '-t'

xamidi · xamidi · commit 652a46fecc0e · 2024-08-08T18:09:28.000+02:00
+ fix output
+ update comments for 'auto parse' in DRuleParser::parseAbstractDProof()
  and 'auto explore' in DRuleParser::compressAbstractDProof()
diff --git a/README.html b/README.html
@@ -494,11 +494,12 @@ <h4 id="usage">Usage</h4>
        -p: search proofs (rather than conclusions) ; used only when '-n', '-s' and '-t' unspecified
        -f: search terms are given by input file path (where a comma-separated string is stored), ignoring all CR, LF, whitespace, and lines starting with '%'
        -d: print debug information
-  --extract [-t &lt;limit or -1&gt;] [-o &lt;output file&gt;] [-s] [-# &lt;amount up to 35&gt;] [-h &lt;string&gt;] [-l &lt;limit or -1&gt;] [-k &lt;limit or -1&gt;] [-f] [-d]
+  --extract [-t &lt;limit or -1&gt;] [-o &lt;output file&gt;] [-s] [-z] [-# &lt;amount up to 35&gt;] [-h &lt;string&gt;] [-l &lt;limit or -1&gt;] [-k &lt;limit or -1&gt;] [-f] [-d]
      Various options to extract information from proof files ; [Hint: Generate missing files with '--variate 1 -s'.]
        -t: compose file with up to the given amount of smallest conclusions that occur in proof files ; includes origins, symbolic lengths, proofs, and formulas in normal Polish notation
        -o: specify output file path for '-t' ; relative to effective data location ; default: "top&lt;amount&gt;SmallestConclusions_&lt;min proof length&gt;to&lt;max proof length&gt;Steps&lt;unfiltered info&gt;.txt"
        -s: redundant schema removal for '-t' ; very time-intensive for requesting huge collections from unfiltered proof files - better pre-filter via '-g' or '-m' instead ; default: false
+       -z: force redundant schema removal for '-t' ; like '-s', but also filter proof files not declared as unfiltered (which might be useful for non-standard procedures)
        -#: initialize proof system at ./data/[&lt;hash&gt;/]/extraction-&lt;id&gt;/ with the given amount of smallest filtered conclusions that occur in proof files ; specify with '-c &lt;parent system&gt; -e &lt;id&gt;'
        -h: similar to '-#' ; hand-pick conclusions with a comma-separated string of proofs ; "." to not modify axioms
        -l: similar to '-#' (but creates identical system with prebuilt proof files) ; copy proofs with conclusions that have symbolic lengths of at most the given number
diff --git a/README.md b/README.md
@@ -119,11 +119,12 @@ Some more – and very special – proof systems are illustrated [further down b
            -p: search proofs (rather than conclusions) ; used only when '-n', '-s' and '-t' unspecified
            -f: search terms are given by input file path (where a comma-separated string is stored), ignoring all CR, LF, whitespace, and lines starting with '%'
            -d: print debug information
-      --extract [-t <limit or -1>] [-o <output file>] [-s] [-# <amount up to 35>] [-h <string>] [-l <limit or -1>] [-k <limit or -1>] [-f] [-d]
+      --extract [-t <limit or -1>] [-o <output file>] [-s] [-z] [-# <amount up to 35>] [-h <string>] [-l <limit or -1>] [-k <limit or -1>] [-f] [-d]
          Various options to extract information from proof files ; [Hint: Generate missing files with '--variate 1 -s'.]
            -t: compose file with up to the given amount of smallest conclusions that occur in proof files ; includes origins, symbolic lengths, proofs, and formulas in normal Polish notation
            -o: specify output file path for '-t' ; relative to effective data location ; default: "top<amount>SmallestConclusions_<min proof length>to<max proof length>Steps<unfiltered info>.txt"
            -s: redundant schema removal for '-t' ; very time-intensive for requesting huge collections from unfiltered proof files - better pre-filter via '-g' or '-m' instead ; default: false
+           -z: force redundant schema removal for '-t' ; like '-s', but also filter proof files not declared as unfiltered (which might be useful for non-standard procedures)
            -#: initialize proof system at ./data/[<hash>/]/extraction-<id>/ with the given amount of smallest filtered conclusions that occur in proof files ; specify with '-c <parent system> -e <id>'
            -h: similar to '-#' ; hand-pick conclusions with a comma-separated string of proofs ; "." to not modify axioms
            -l: similar to '-#' (but creates identical system with prebuilt proof files) ; copy proofs with conclusions that have symbolic lengths of at most the given number
diff --git a/logic/DlProofEnumerator.cpp b/logic/DlProofEnumerator.cpp
@@ -3151,7 +3151,7 @@ map<string, string> DlProofEnumerator::searchProofFiles(const vector<string>& se
 	return bestResults;
 }
 
-void DlProofEnumerator::extractConclusions(ExtractionMethod method, uint32_t extractAmount, const string* config, bool allowRedundantSchemaRemoval, size_t bound1, size_t bound2, bool debug, string* optOut_createdExDir) {
+void DlProofEnumerator::extractConclusions(ExtractionMethod method, uint32_t extractAmount, const string* config, bool allowRedundantSchemaRemoval, bool forceRedundantSchemaRemoval, size_t bound1, size_t bound2, bool debug, string* optOut_createdExDir) {
 	chrono::time_point<chrono::steady_clock> startTime;
 	vector<string> dProofs;
 	if ((method == ExtractionMethod::ProofSystemFromTopList && !extractAmount) || (method == ExtractionMethod::ProofSystemFromString && (!config || config->empty()))) {
@@ -3249,7 +3249,7 @@ void DlProofEnumerator::extractConclusions(ExtractionMethod method, uint32_t ext
 		//       removals are performed (unless requested by ridiculous limits for ExtractionMethod::TopListFile), which should be done via -g or -m anyway.
 		//       For small sets, where this loop can occur for ExtractionMethod::ProofSystemFromTopList (and under reasonable parameters for
 		//       ExtractionMethod::TopListFile), its performance overhead is insignificant.
-		redundantSchemaRemoval = useUnfilteredFiles && (method == ExtractionMethod::ProofSystemFromTopList || allowRedundantSchemaRemoval);
+		redundantSchemaRemoval = forceRedundantSchemaRemoval || (useUnfilteredFiles && (method == ExtractionMethod::ProofSystemFromTopList || allowRedundantSchemaRemoval));
 		if (redundantSchemaRemoval && limits.front() > 1)
 			cerr << "Warning: Schemas are filtered only partially, because proofs with less than " + to_string(limits.front()) + " steps are missing from the collection. Generate corresponding file(s) to avoid this." << endl;
 		bool repeat = false;
@@ -3353,7 +3353,7 @@ void DlProofEnumerator::extractConclusions(ExtractionMethod method, uint32_t ext
 										uint32_t proofLen = q.first;
 										size_t lineNo = r.first;
 										bool schemaFound = false;
-										if (proofLen >= unfiltered) { // detect conclusions that are merely instances of other formulas proven in lower or equal amounts of steps
+										if (forceRedundantSchemaRemoval || proofLen >= unfiltered) { // detect conclusions that are merely instances of other formulas proven in lower or equal amounts of steps
 											bool search = true;
 											for (const pair<const size_t, tbb::concurrent_map<uint32_t, tbb::concurrent_map<size_t, string>>>& p_ : topList)
 												if (search && p_.first <= symConLen) {
@@ -3524,7 +3524,7 @@ void DlProofEnumerator::extractConclusions(ExtractionMethod method, uint32_t ext
 		size_t maxSymConNumLen = FctHelper::digitsNum_uint64(maxSymConLen);
 
 		// 3. Print relevant results to file.
-		filesystem::path file = filesystem::u8path(config ? effectiveDataLocation + *config : effectiveDataLocation + "top" + (extractAmount == UINT32_MAX ? "" : to_string(counter)) + "SmallestConclusions_" + to_string(limits.empty() ? 0 : limits.front()) + "to" + to_string(limits.empty() ? 0 : limits.back()) + "Steps" + (!useUnfilteredFiles || (redundantSchemaRemoval && limits.front() == 1) ? "" : string("-") + (redundantSchemaRemoval ? "partially-" : "un") + "filtered" + to_string(unfiltered) + "+") + ".txt");
+		filesystem::path file = filesystem::u8path(config ? effectiveDataLocation + *config : effectiveDataLocation + "top" + (extractAmount == UINT32_MAX ? "" : to_string(counter)) + "SmallestConclusions_" + to_string(limits.empty() ? 0 : limits.front()) + "to" + to_string(limits.empty() ? 0 : limits.back()) + "Steps" + ((!useUnfilteredFiles && !forceRedundantSchemaRemoval) || (redundantSchemaRemoval && limits.front() == 1) ? (forceRedundantSchemaRemoval ? "-filtered" : "") : string("-") + (redundantSchemaRemoval ? "partially-" : "un") + "filtered" + (forceRedundantSchemaRemoval ? "" : to_string(unfiltered) + "+")) + ".txt");
 		string::size_type bytes = 0;
 		while (!filesystem::exists(file) && !FctHelper::ensureDirExists(file.string()))
 			cerr << "Failed to create file at \"" << file.string() << "\", trying again." << endl;
diff --git a/logic/DlProofEnumerator.h b/logic/DlProofEnumerator.h
@@ -105,7 +105,7 @@ struct DlProofEnumerator {
 
 	// Data search ; input files with conclusions are required
 	static std::map<std::string, std::string> searchProofFiles(const std::vector<std::string>& searchTerms, bool normalPolishNotation = false, bool searchProofs = false, unsigned schemaSearch = 0, const std::string* inputFile = nullptr, bool debug = false);
-	static void extractConclusions(ExtractionMethod method, std::uint32_t extractAmount, const std::string* config = nullptr, bool allowRedundantSchemaRemoval = false, std::size_t bound1 = 0, std::size_t bound2 = 0, bool debug = false, std::string* optOut_createdExDir = nullptr);
+	static void extractConclusions(ExtractionMethod method, std::uint32_t extractAmount, const std::string* config = nullptr, bool allowRedundantSchemaRemoval = false, bool forceRedundantSchemaRemoval = false, std::size_t bound1 = 0, std::size_t bound2 = 0, bool debug = false, std::string* optOut_createdExDir = nullptr);
 
 	// Data representation ; input files with conclusions are required
 	static void printConclusionLengthPlotData(bool measureSymbolicLength = true, bool table = true, std::int64_t cutX = -1, std::int64_t cutY = -1, const std::string& dataLocation = "data", const std::string& inputFilePrefix = "dProofs-withConclusions/dProofs", bool includeUnfiltered = false, std::ostream* mout = nullptr, bool debug = false, const std::uint32_t* proofLenStepSize = nullptr);
diff --git a/main.cpp b/main.cpp
diff --git a/metamath/DRuleParser.cpp b/metamath/DRuleParser.cpp