Skip to content

Commit a821e6b

Browse files
authored
Merge pull request #1 from EngineeringSoftware/add-training-data
add training directory defining training sets, update README.md
2 parents bf46fce + 21aa867 commit a821e6b

27 files changed

+2710
-12
lines changed

README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
Roosterize is a tool for suggesting lemma names in verification
44
projects that use the [Coq proof assistant](https://coq.inria.fr).
5+
The tool is based on leveraging neural networks that take serialized Coq
6+
lemma statements and elaborated terms as input; see the [Technique](#Technique)
7+
section below.
58

69
## Requirements
710

@@ -78,7 +81,7 @@ project, and `$SERAPI_OPTIONS` should be replaced with the SerAPI
7881
command line options for mapping logical paths to directories (see [SerAPI's
7982
documentation](https://github.com/ejgallego/coq-serapi/blob/v8.11/FAQ.md#does-serapi-support-coqs-command-line-flags)).
8083
For example, if the logical path (inside Coq) for the project is `Verified`,
81-
you should set `SERAPI_OPTIONS="-Q $PATH_TO_PROJECT,Verified"`.
84+
you should set `SERAPI_OPTIONS="-R $PATH_TO_PROJECT,Verified"`.
8285

8386
The command extracts all lemmas from the project, uses Roosterize's
8487
pre-trained model (at `./models/roosterize-ta`) to predict a lemma name
@@ -90,8 +93,6 @@ Below is an example of printed suggestions:
9093
infotheo/ecc_classic/bch.v: infotheo.ecc_classic.bch.BCH.BCH_PCM_altP1 -> inde_F2
9194
infotheo/ecc_classic/bch.v: infotheo.ecc_classic.bch.BCH.BCH_PCM_altP2 -> inde_mul
9295
infotheo/ecc_classic/bch.v: infotheo.ecc_classic.bch.BCH.PCM_altP -> F2_eq0
93-
infotheo/ecc_classic/bch.v: infotheo.ecc_classic.bch.BCH.PCM_alt_GRS -> P
94-
infotheo/ecc_classic/bch.v: infotheo.ecc_classic.bch.BCH_codebook -> map_P
9596
...
9697
```
9798

@@ -109,7 +110,7 @@ For example, the Coq lemma sentence
109110
```coq
110111
Lemma mg_eq_proof L1 L2 (N1 : mgClassifier L1) : L1 =i L2 -> nerode L2 N1.
111112
```
112-
is serialized into the following tokens:
113+
is serialized into the following tokens (simplified):
113114
```lisp
114115
(Sentence((IDENT Lemma)(IDENT mg_eq_proof)(IDENT L1)(IDENT L2)
115116
(KEYWORD"(")(IDENT N1)(KEYWORD :)(IDENT mgClassifier)
@@ -134,7 +135,8 @@ architecture, as applied to this example:
134135
Our [research paper][arxiv-paper] outlines the design of Roosterize,
135136
and describes an evaluation on a [corpus][math-comp-corpus]
136137
of serialized Coq code derived from the [Mathematical Components][math-comp-website]
137-
family of projects.
138+
family of projects. The training, validation, and testing sets of Coq files from the corpus
139+
used in the evaluation are defined in the `training` directory.
138140

139141
If you have used Roosterize in a research project, please cite
140142
the research paper in any related publication:

roosterize/data/DataMiner.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,7 @@ def collect_lemmas(cls, data_mgr: FilesManager, projects: List[Project], files:
456456
# Assign uids
457457
for lemma_i, lemma in enumerate(lemmas): lemma.uid = lemma_i
458458

459-
data_mgr.dump_data([FilesManager.LEMMAS, "lemmas"], lemmas, IOUtils.Format.json, is_batched=True, per_batch=5000)
459+
data_mgr.dump_data([FilesManager.LEMMAS], lemmas, IOUtils.Format.json, is_batched=True, per_batch=5000)
460460
return
461461

462462
@classmethod
@@ -468,7 +468,7 @@ def filter_lemmas(cls, data_mgr: FilesManager):
468468
data_mgr.resolve([FilesManager.LEMMAS_FILTERED]).mkdir(parents=True)
469469

470470
# Load lemmas
471-
lemmas: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS, "lemmas"], IOUtils.Format.json, is_batched=True, clz=Lemma)
471+
lemmas: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS], IOUtils.Format.json, is_batched=True, clz=Lemma)
472472
heights: List[int] = [l.backend_sexp.height() for l in lemmas]
473473

474474
depth_cutoff_point = sorted(heights)[int(np.ceil(Macros.LEMMAS_DEPTH_CUTOFF * len(lemmas)))]
@@ -480,7 +480,7 @@ def filter_lemmas(cls, data_mgr: FilesManager):
480480
# Assign uids
481481
for lemma_i, lemma in enumerate(lemmas_filtered): lemma.uid = lemma_i
482482

483-
data_mgr.dump_data([FilesManager.LEMMAS_FILTERED, "lemmas"], lemmas_filtered, IOUtils.Format.json, is_batched=True, per_batch=5000)
483+
data_mgr.dump_data([FilesManager.LEMMAS_FILTERED], lemmas_filtered, IOUtils.Format.json, is_batched=True, per_batch=5000)
484484
return
485485

486486
@classmethod
@@ -529,7 +529,7 @@ def collect_lemmas_backend_sexp_transformations(cls, data_mgr: FilesManager):
529529
# Increase recursion limit because the backend sexps are CRAZZZZY deep
530530
sys.setrecursionlimit(10000)
531531

532-
lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED, "lemmas"], IOUtils.Format.json, is_batched=True, clz=Lemma)
532+
lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
533533

534534
# Main stream transformations, applied one after another
535535
levels_lemmas_bsexp_transformed: Dict[str, List[SexpNode]] = dict()
@@ -573,7 +573,7 @@ def collect_lemmas_foreend_sexp_transformations(cls, data_mgr: FilesManager):
573573
# Increase recursion limit because the backend sexps are CRAZZZZY deep
574574
sys.setrecursionlimit(10000)
575575

576-
lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED, "lemmas"], IOUtils.Format.json, is_batched=True, clz=Lemma)
576+
lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
577577

578578
# Main stream transformations, applied one after another
579579
levels_lemmas_fsexp_transformed: Dict[str, List[SexpNode]] = dict()
@@ -857,14 +857,14 @@ def extract_data_from_corpus(cls,
857857
data_mgr = FilesManager(corpus_path)
858858

859859
# 2. Load lemmas and definitions
860-
lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED, "lemmas"], IOUtils.Format.json, is_batched=True, clz=Lemma)
860+
lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
861861
definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)
862862

863863
# 3. Output to output_path for each combination of traineval and group
864864
for traineval in trainevals:
865865
for group in groups:
866866
IOUtils.mk_dir(output_path/f"{group}-{traineval}")
867-
data_indexes = data_mgr.load_data([FilesManager.DATA_INDEXES, f"{group}-{traineval}.json"], IOUtils.Format.json, clz=str)
867+
data_indexes = IOUtils.load(project_dir/"training"/f"{group}-{traineval}.json", IOUtils.Format.json, clz=str)
868868
IOUtils.dump(output_path/f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
869869
IOUtils.dump(output_path/f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
870870
# end for

0 commit comments

Comments
 (0)