refactor: rename EDGE_CASE to OTHER_EYE_DATA, clean training data, bump to 0.1.2

jimnoneill · jimnoneill · commit 3be3c5c082ab · 2026-03-17T16:53:13.000-07:00
- Rename EDGE_CASE label to OTHER_EYE_DATA across source, tests, and docs
- Rename prob_edge output key to prob_other_eye
- Clean and expand training data from 452 to 474 examples
- Add Zenodo classification results to README
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,20 @@
 # Changelog
 
+## [0.1.2] - 2026-03-17
+
+### Changed
+
+- Renamed `EDGE_CASE` class to `OTHER_EYE_DATA`
+- Renamed `prob_edge` output key to `prob_other_eye`
+- Training data cleaned and expanded to 474 examples
+- Improved spot-check accuracy to 87.9% (29/33)
+
 ## [0.1.0] - 2026-03-03
 
 ### Added
 
 - Initial beta scaffold
-- 4-class SetFit classifier (EYE_IMAGING, EYE_SOFTWARE, EDGE_CASE, NEGATIVE)
+- 4-class SetFit classifier (EYE_IMAGING, EYE_SOFTWARE, OTHER_EYE_DATA, NEGATIVE)
 - CLI with `classify`, `train`, and `info` commands
 - Auto-download of model weights from HuggingFace
 - Batch classification support
diff --git a/README.md b/README.md
@@ -67,11 +67,24 @@ envision-classifier info
 ## Model
 
 - **Base model**: `sentence-transformers/all-mpnet-base-v2` (768-dim)
-- **Training data**: 474 curated examples (77 EYE_IMAGING, 48 EYE_SOFTWARE, 79 EDGE_CASE, 270 NEGATIVE)
+- **Training data**: 474 curated examples (77 EYE_IMAGING, 48 EYE_SOFTWARE, 79 OTHER_EYE_DATA, 270 NEGATIVE)
 - **Test accuracy**: 0.937, **macro F1**: 0.902
 - **Spot-check**: 29/33 (87.9%)
 - **Model weights**: [fairdataihub/envision-eye-imaging-classifier](https://huggingface.co/fairdataihub/envision-eye-imaging-classifier)
 
+## Zenodo Classification Results
+
+Applied to 515 Zenodo dataset records via [envision-discovery](https://github.com/EyeACT/envision-discovery):
+
+| Class | Count |
+|-------|-------|
+| EYE_IMAGING | 120 |
+| EYE_SOFTWARE | 66 |
+| OTHER_EYE_DATA | 3 |
+| NEGATIVE | 325 |
+
+Classification is based on metadata only (titles, descriptions, keywords, and file types inspected inside archives via HTTP Range requests) — no dataset files are downloaded in full.
+
 ## Related
 
 - [envision-discovery](https://github.com/EyeACT/envision-discovery) -- Full pipeline (scraping + classification + export)
diff --git a/docs/modules/classifier.md b/docs/modules/classifier.md
@@ -21,7 +21,7 @@ print(result)
 |-------|-------------|
 | `EYE_IMAGING` | Actual eye imaging datasets (fundus, OCT, OCTA, cornea, etc.) |
 | `EYE_SOFTWARE` | Code, tools, models for eye imaging (no actual data) |
-| `EDGE_CASE` | Eye research papers, reviews, borderline items |
+| `OTHER_EYE_DATA` | Eye research papers, reviews, non-imaging data |
 | `NEGATIVE` | Unrelated domains |
 
 ### Batch Classification
diff --git a/envision_classifier/__init__.py b/envision_classifier/__init__.py
@@ -4,7 +4,7 @@
 A 4-class SetFit classifier for detecting eye imaging datasets:
   - EYE_IMAGING: Actual eye imaging datasets (fundus, OCT, OCTA, etc.)
   - EYE_SOFTWARE: Code, models, tools for eye imaging
-  - EDGE_CASE: Eye research papers, reviews, borderline items
+  - OTHER_EYE_DATA: Eye research papers, reviews, non-imaging data
   - NEGATIVE: Unrelated domains
 
 Usage:
@@ -14,7 +14,7 @@
     {'label': 'EYE_IMAGING', 'confidence': 0.999, 'probabilities': {...}}
 """
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 __author__ = "James O'Neill"
 
 from .classifier import EyeImagingClassifier, LABELS
diff --git a/envision_classifier/classifier.py b/envision_classifier/classifier.py
@@ -5,7 +5,7 @@
 Uses sentence-transformers/all-mpnet-base-v2 sentence transformer with 4-class classification:
   - 3: EYE_IMAGING - Actual eye imaging datasets (fundus, OCT, OCTA, cornea, etc.)
   - 2: EYE_SOFTWARE - Code, tools, models for eye imaging (no actual data)
-  - 1: EDGE_CASE - Eye research (papers, reviews, non-imaging data)
+  - 1: OTHER_EYE_DATA - Eye research (papers, reviews, non-imaging data)
   - 0: NEGATIVE - Not eye-related at all
 """
 
@@ -22,7 +22,7 @@
 # Model configuration
 BASE_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
 HF_MODEL_REPO = "fairdataihub/envision-eye-imaging-classifier"
-LABELS = ["NEGATIVE", "EDGE_CASE", "EYE_SOFTWARE", "EYE_IMAGING"]
+LABELS = ["NEGATIVE", "OTHER_EYE_DATA", "EYE_SOFTWARE", "EYE_IMAGING"]
 
 # ============================================================
 # TRAINING DATA - Curated examples for few-shot learning
@@ -111,7 +111,7 @@
 ]
 
 # EYE_SOFTWARE (label=2): Code, tools, models for eye imaging (NOT actual data)
-# Added: misplaced software from EYE_IMAGING + EDGE_CASE, spot-check examples
+# Added: misplaced software from EYE_IMAGING + OTHER_EYE_DATA, spot-check examples
 EYE_SOFTWARE_EXAMPLES = [
     "linchundan88/Fundus-image-preprocessing: fundus image preprocessing Python code",
     "NIH-NEI/oct-image-segmentation-models: v0.8.2 trained model weights",
@@ -150,7 +150,7 @@
     "ResNet-50 classifiers and diffusion models trained on retinal fundus images",
     "AMikroulis/octopus OCT image processing dataset",
     "anithaj17/RetinoNet-DR-Classification fundus image dataset",
-    # Moved from EDGE_CASE (clearly software/tools)
+    # Moved from OTHER_EYE_DATA (clearly software/tools)
     "Python package for retinal image preprocessing",
     "Deep learning framework for fundus image segmentation code only",
     "OCT image reconstruction algorithm implementation",
@@ -166,9 +166,9 @@
     "Flexible corneal neurotechnology reveals in-vivo pathological retinal oscillations recording device",
 ]
 
-# EDGE_CASE (label=1): Eye/vision research but NOT actual imaging datasets
+# OTHER_EYE_DATA (label=1): Eye/vision research but NOT actual imaging datasets
 # Cleaned: removed misplaced software→EYE_SOFTWARE, non-eye→NEGATIVE; added eye metabolomics
-EDGE_CASE_EXAMPLES = [
+OTHER_EYE_DATA_EXAMPLES = [
     "A Review of Deep Learning Methods for Diabetic Retinopathy Detection",
     "Survey of Machine Learning Techniques for Glaucoma Diagnosis",
     "Advances in Optical Coherence Tomography Technology Review Article",
@@ -257,7 +257,7 @@
 ]
 
 # NEGATIVE (label=0): Clearly not eye-related
-# Added: non-eye medical imaging from EDGE_CASE, spot-check confounders
+# Added: non-eye medical imaging from OTHER_EYE_DATA, spot-check confounders
 NEGATIVE_EXAMPLES = [
     "Climate change impact on coral reef ecosystems dataset",
     "COVID-19 genome sequencing and variant analysis",
@@ -492,7 +492,7 @@
     "Dataset_1 of AF driver detection in pulmonary vein area cardiac arrhythmia",
     "Data from Dichoptic metacontrast masking functions to infer transmission delay",
     "IRIS Carbon Mapping Project Curated Dataset carbon emissions",
-    # Moved from EDGE_CASE (non-eye medical imaging — clearly NEGATIVE)
+    # Moved from OTHER_EYE_DATA (non-eye medical imaging — clearly NEGATIVE)
     "Brain MRI analysis for Alzheimer's disease detection",
     "Cardiac CT angiography for coronary artery disease",
     "Dermatology skin lesion classification dataset",
@@ -503,7 +503,7 @@
     "Ultrasound imaging for liver disease assessment",
     "PET scan analysis for neurological disorders",
     "Spine MRI for degenerative disc disease",
-    # Moved from EDGE_CASE (non-eye OCT — clearly NEGATIVE)
+    # Moved from OTHER_EYE_DATA (non-eye OCT — clearly NEGATIVE)
     "OCT for industrial material inspection dataset",
     "Optical coherence tomography in dermatology skin imaging",
     "OCT imaging of atherosclerotic plaque in arteries",
@@ -545,7 +545,7 @@ class EyeImagingClassifier:
     Classifies metadata records into 4 classes:
       - EYE_IMAGING: Actual eye imaging datasets (fundus, OCT, OCTA, etc.)
       - EYE_SOFTWARE: Code, tools, models for eye imaging (no actual data)
-      - EDGE_CASE: Eye research papers, reviews, borderline items
+      - OTHER_EYE_DATA: Eye research papers, reviews, non-imaging data
       - NEGATIVE: Unrelated domains
 
     Usage:
@@ -679,7 +679,7 @@ def _predict_batch(self, texts):
             else:
                 pred_int = {
                     "NEGATIVE": 0,
-                    "EDGE_CASE": 1,
+                    "OTHER_EYE_DATA": 1,
                     "EYE_SOFTWARE": 2,
                     "EYE_IMAGING": 3,
                 }.get(str(pred), 0)
@@ -692,7 +692,7 @@ def _predict_batch(self, texts):
                     "confidence": float(max(probs)),
                     "probabilities": {
                         "NEGATIVE": float(probs[0]),
-                        "EDGE_CASE": float(probs[1]),
+                        "OTHER_EYE_DATA": float(probs[1]),
                         "EYE_SOFTWARE": float(probs[2]),
                         "EYE_IMAGING": float(probs[3]),
                     },
@@ -733,13 +733,13 @@ def train(cls, output_dir=None, device=None, base_model_name=None,
         train_texts = (
             EYE_IMAGING_EXAMPLES
             + EYE_SOFTWARE_EXAMPLES
-            + EDGE_CASE_EXAMPLES
+            + OTHER_EYE_DATA_EXAMPLES
             + NEGATIVE_EXAMPLES
         )
         train_labels = (
             [3] * len(EYE_IMAGING_EXAMPLES)
             + [2] * len(EYE_SOFTWARE_EXAMPLES)
-            + [1] * len(EDGE_CASE_EXAMPLES)
+            + [1] * len(OTHER_EYE_DATA_EXAMPLES)
             + [0] * len(NEGATIVE_EXAMPLES)
         )
 
diff --git a/envision_classifier/cli.py b/envision_classifier/cli.py
@@ -76,7 +76,7 @@ def info():
         LABELS,
         EYE_IMAGING_EXAMPLES,
         EYE_SOFTWARE_EXAMPLES,
-        EDGE_CASE_EXAMPLES,
+        OTHER_EYE_DATA_EXAMPLES,
         NEGATIVE_EXAMPLES,
     )
 
@@ -86,5 +86,5 @@ def info():
     click.echo(f"Labels:           {', '.join(LABELS)}")
     click.echo(f"Training data:    {len(EYE_IMAGING_EXAMPLES)} eye_imaging, "
                f"{len(EYE_SOFTWARE_EXAMPLES)} eye_software, "
-               f"{len(EDGE_CASE_EXAMPLES)} edge_case, "
+               f"{len(OTHER_EYE_DATA_EXAMPLES)} other_eye_data, "
                f"{len(NEGATIVE_EXAMPLES)} negative")
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 
 name = "envision-classifier"
-version = "0.1.1"
+version = "0.1.2"
 description = "Few-shot classifier for detecting eye imaging datasets"
 
 packages = [{ include = "envision_classifier" }]
diff --git a/tests/test_classifier.py b/tests/test_classifier.py
@@ -4,7 +4,7 @@
     LABELS,
     EYE_IMAGING_EXAMPLES,
     EYE_SOFTWARE_EXAMPLES,
-    EDGE_CASE_EXAMPLES,
+    OTHER_EYE_DATA_EXAMPLES,
     NEGATIVE_EXAMPLES,
 )
 
@@ -18,7 +18,7 @@ def test_labels():
 def test_training_data_not_empty():
     assert len(EYE_IMAGING_EXAMPLES) > 0
     assert len(EYE_SOFTWARE_EXAMPLES) > 0
-    assert len(EDGE_CASE_EXAMPLES) > 0
+    assert len(OTHER_EYE_DATA_EXAMPLES) > 0
     assert len(NEGATIVE_EXAMPLES) > 0