Merge pull request #420 from ATOMScience-org/bug_remove_outlier_replicates

stewarthe6 · web-flow · commit 96bc75a033f4 · 2026-02-24T18:13:55.000-08:00
Fix remove_outlier_replicates to drop rows with NaN in the response column before outlier detection.

Also uploads code coverage as one merged report at the end of the workflow, instead of one upload per test job. This reports coverage more accurately after direct and indirect changes.
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -53,10 +53,34 @@ jobs:
         env:
           ENV: test
 
-      - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+      - name: Debug - Find all coverage files
+        run: |
+          echo "=== Current directory ==="
+          pwd
+          echo ""
+          echo "=== Contents of current directory ==="
+          ls -la
+          echo ""
+          echo "=== Searching for ALL .coverage files recursively ==="
+          find . -name ".coverage*" -type f 2>/dev/null | grep . || echo "No .coverage files found"  # find exits 0 even with no matches; grep . makes the fallback fire
+          echo ""
+          echo "=== Contents of test directory ==="
+          ls -la atomsci/ddm/test/unit/ atomsci/modac/test/unit/ || echo "Directory not found"  # the directories this job uploads coverage from
+          echo ""
+          echo "=== Checking if coverage was even run ==="
+          which coverage || echo "coverage executable not found on PATH"  # diagnostics only: do not fail the job from this step
+          coverage --version || echo "Coverage not installed"
+
+      - name: Save coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-pytest-unit
+          path: |
+            atomsci/ddm/test/unit/.coverage*
+            atomsci/modac/test/unit/.coverage*
+          include-hidden-files: true
+          if-no-files-found: error
+
   pytest-integrative-1:
     runs-on: ubuntu-24.04
     strategy:
@@ -100,10 +124,32 @@ jobs:
         env:
           ENV: test
 
-      - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+      - name: Debug - Find all coverage files
+        run: |
+          echo "=== Current directory ==="
+          pwd
+          echo ""
+          echo "=== Contents of current directory ==="
+          ls -la
+          echo ""
+          echo "=== Searching for ALL .coverage files recursively ==="
+          find . -name ".coverage*" -type f 2>/dev/null | grep . || echo "No .coverage files found"  # find exits 0 even with no matches; grep . makes the fallback fire
+          echo ""
+          echo "=== Contents of test directory ==="
+          ls -la atomsci/ddm/test/integrative/ || echo "Directory not found"
+          echo ""
+          echo "=== Checking if coverage was even run ==="
+          which coverage || echo "coverage executable not found on PATH"  # diagnostics only: do not fail the job from this step
+          coverage --version || echo "Coverage not installed"
+
+      - name: Save coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-pytest-integrative-1
+          path: atomsci/ddm/test/integrative/**/.coverage*
+          include-hidden-files: true
+          if-no-files-found: error
+
   pytest-integrative-2:
     runs-on: ubuntu-24.04
     strategy:
@@ -147,10 +193,32 @@ jobs:
         env:
           ENV: test
 
-      - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+      - name: Debug - Find all coverage files
+        run: |
+          echo "=== Current directory ==="
+          pwd
+          echo ""
+          echo "=== Contents of current directory ==="
+          ls -la
+          echo ""
+          echo "=== Searching for ALL .coverage files recursively ==="
+          find . -name ".coverage*" -type f 2>/dev/null | grep . || echo "No .coverage files found"  # find exits 0 even with no matches; grep . makes the fallback fire
+          echo ""
+          echo "=== Contents of test directory ==="
+          ls -la atomsci/ddm/test/integrative/ || echo "Directory not found"
+          echo ""
+          echo "=== Checking if coverage was even run ==="
+          which coverage || echo "coverage executable not found on PATH"  # diagnostics only: do not fail the job from this step
+          coverage --version || echo "Coverage not installed"
+
+      - name: Save coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-pytest-integrative-2
+          path: atomsci/ddm/test/integrative/**/.coverage*
+          include-hidden-files: true
+          if-no-files-found: error
+
   pytest-integrative-3:
     runs-on: ubuntu-24.04
     strategy:
@@ -194,10 +262,32 @@ jobs:
         env:
           ENV: test
 
-      - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v4
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+      - name: Debug - Find all coverage files
+        run: |
+          echo "=== Current directory ==="
+          pwd
+          echo ""
+          echo "=== Contents of current directory ==="
+          ls -la
+          echo ""
+          echo "=== Searching for ALL .coverage files recursively ==="
+          find . -name ".coverage*" -type f 2>/dev/null | grep . || echo "No .coverage files found"  # find exits 0 even with no matches; grep . makes the fallback fire
+          echo ""
+          echo "=== Contents of test directory ==="
+          ls -la atomsci/ddm/test/integrative/ || echo "Directory not found"
+          echo ""
+          echo "=== Checking if coverage was even run ==="
+          which coverage || echo "coverage executable not found on PATH"  # diagnostics only: do not fail the job from this step
+          coverage --version || echo "Coverage not installed"
+
+      - name: Save coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-pytest-integrative-3
+          path: atomsci/ddm/test/integrative/**/.coverage*
+          include-hidden-files: true
+          if-no-files-found: error
+
   pytest-integrative-4:
     runs-on: ubuntu-24.04
     strategy:
@@ -241,7 +331,70 @@ jobs:
         env:
           ENV: test
 
-      - name: Upload coverage reports to Codecov
+      - name: Debug - Find all coverage files
+        run: |
+          echo "=== Current directory ==="
+          pwd
+          echo ""
+          echo "=== Contents of current directory ==="
+          ls -la
+          echo ""
+          echo "=== Searching for ALL .coverage files recursively ==="
+          find . -name ".coverage*" -type f 2>/dev/null | grep . || echo "No .coverage files found"  # find exits 0 even with no matches; grep . makes the fallback fire
+          echo ""
+          echo "=== Contents of test directory ==="
+          ls -la atomsci/ddm/test/integrative/ || echo "Directory not found"
+          echo ""
+          echo "=== Checking if coverage was even run ==="
+          which coverage || echo "coverage executable not found on PATH"  # diagnostics only: do not fail the job from this step
+          coverage --version || echo "Coverage not installed"
+
+      - name: Save coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-pytest-integrative-4
+          path: atomsci/ddm/test/integrative/**/.coverage*
+          include-hidden-files: true
+          if-no-files-found: error
+
+  coverage-merge:
+    runs-on: ubuntu-24.04
+    needs: [pytest-unit, pytest-integrative-1, pytest-integrative-2, pytest-integrative-3, pytest-integrative-4]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+      
+      - name: Install coverage
+        run: pip install coverage
+      
+      - name: Download all coverage artifacts
+        uses: actions/download-artifact@v5
+        with:
+          path: coverage-reports
+      
+      - name: Merge coverage reports
+        run: |
+          # Show the workspace and the artifact download directory for debugging
+          ls -la
+          ls -la coverage-reports/ || echo "coverage-reports directory not found"
+          
+          # List every coverage data file found under the download directory
+          find coverage-reports -name ".coverage*" -type f
+          
+          # Merge the data files listed above into one combined data set
+          coverage combine $(find coverage-reports -name ".coverage*" -type f -print)
+          
+          # Generate the XML report (./coverage.xml) that the Codecov upload step reads
+          coverage xml
+      
+      - name: Upload merged coverage to Codecov
         uses: codecov/codecov-action@v4
+        with:
+          files: ./coverage.xml
+          flags: unittests
+          name: codecov-umbrella
         env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/atomsci/ddm/test/integrative/curation_funcs/test_curation_funcs.py b/atomsci/ddm/test/integrative/curation_funcs/test_curation_funcs.py
@@ -33,42 +33,48 @@ def write_to_file(filt_df, filt_file):
     print(f"Wrote outlier-filtered data to {filt_file}")
     return filt_df
 
-def test_remove_outlier_replicates():
-    """Test outlier removal using curate_data.remove_outlier_replicates"""
+def create_raw_and_filt_file():
+    """Remove outlier replicates from the raw data, write the result to filt_file and return (raw_df, filt_df)"""
     raw_df = get_raw_data()
-    print(f"Raw data has {len(raw_df)} rows, {len(set(raw_df.base_rdkit_smiles.values))} unique compounds")
     filt_df = curate_data.remove_outlier_replicates(raw_df, response_col='log_efflux_ratio', id_col='base_rdkit_smiles',
                                                     max_diff_from_median=0.5)
+    write_to_file(filt_df, filt_file)
+
+    return raw_df, filt_df
+
+def test_remove_outlier_replicates(capsys):
+    """Test outlier removal using curate_data.remove_outlier_replicates, including its message about rows with missing values"""
+    # Clean up old files
+    clean()
+
+    raw_df, filt_df = create_raw_and_filt_file()
+
+    captured = capsys.readouterr()  # output captured so far, which includes anything printed during the call above
+    assert 'Removed 1 rows with missing log_efflux_ratio values' in captured.out, "Error: expected message about removed rows with missing values"
     n_filt_rows = len(filt_df)
     n_filt_cmpds = len(set(filt_df.base_rdkit_smiles.values))
     print(f"Filtered data has {n_filt_rows} rows, {n_filt_cmpds} unique compounds")
     assert (n_filt_rows == 1093), "Error: expected 1093 rows in filtered data"
     assert (n_filt_cmpds == 803), "Error: expected 803 unique compounds in filtered data"
     n_removed = len(raw_df) - n_filt_rows
-    assert (n_removed == 7), f"Error: {n_removed} rows were removed, expected 7"
-
-    write_to_file(filt_df, filt_file)
+    assert (n_removed == 8), f"Error: {n_removed} rows were removed, expected 8"  # includes the 1 row dropped for a missing response value
 
 def test_aggregate_assay_data():
     """Test curate_data.aggregate_assay_data, the preferred function for averaging replicate values over compounds"""
-    if not os.path.exists(filt_file):
-        test_remove_outlier_replicates()
-    else:
-        try:
-            filt_df = pd.read_csv(filt_file)
-            agg_df = curate_data.aggregate_assay_data(filt_df, value_col='log_efflux_ratio', label_actives=False,
-                                              id_col='compound_id', smiles_col='base_rdkit_smiles', relation_col='relation')
-            n_agg_rows = len(agg_df)
-            n_agg_cmpds = len(set(agg_df.base_rdkit_smiles.values))
-            print(f"Aggregated data has {n_agg_rows} rows, {n_agg_cmpds} unique compounds")
-            assert (n_agg_rows == 803), "Error: expected 803 rows in aggregated data"
-            assert (n_agg_cmpds == 803), "Error: expected 803 unique compounds in aggregated data"
-
-            agg_file = f"{script_path}/{test_file_prefix}-aggregated.csv"
-            agg_df.to_csv(agg_file, index=False)
-            print(f"Wrote aggregated data to {agg_file}")
-        except Exception as e:
-            pytest.fail(f"Could not read file {filt_file}: {e}")
+    # Clean up old files
+    clean()
+    raw_df, filt_df = create_raw_and_filt_file()  # only filt_df is used below; raw_df is not needed by this test
+    agg_df = curate_data.aggregate_assay_data(filt_df, value_col='log_efflux_ratio', label_actives=False,
+                                        id_col='compound_id', smiles_col='base_rdkit_smiles', relation_col='relation')
+    n_agg_rows = len(agg_df)
+    n_agg_cmpds = len(set(agg_df.base_rdkit_smiles.values))
+    print(f"Aggregated data has {n_agg_rows} rows, {n_agg_cmpds} unique compounds")
+    assert (n_agg_rows == 803), "Error: expected 803 rows in aggregated data"
+    assert (n_agg_cmpds == 803), "Error: expected 803 unique compounds in aggregated data"
+
+    agg_file = f"{script_path}/{test_file_prefix}-aggregated.csv"
+    agg_df.to_csv(agg_file, index=False)
+    print(f"Wrote aggregated data to {agg_file}")
 
 def test_average_and_remove_duplicates():
     """Test outlier removal and averaging using deprecated curation function"""
@@ -90,23 +96,3 @@ def test_average_and_remove_duplicates():
     curated_df.to_csv(curated_file, index=False)
     print(f"Wrote curated data to {curated_file}")
 
-
-def test():
-    """Test data curation functions"""
-
-    # Clean up old files
-    clean()
-
-    # Filter out outliers (preferred method)
-    test_remove_outlier_replicates()
-
-    # Average replicate values per compound (preferred method)
-    test_aggregate_assay_data()
-    
-    # Remove outliers and average over replicates (old method)
-    test_average_and_remove_duplicates()
-    
-
-
-if __name__ == '__main__':
-    test()
diff --git a/atomsci/ddm/test/test_datasets/pGP_MDCK_efflux_ratio_chembl29.csv b/atomsci/ddm/test/test_datasets/pGP_MDCK_efflux_ratio_chembl29.csv
@@ -1099,3 +1099,4 @@ activity_id,compound_id,base_rdkit_smiles,relation,efflux_ratio,log_efflux_ratio
 20668378,CHEMBL3892073,NC(=O)c1cc(-c2ccc(F)cc2)c2ccc(CN3C(=O)CCC3=O)cc2n1,=,7.0,0.8450980400142568
 20679457,CHEMBL4644751,C[C@H](C(=O)Nc1ccc(F)cc1)C12CC(NC(=O)c3ccc(F)c(F)c3)(C1)C2,=,1.3,0.11394335230683678
 20717398,CHEMBL4643105,COc1cc(/C=C2\CCCN3C2=NO[C@H](c2cc(F)cc(F)c2)[C@@H]3C)ccc1-n1cnc(C)c1,=,2.46,0.39093510710337914
+20717398,CHEMBL4643105,COc1cc(/C=C2\CCCN3C2=NO[C@H](c2cc(F)cc(F)c2)[C@@H]3C)ccc1-n1cnc(C)c1,=,,
diff --git a/atomsci/ddm/utils/curate_data.py b/atomsci/ddm/utils/curate_data.py
@@ -519,10 +519,16 @@ def remove_outlier_replicates(df, response_col='pIC50', id_col='compound_id', ma
         max_diff_from_median (float): Maximum absolute difference from median value allowed for retained replicates.
 
     Returns:
-        result_df (DataFrame): Filtered data frame with outlier replicates removed.
+        result_df (DataFrame): Filtered data frame with outlier replicates removed. Rows with NaN values in the 
+            response column are removed as a preprocessing step before outlier detection.
 
     """
-
+    
+    prev_len = len(df)
+    df = df.dropna(subset=[response_col])
+    if prev_len != len(df):
+        print(f"Removed {prev_len - len(df)} rows with missing {response_col} values")
+    
     fr_df = freq_table(df, id_col, min_freq=2)
     rep_ids = fr_df[id_col].values.tolist()
     has_rep_df = df[df[id_col].isin(rep_ids)]