Skip to content

Commit d3c0f7e

Browse files
Merge pull request #79 from PopHIVE/build_docs
Build docs
2 parents 1b8f10d + ce71e4b commit d3c0f7e

File tree

143 files changed

+55721
-68388
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+55721
-68388
lines changed

.claude/settings.local.json

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,23 @@
2525
"Bash(powershell -Command \"gzip -d -c ''c:\\\\Users\\\\DMW63\\\\OneDrive - Yale University\\\\Desktop\\\\R_Projects\\\\POPHIVE PROJECT\\\\ingest\\\\resources\\\\all_fips.csv.gz'' | Select-Object -First 25\")",
2626
"Bash(powershell -Command \"gzip -d -c 'c:\\\\Users\\\\DMW63\\\\OneDrive - Yale University\\\\Desktop\\\\R_Projects\\\\POPHIVE PROJECT\\\\ingest\\\\resources\\\\all_fips.csv.gz' | Select-String -Pattern '^[0-9]{2}[^0-9]' | Select-Object -First 60\":*)",
2727
"Bash(zcat:*)",
28-
"Bash(\"C:\\\\Program Files\\\\R\\\\R-4.4.1\\\\bin\\\\Rscript.exe\" -e:*)"
28+
"Bash(\"C:\\\\Program Files\\\\R\\\\R-4.4.1\\\\bin\\\\Rscript.exe\" -e:*)",
29+
"Bash(for file in data/schoolvaxview/standard/*.csv.gz)",
30+
"Bash(for file in data/measles_cdc/standard/*.csv.gz)",
31+
"Bash(for file in data/epic/standard/*.csv.gz)",
32+
"Bash(for file in data/wastewater_measles/standard/*.csv.gz)",
33+
"Bash(for file in data/delphi_ili_fluview/standard/*.csv.gz)",
34+
"WebFetch(domain:github.com)",
35+
"WebFetch(domain:pophive.github.io)",
36+
"WebFetch(domain:raw.githubusercontent.com)",
37+
"WebFetch(domain:api.github.com)",
38+
"Bash(powershell -Command:*)",
39+
"Bash(python:*)",
40+
"Bash(python3:*)",
41+
"Bash(cmd.exe /c \"Rscript -e \"\"dcf::dcf_check\\(\\)\"\"\")",
42+
"Bash(cmd.exe /c \"cd /d \"\"c:\\\\Users\\\\DMW63\\\\OneDrive - Yale University\\\\Desktop\\\\R_Projects\\\\POPHIVE PROJECT\\\\ingest\"\" && \"\"C:\\\\Program Files\\\\R\\\\R-4.3.0\\\\bin\\\\Rscript.exe\"\" -e \"\"dcf::dcf_check\\(\\)\"\"\")",
43+
"Bash(\"C:/Program Files/R/R-4.4.2/bin/Rscript.exe\" -e \"dcf::dcf_add_source\\(''schoolvax_washpost''\\)\")",
44+
"Bash(gzip:*)"
2945
]
3046
}
3147
}

.github/workflows/build_docs.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: Build Documentation
2+
3+
on:
4+
push:
5+
paths:
6+
- 'data/*/measure_info.json'
7+
- 'data/*/standard/*.csv.gz'
8+
- 'scripts/build_docs.R'
9+
branches: [main]
10+
workflow_dispatch:
11+
12+
jobs:
13+
build-docs:
14+
runs-on: ubuntu-latest
15+
permissions:
16+
contents: write
17+
steps:
18+
- uses: actions/checkout@v4
19+
with:
20+
fetch-depth: 0
21+
22+
- uses: r-lib/actions/setup-r@v2
23+
24+
- name: Install R dependencies
25+
run: |
26+
install.packages(c("jsonlite", "vroom", "htmltools", "glue"))
27+
shell: Rscript {0}
28+
29+
- name: Build documentation
30+
run: Rscript scripts/build_docs.R
31+
32+
- name: Check for changes
33+
id: diff
34+
run: |
35+
if git diff --exit-code docs/; then
36+
echo "changed=false" >> $GITHUB_OUTPUT
37+
else
38+
echo "changed=true" >> $GITHUB_OUTPUT
39+
fi
40+
41+
- name: Commit and push
42+
if: ${{ steps.diff.outputs.changed == 'true' }}
43+
run: |
44+
git config user.email "actions@github.com"
45+
git config user.name "GitHub Actions"
46+
git add docs/
47+
git commit -m "Update data documentation"
48+
git push

CLAUDE.md

Lines changed: 97 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ PopHIVE/Ingest/
155155

156156
```r
157157
# Create new data source folder structure
158+
### Important!! When adding a new data source, you MUST run this function. Otherwise the process.json files will not be initialized correctly, causing the pipeline to fail
158159
dcf::dcf_add_source("source_name")
159160

160161
# Initialize processing record for tracking changes
@@ -330,6 +331,8 @@ if (!identical(process$raw_state, raw_state)) {
330331

331332
## measure_info.json Template
332333

334+
Each `measure_info.json` file should include variable definitions and a centralized `_sources` object. Variables reference sources by ID.
335+
333336
```json
334337
{
335338
"variable_name": {
@@ -343,23 +346,47 @@ if (!identical(process$raw_state, raw_state)) {
343346
"measure_type": "Incidence|Prevalence|Rate|Percent|Count",
344347
"unit": "Cases per 100,000|Percent|Count",
345348
"time_resolution": "Week|Month|Year",
346-
"restrictions": "Non-commercial purposes|Attribution required|None",
347-
"sources": [
348-
{
349-
"name": "Source organization",
350-
"url": "https://data.source.url"
351-
}
352-
],
349+
"sources": [{ "id": "source_id" }],
353350
"citations": [
354351
{
355352
"title": "Publication title",
356353
"url": "https://doi.org/..."
357354
}
358355
]
356+
},
357+
358+
"_sources": {
359+
"source_id": {
360+
"name": "Full source name",
361+
"url": "https://data.source.url",
362+
"organization": "Organization name",
363+
"organization_url": "https://organization.url",
364+
"location": "Specific dataset location (optional)",
365+
"location_url": "https://specific.dataset.url (optional)",
366+
"description": "Detailed narrative description of the data source, including methodology, coverage, limitations, and any important caveats for users.",
367+
"restrictions": "License and usage restrictions. Examples: 'Public domain. CDC data is generally not subject to copyright restrictions.' or 'CC BY 4.0. Attribution required for reuse.' or 'Attribution required. Cite [citation].'",
368+
"date_accessed": 2025
369+
}
359370
}
360371
}
361372
```
362373

374+
### _sources Field Requirements
375+
376+
Every `_sources` entry MUST include:
377+
- **name**: Full name of the data source
378+
- **url**: Primary URL for the data source
379+
- **organization**: Name of the organization providing the data
380+
- **organization_url**: URL for the organization
381+
- **description**: Narrative description of the source (methodology, coverage, limitations)
382+
- **restrictions**: License and usage restrictions
383+
384+
Special restriction wording:
385+
- **Epic Cosmos**: "The data can be re-used with appropriate attribution. A suggested citation relating to this data is 'Results of research performed with Epic Cosmos were obtained from the PopHIVE platform (https://github.com/PopHIVE/Ingest).'"
386+
- **Google Health Trends**: "Data can be reused with attribution of data from the Google Health Trends API, obtained via the PopHIVE platform (https://github.com/PopHIVE/Ingest)."
387+
- **CDC/CMS data**: "Public domain. CDC data is generally not subject to copyright restrictions."
388+
- **Academic publications**: "Attribution required. Cite [full citation]."
389+
363390
---
364391

365392
## Common Data Source Patterns
@@ -527,13 +554,31 @@ data %>%
527554
```
528555

529556
### Issue: Error "process file process.json does not exist"
530-
```r
531-
# Problem: dcf::dcf_process_record() fails on first run of new data source
532-
# Solution: Check if process.json exists before calling dcf_process_record()
533-
if (!file.exists("process.json")) {
534-
process <- list(raw_state = NULL)
535-
} else {
536-
process <- dcf::dcf_process_record()
557+
This is caused by failure to initialize a new data source with `dcf::dcf_add_source()`. If this is not done, the process.json file is not properly initialized.
558+
559+
**Preferred solution**: Run `dcf::dcf_add_source("source_name")` to create the folder structure properly.
560+
561+
**Manual fix**: If you need to create the process.json manually, use this structure (replace `source_name` with your data folder name):
562+
563+
```json
564+
{
565+
"name": "source_name",
566+
"type": "source",
567+
"scripts": [
568+
{
569+
"path": "ingest.R",
570+
"manual": false,
571+
"frequency": 0,
572+
"last_run": "",
573+
"run_time": "",
574+
"last_status": {
575+
"log": "",
576+
"success": true
577+
}
578+
}
579+
],
580+
"checked": "",
581+
"check_results": []
537582
}
538583
```
539584

@@ -555,6 +600,34 @@ dcf::dcf_process("source_name")
555600
# Also ensure project.Rproj and README.md exist in the source folder
556601
```
557602

603+
### Issue: Error "vec_math.arrow_binary() not implemented" when running dcf_process()
604+
605+
This error occurs when a script works fine when run directly but fails via `dcf_process()`. The cause is vroom's Arrow ALTREP (lazy loading) backend:
606+
607+
- **When running directly**: Interactive sessions may materialize data earlier or have different environment state
608+
- **When running via dcf_process()**: Scripts run in a cleaner context where Arrow ALTREP stays active, keeping columns as Arrow binary types until an operation forces materialization
609+
610+
The error typically triggers when using `if_else()` with mixed types (e.g., comparing integers with Arrow-backed columns) or when `cdlTools::fips()` returns integers that get mixed with other types.
611+
612+
```r
613+
# Problem: cdlTools::fips() with if_else causes Arrow type issues
614+
mutate(geography = cdlTools::fips(statename, to='FIPS'),
615+
geography = if_else(statename=='United States', 0, geography))
616+
617+
# Solution: Use FIPS lookup merge instead (also faster)
618+
all_fips <- vroom::vroom("../../resources/all_fips.csv.gz", show_col_types = FALSE)
619+
state_fips_lookup <- all_fips %>%
620+
filter(nchar(geography) == 2) %>%
621+
select(geography, geography_name)
622+
623+
data <- data %>%
624+
left_join(state_fips_lookup, by = c("statename" = "geography_name")) %>%
625+
mutate(geography = if_else(statename == 'United States', "00", geography))
626+
627+
# Alternative: Disable Arrow ALTREP (less preferred)
628+
data <- vroom::vroom("file.csv.xz", show_col_types = FALSE, altrep = FALSE)
629+
```
630+
558631
### Issue: Connecticut county FIPS codes not matching
559632

560633
Connecticut abolished its 8 counties in 2022 and replaced them with 9 planning regions as county-equivalents. This means:
@@ -609,6 +682,9 @@ dcf::dcf_build()
609682
# Validate standard file format
610683
source("scripts/validate_standard.R")
611684
validate_standard_file("data/source_name/standard/data.csv.gz")
685+
686+
# Rebuild data source documentation (generates docs/index.html)
687+
Rscript scripts/build_docs.R
612688
```
613689

614690
---
@@ -641,7 +717,13 @@ validate_standard_file("data/source_name/standard/data.csv.gz")
641717
dcf::dcf_process("bundle_category", ".")
642718
```
643719

644-
8. **Commit changes**: Include raw data sample, ingest.R, measure_info.json, standard output
720+
8. **Update documentation**: The data source documentation is auto-generated from `measure_info.json` files
721+
```r
722+
Rscript scripts/build_docs.R
723+
```
724+
This generates `docs/index.html` with variable tables and source information. The GitHub Action will also rebuild docs automatically when `measure_info.json` files change.
725+
726+
9. **Commit changes**: Include raw data sample, ingest.R, measure_info.json, standard output, and updated docs/
645727

646728
---
647729

data/NREVSS/measure_info.json

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -187,14 +187,17 @@
187187
"measure_type": "week",
188188
"unit": "week",
189189
"time_resolution": "",
190-
"sources": [
191-
{
192-
"name": "Centers for Disease Control and Prevention",
193-
"url": "https://data.cdc.gov",
194-
"location": "Percent Positivity of Respiratory Syncytial Virus Nucleic Acid Amplification Tests by HHS Region, National Respiratory and Enteric Virus Surveillance System",
195-
"location_url": "https://data.cdc.gov/resource/3cxc-4k8q",
196-
"date_accessed": 2025
197-
}
198-
]
190+
"sources": [{ "id": "nrevss" }]
191+
},
192+
193+
"_sources": {
194+
"nrevss": {
195+
"name": "National Respiratory and Enteric Virus Surveillance System (NREVSS)",
196+
"url": "https://data.cdc.gov/resource/3cxc-4k8q",
197+
"organization": "Centers for Disease Control and Prevention",
198+
"organization_url": "https://www.cdc.gov/surveillance/nrevss/",
199+
"description": "The National Respiratory and Enteric Virus Surveillance System (NREVSS) is a voluntary, laboratory-based surveillance system that monitors temporal and geographic trends for respiratory syncytial virus (RSV), human parainfluenza viruses, respiratory adenoviruses, human metapneumovirus, human coronaviruses, and rotavirus circulation in the United States. Participating laboratories report weekly to CDC on the number of tests performed and the number positive for each virus. NREVSS data are used to characterize seasonal patterns of these viruses and to help public health officials anticipate and prepare for outbreaks. Data are aggregated at the HHS regional and national levels. The system has been operational since 1987 and includes approximately 300 participating laboratories across the United States.",
200+
"restrictions": "Public domain. CDC data is generally not subject to copyright restrictions."
199201
}
200202
}
203+
}

data/NREVSS/process.json

Lines changed: 23 additions & 2 deletions
Large diffs are not rendered by default.

data/NREVSS/standard/datapackage.json

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"filename": "data.csv.gz",
1818
"versions": {
1919
"hash": [
20+
"b0cd5e888daf4a63d94d1370140226c7b8dcd812",
2021
"4bf7a93442011f06a6d0911b452537d3e687b01c",
2122
"0e1dbc341fd649cbb7039f7c72fd26207bec7331",
2223
"99cbd8c9ec417a7f9687ef9496149ef012a972b0",
@@ -52,6 +53,7 @@
5253
"0ca5a1658309a86937b5870cdd8e65c3737615b8"
5354
],
5455
"author": [
56+
"danweinberger <daniel.weinberger@yale.edu>",
5557
"GitHub Actions <actions@github.com>",
5658
"GitHub Actions <actions@github.com>",
5759
"GitHub Actions <actions@github.com>",
@@ -87,6 +89,7 @@
8789
"weinbergerlab <dweinber@gmail.com>"
8890
],
8991
"date": [
92+
"Sun Feb 1 22:36:14 2026 -0500",
9093
"Fri Jan 23 14:44:05 2026 +0000",
9194
"Fri Jan 16 14:42:17 2026 +0000",
9295
"Fri Jan 9 14:43:14 2026 +0000",
@@ -122,6 +125,7 @@
122125
"Fri Jul 11 11:35:21 2025 -0400"
123126
],
124127
"message": [
128+
"update respiratory files",
125129
"scheduled data build",
126130
"scheduled data build",
127131
"scheduled data build",
@@ -174,8 +178,8 @@
174178
"id_length": 0,
175179
"time": "time",
176180
"profile": "data-resource",
177-
"created": "2026-02-01 22:15:31.77317",
178-
"last_modified": "2025-07-14 09:31:21.119787",
181+
"created": "2026-02-01 22:15:31",
182+
"last_modified": "2025-07-11 11:04:59.776478",
179183
"vintage": {},
180184
"row_count": 3333,
181185
"entity_count": 11,
@@ -780,14 +784,21 @@
780784
"unit": "week",
781785
"sources": [
782786
{
783-
"name": "Centers for Disease Control and Prevention",
784-
"url": "https://data.cdc.gov",
785-
"location": "Percent Positivity of Respiratory Syncytial Virus Nucleic Acid Amplification Tests by HHS Region, National Respiratory and Enteric Virus Surveillance System",
786-
"location_url": "https://data.cdc.gov/resource/3cxc-4k8q",
787-
"date_accessed": 2025
787+
"id": "nrevss"
788788
}
789789
],
790790
"id": "week"
791+
},
792+
"_sources": {
793+
"nrevss": {
794+
"name": "National Respiratory and Enteric Virus Surveillance System (NREVSS)",
795+
"url": "https://data.cdc.gov/resource/3cxc-4k8q",
796+
"organization": "Centers for Disease Control and Prevention",
797+
"organization_url": "https://www.cdc.gov/surveillance/nrevss/",
798+
"description": "The National Respiratory and Enteric Virus Surveillance System (NREVSS) is a voluntary, laboratory-based surveillance system that monitors temporal and geographic trends for respiratory syncytial virus (RSV), human parainfluenza viruses, respiratory adenoviruses, human metapneumovirus, human coronaviruses, and rotavirus circulation in the United States. Participating laboratories report weekly to CDC on the number of tests performed and the number positive for each virus. NREVSS data are used to characterize seasonal patterns of these viruses and to help public health officials anticipate and prepare for outbreaks. Data are aggregated at the HHS regional and national levels. The system has been operational since 1987 and includes approximately 300 participating laboratories across the United States.",
799+
"restrictions": "Public domain. CDC data is generally not subject to copyright restrictions."
800+
},
801+
"id": "_sources"
791802
}
792803
}
793804
}

0 commit comments

Comments
 (0)