Skip to content

Commit d3c0f7e

Browse files
Merge pull request #79 from PopHIVE/build_docs
Build docs
2 parents 1b8f10d + ce71e4b commit d3c0f7e

File tree

143 files changed

+55721
-68388
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+55721
-68388
lines changed

.claude/settings.local.json

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,23 @@
2525
"Bash(powershell -Command \"gzip -d -c ''c:\\\\Users\\\\DMW63\\\\OneDrive - Yale University\\\\Desktop\\\\R_Projects\\\\POPHIVE PROJECT\\\\ingest\\\\resources\\\\all_fips.csv.gz'' | Select-Object -First 25\")",
2626
"Bash(powershell -Command \"gzip -d -c 'c:\\\\Users\\\\DMW63\\\\OneDrive - Yale University\\\\Desktop\\\\R_Projects\\\\POPHIVE PROJECT\\\\ingest\\\\resources\\\\all_fips.csv.gz' | Select-String -Pattern '^[0-9]{2}[^0-9]' | Select-Object -First 60\":*)",
2727
"Bash(zcat:*)",
28-
"Bash(\"C:\\\\Program Files\\\\R\\\\R-4.4.1\\\\bin\\\\Rscript.exe\" -e:*)"
28+
"Bash(\"C:\\\\Program Files\\\\R\\\\R-4.4.1\\\\bin\\\\Rscript.exe\" -e:*)",
29+
"Bash(for file in data/schoolvaxview/standard/*.csv.gz)",
30+
"Bash(for file in data/measles_cdc/standard/*.csv.gz)",
31+
"Bash(for file in data/epic/standard/*.csv.gz)",
32+
"Bash(for file in data/wastewater_measles/standard/*.csv.gz)",
33+
"Bash(for file in data/delphi_ili_fluview/standard/*.csv.gz)",
34+
"WebFetch(domain:github.com)",
35+
"WebFetch(domain:pophive.github.io)",
36+
"WebFetch(domain:raw.githubusercontent.com)",
37+
"WebFetch(domain:api.github.com)",
38+
"Bash(powershell -Command:*)",
39+
"Bash(python:*)",
40+
"Bash(python3:*)",
41+
"Bash(cmd.exe /c \"Rscript -e \"\"dcf::dcf_check\\(\\)\"\"\")",
42+
"Bash(cmd.exe /c \"cd /d \"\"c:\\\\Users\\\\DMW63\\\\OneDrive - Yale University\\\\Desktop\\\\R_Projects\\\\POPHIVE PROJECT\\\\ingest\"\" && \"\"C:\\\\Program Files\\\\R\\\\R-4.3.0\\\\bin\\\\Rscript.exe\"\" -e \"\"dcf::dcf_check\\(\\)\"\"\")",
43+
"Bash(\"C:/Program Files/R/R-4.4.2/bin/Rscript.exe\" -e \"dcf::dcf_add_source\\(''schoolvax_washpost''\\)\")",
44+
"Bash(gzip:*)"
2945
]
3046
}
3147
}

.github/workflows/build_docs.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: Build Documentation
2+
3+
on:
4+
push:
5+
paths:
6+
- 'data/*/measure_info.json'
7+
- 'data/*/standard/*.csv.gz'
8+
- 'scripts/build_docs.R'
9+
branches: [main]
10+
workflow_dispatch:
11+
12+
jobs:
13+
build-docs:
14+
runs-on: ubuntu-latest
15+
permissions:
16+
contents: write
17+
steps:
18+
- uses: actions/checkout@v4
19+
with:
20+
fetch-depth: 0
21+
22+
- uses: r-lib/actions/setup-r@v2
23+
24+
- name: Install R dependencies
25+
run: |
26+
install.packages(c("jsonlite", "vroom", "htmltools", "glue"))
27+
shell: Rscript {0}
28+
29+
- name: Build documentation
30+
run: Rscript scripts/build_docs.R
31+
32+
- name: Check for changes
33+
id: diff
34+
run: |
35+
if git diff --exit-code docs/; then
36+
echo "changed=false" >> $GITHUB_OUTPUT
37+
else
38+
echo "changed=true" >> $GITHUB_OUTPUT
39+
fi
40+
41+
- name: Commit and push
42+
if: ${{ steps.diff.outputs.changed == 'true' }}
43+
run: |
44+
git config user.email "actions@github.com"
45+
git config user.name "GitHub Actions"
46+
git add docs/
47+
git commit -m "Update data documentation"
48+
git push

CLAUDE.md

Lines changed: 97 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ PopHIVE/Ingest/
155155

156156
```r
157157
# Create new data source folder structure
158+
### Important!! When adding a new data source, you MUST run this function. Otherwise the process.json files will not be initialized correctly, causing the pipeline to fail
158159
dcf::dcf_add_source("source_name")
159160

160161
# Initialize processing record for tracking changes
@@ -330,6 +331,8 @@ if (!identical(process$raw_state, raw_state)) {
330331

331332
## measure_info.json Template
332333

334+
Each `measure_info.json` file should include variable definitions and a centralized `_sources` object. Variables reference sources by ID.
335+
333336
```json
334337
{
335338
"variable_name": {
@@ -343,23 +346,47 @@ if (!identical(process$raw_state, raw_state)) {
343346
"measure_type": "Incidence|Prevalence|Rate|Percent|Count",
344347
"unit": "Cases per 100,000|Percent|Count",
345348
"time_resolution": "Week|Month|Year",
346-
"restrictions": "Non-commercial purposes|Attribution required|None",
347-
"sources": [
348-
{
349-
"name": "Source organization",
350-
"url": "https://data.source.url"
351-
}
352-
],
349+
"sources": [{ "id": "source_id" }],
353350
"citations": [
354351
{
355352
"title": "Publication title",
356353
"url": "https://doi.org/..."
357354
}
358355
]
356+
},
357+
358+
"_sources": {
359+
"source_id": {
360+
"name": "Full source name",
361+
"url": "https://data.source.url",
362+
"organization": "Organization name",
363+
"organization_url": "https://organization.url",
364+
"location": "Specific dataset location (optional)",
365+
"location_url": "https://specific.dataset.url (optional)",
366+
"description": "Detailed narrative description of the data source, including methodology, coverage, limitations, and any important caveats for users.",
367+
"restrictions": "License and usage restrictions. Examples: 'Public domain. CDC data is generally not subject to copyright restrictions.' or 'CC BY 4.0. Attribution required for reuse.' or 'Attribution required. Cite [citation].'",
368+
"date_accessed": 2025
369+
}
359370
}
360371
}
361372
```
362373

374+
### _sources Field Requirements
375+
376+
Every `_sources` entry MUST include:
377+
- **name**: Full name of the data source
378+
- **url**: Primary URL for the data source
379+
- **organization**: Name of the organization providing the data
380+
- **organization_url**: URL for the organization
381+
- **description**: Narrative description of the source (methodology, coverage, limitations)
382+
- **restrictions**: License and usage restrictions
383+
384+
Special restriction wording:
385+
- **Epic Cosmos**: "The data can be re-used with appropriate attribution. A suggested citation relating to this data is 'Results of research performed with Epic Cosmos were obtained from the PopHIVE platform (https://github.com/PopHIVE/Ingest).'"
386+
- **Google Health Trends**: "Data can be reused with attribution of data from the Google Health Trends API, obtained via the PopHIVE platform (https://github.com/PopHIVE/Ingest)."
387+
- **CDC/CMS data**: "Public domain. CDC data is generally not subject to copyright restrictions."
388+
- **Academic publications**: "Attribution required. Cite [full citation]."
389+
363390
---
364391

365392
## Common Data Source Patterns
@@ -527,13 +554,31 @@ data %>%
527554
```
528555

529556
### Issue: Error "process file process.json does not exist"
530-
```r
531-
# Problem: dcf::dcf_process_record() fails on first run of new data source
532-
# Solution: Check if process.json exists before calling dcf_process_record()
533-
if (!file.exists("process.json")) {
534-
process <- list(raw_state = NULL)
535-
} else {
536-
process <- dcf::dcf_process_record()
557+
This is caused by failure to initialize a new data source with `dcf::dcf_add_source()`. If this is not done, the process.json file is not properly initialized.
558+
559+
**Preferred solution**: Run `dcf::dcf_add_source("source_name")` to create the folder structure properly.
560+
561+
**Manual fix**: If you need to create the process.json manually, use this structure (replace `source_name` with your data folder name):
562+
563+
```json
564+
{
565+
"name": "source_name",
566+
"type": "source",
567+
"scripts": [
568+
{
569+
"path": "ingest.R",
570+
"manual": false,
571+
"frequency": 0,
572+
"last_run": "",
573+
"run_time": "",
574+
"last_status": {
575+
"log": "",
576+
"success": true
577+
}
578+
}
579+
],
580+
"checked": "",
581+
"check_results": []
537582
}
538583
```
539584

@@ -555,6 +600,34 @@ dcf::dcf_process("source_name")
555600
# Also ensure project.Rproj and README.md exist in the source folder
556601
```
557602

603+
### Issue: Error "vec_math.arrow_binary() not implemented" when running dcf_process()
604+
605+
This error occurs when a script works fine when run directly but fails via `dcf_process()`. The cause is vroom's Arrow ALTREP (lazy loading) backend:
606+
607+
- **When running directly**: Interactive sessions may materialize data earlier or have different environment state
608+
- **When running via dcf_process()**: Scripts run in a cleaner context where Arrow ALTREP stays active, keeping columns as Arrow binary types until an operation forces materialization
609+
610+
The error typically triggers when using `if_else()` with mixed types (e.g., comparing integers with Arrow-backed columns) or when `cdlTools::fips()` returns integers that get mixed with other types.
611+
612+
```r
613+
# Problem: cdlTools::fips() with if_else causes Arrow type issues
614+
mutate(geography = cdlTools::fips(statename, to='FIPS'),
615+
geography = if_else(statename=='United States', 0, geography))
616+
617+
# Solution: Use FIPS lookup merge instead (also faster)
618+
all_fips <- vroom::vroom("../../resources/all_fips.csv.gz", show_col_types = FALSE)
619+
state_fips_lookup <- all_fips %>%
620+
filter(nchar(geography) == 2) %>%
621+
select(geography, geography_name)
622+
623+
data <- data %>%
624+
left_join(state_fips_lookup, by = c("statename" = "geography_name")) %>%
625+
mutate(geography = if_else(statename == 'United States', "00", geography))
626+
627+
# Alternative: Disable Arrow ALTREP (less preferred)
628+
data <- vroom::vroom("file.csv.xz", show_col_types = FALSE, altrep = FALSE)
629+
```
630+
558631
### Issue: Connecticut county FIPS codes not matching
559632

560633
Connecticut abolished its 8 counties in 2022 and replaced them with 9 planning regions as county-equivalents. This means:
@@ -609,6 +682,9 @@ dcf::dcf_build()
609682
# Validate standard file format
610683
source("scripts/validate_standard.R")
611684
validate_standard_file("data/source_name/standard/data.csv.gz")
685+
686+
# Rebuild data source documentation (generates docs/index.html)
687+
Rscript scripts/build_docs.R
612688
```
613689

614690
---
@@ -641,7 +717,13 @@ validate_standard_file("data/source_name/standard/data.csv.gz")
641717
dcf::dcf_process("bundle_category", ".")
642718
```
643719

644-
8. **Commit changes**: Include raw data sample, ingest.R, measure_info.json, standard output
720+
8. **Update documentation**: The data source documentation is auto-generated from `measure_info.json` files
721+
```r
722+
Rscript scripts/build_docs.R
723+
```
724+
This generates `docs/index.html` with variable tables and source information. The GitHub Action will also rebuild docs automatically when `measure_info.json` files change.
725+
726+
9. **Commit changes**: Include raw data sample, ingest.R, measure_info.json, standard output, and updated docs/
645727

646728
---
647729

data/NREVSS/measure_info.json

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -187,14 +187,17 @@
187187
"measure_type": "week",
188188
"unit": "week",
189189
"time_resolution": "",
190-
"sources": [
191-
{
192-
"name": "Centers for Disease Control and Prevention",
193-
"url": "https://data.cdc.gov",
194-
"location": "Percent Positivity of Respiratory Syncytial Virus Nucleic Acid Amplification Tests by HHS Region, National Respiratory and Enteric Virus Surveillance System",
195-
"location_url": "https://data.cdc.gov/resource/3cxc-4k8q",
196-
"date_accessed": 2025
197-
}
198-
]
190+
"sources": [{ "id": "nrevss" }]
191+
},
192+
193+
"_sources": {
194+
"nrevss": {
195+
"name": "National Respiratory and Enteric Virus Surveillance System (NREVSS)",
196+
"url": "https://data.cdc.gov/resource/3cxc-4k8q",
197+
"organization": "Centers for Disease Control and Prevention",
198+
"organization_url": "https://www.cdc.gov/surveillance/nrevss/",
199+
"description": "The National Respiratory and Enteric Virus Surveillance System (NREVSS) is a voluntary, laboratory-based surveillance system that monitors temporal and geographic trends for respiratory syncytial virus (RSV), human parainfluenza viruses, respiratory adenoviruses, human metapneumovirus, human coronaviruses, and rotavirus circulation in the United States. Participating laboratories report weekly to CDC on the number of tests performed and the number positive for each virus. NREVSS data are used to characterize seasonal patterns of these viruses and to help public health officials anticipate and prepare for outbreaks. Data are aggregated at the HHS regional and national levels. The system has been operational since 1987 and includes approximately 300 participating laboratories across the United States.",
200+
"restrictions": "Public domain. CDC data is generally not subject to copyright restrictions."
199201
}
200202
}
203+
}

data/NREVSS/process.json

Lines changed: 23 additions & 2 deletions
Large diffs are not rendered by default.

data/NREVSS/standard/datapackage.json

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"filename": "data.csv.gz",
1818
"versions": {
1919
"hash": [
20+
"b0cd5e888daf4a63d94d1370140226c7b8dcd812",
2021
"4bf7a93442011f06a6d0911b452537d3e687b01c",
2122
"0e1dbc341fd649cbb7039f7c72fd26207bec7331",
2223
"99cbd8c9ec417a7f9687ef9496149ef012a972b0",
@@ -52,6 +53,7 @@
5253
"0ca5a1658309a86937b5870cdd8e65c3737615b8"
5354
],
5455
"author": [
56+
"danweinberger <daniel.weinberger@yale.edu>",
5557
"GitHub Actions <actions@github.com>",
5658
"GitHub Actions <actions@github.com>",
5759
"GitHub Actions <actions@github.com>",
@@ -87,6 +89,7 @@
8789
"weinbergerlab <dweinber@gmail.com>"
8890
],
8991
"date": [
92+
"Sun Feb 1 22:36:14 2026 -0500",
9093
"Fri Jan 23 14:44:05 2026 +0000",
9194
"Fri Jan 16 14:42:17 2026 +0000",
9295
"Fri Jan 9 14:43:14 2026 +0000",
@@ -122,6 +125,7 @@
122125
"Fri Jul 11 11:35:21 2025 -0400"
123126
],
124127
"message": [
128+
"update respiratory files",
125129
"scheduled data build",
126130
"scheduled data build",
127131
"scheduled data build",
@@ -174,8 +178,8 @@
174178
"id_length": 0,
175179
"time": "time",
176180
"profile": "data-resource",
177-
"created": "2026-02-01 22:15:31.77317",
178-
"last_modified": "2025-07-14 09:31:21.119787",
181+
"created": "2026-02-01 22:15:31",
182+
"last_modified": "2025-07-11 11:04:59.776478",
179183
"vintage": {},
180184
"row_count": 3333,
181185
"entity_count": 11,
@@ -780,14 +784,21 @@
780784
"unit": "week",
781785
"sources": [
782786
{
783-
"name": "Centers for Disease Control and Prevention",
784-
"url": "https://data.cdc.gov",
785-
"location": "Percent Positivity of Respiratory Syncytial Virus Nucleic Acid Amplification Tests by HHS Region, National Respiratory and Enteric Virus Surveillance System",
786-
"location_url": "https://data.cdc.gov/resource/3cxc-4k8q",
787-
"date_accessed": 2025
787+
"id": "nrevss"
788788
}
789789
],
790790
"id": "week"
791+
},
792+
"_sources": {
793+
"nrevss": {
794+
"name": "National Respiratory and Enteric Virus Surveillance System (NREVSS)",
795+
"url": "https://data.cdc.gov/resource/3cxc-4k8q",
796+
"organization": "Centers for Disease Control and Prevention",
797+
"organization_url": "https://www.cdc.gov/surveillance/nrevss/",
798+
"description": "The National Respiratory and Enteric Virus Surveillance System (NREVSS) is a voluntary, laboratory-based surveillance system that monitors temporal and geographic trends for respiratory syncytial virus (RSV), human parainfluenza viruses, respiratory adenoviruses, human metapneumovirus, human coronaviruses, and rotavirus circulation in the United States. Participating laboratories report weekly to CDC on the number of tests performed and the number positive for each virus. NREVSS data are used to characterize seasonal patterns of these viruses and to help public health officials anticipate and prepare for outbreaks. Data are aggregated at the HHS regional and national levels. The system has been operational since 1987 and includes approximately 300 participating laboratories across the United States.",
799+
"restrictions": "Public domain. CDC data is generally not subject to copyright restrictions."
800+
},
801+
"id": "_sources"
791802
}
792803
}
793804
}

0 commit comments

Comments
 (0)