Skip to content

Commit b4e911e

Browse files
authored
Merge pull request #202 from OpenSourceAP/CleaningSetup
Tidying full repo on a full run
2 parents 92033e6 + 164dbea commit b4e911e

39 files changed

+915
-23865
lines changed

.gitignore

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,13 @@ deleteme*
55
Signals/Debug/*
66
Signals/pyCode/Debug/*
77
Signals/zzz-archive/*
8-
Signals/Human/*
98
Signals/pyCode/dev/*
109

11-
# Testing exceptions
12-
!Signals/Logs/testout_placebos.md
13-
!Signals/Logs/testout_predictors.md
14-
1510
.Rproj.user
1611
.Rhistory
1712
.RData
1813
.Ruserdata
14+
.Rapp.history
1915

2016
*.Rprofile
2117
*.Rproj
@@ -30,6 +26,7 @@ Portfolios/Data/*
3026
Portfolios/pyData/*
3127
Signals/Data/*
3228
Signals/pyData/*
29+
Shipping/Data/*
3330

3431
# Credentials
3532
.env
@@ -45,4 +42,5 @@ __*__*
4542
# Other
4643
Portfolios/Code/Rplots.pdf
4744
.git
45+
AGENTS.md
4846

AGENTS.md

Lines changed: 0 additions & 36 deletions
This file was deleted.

Portfolios/Code/00_SettingsAndTools.R

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# """
2+
# Inputs: SignalDoc.csv, portfolio configuration globals, package dependencies
3+
# Outputs: Initialized paths, loaded documentation, and helper functions for downstream R scripts
4+
# How to run: source('Portfolios/Code/00_SettingsAndTools.R', echo = TRUE)
5+
# Example: source('Portfolios/Code/00_SettingsAndTools.R', echo = TRUE)
6+
# """
7+
18
#### GLOBAL SETTINGS
29

310
options(dplyr.summarise.inform = FALSE)
@@ -192,6 +199,54 @@ checkSignals = function(docs = alldocumentation, pathProj = pathProject) {
192199

193200
} # end function
194201

202+
# Check that every signal documented in SignalDoc.csv has a corresponding CSV
# in the signal output folders, and report any undocumented extras.
#
# Args:
#   path_proj     - project root containing SignalDoc.csv (default: global pathProject)
#   signal_source - 'Python' checks Signals/pyData/, anything else checks
#                   Signals/Data/ (default: global SignalSource)
#
# Side effects: message()s a summary, warning()s if expected CSVs are missing,
#   stop()s if SignalDoc.csv or the signal directories do not exist.
# Returns: invisibly, TRUE when no documented signal is missing, FALSE otherwise.
check_signal_csvs <- function(path_proj = pathProject, signal_source = SignalSource) {
  signal_doc_path <- file.path(path_proj, 'SignalDoc.csv')
  signal_folder <- if (identical(signal_source, 'Python')) 'pyData' else 'Data'
  predictors_dir <- file.path(path_proj, 'Signals', signal_folder, 'Predictors')
  placebos_dir <- file.path(path_proj, 'Signals', signal_folder, 'Placebos')

  if (!file.exists(signal_doc_path)) {
    stop('SignalDoc.csv not found. Please verify pathProject is set correctly.')
  }

  if (!dir.exists(predictors_dir) || !dir.exists(placebos_dir)) {
    # report the folder actually checked (pyData vs Data), not a hard-coded path
    stop(sprintf('Signals/%s/Predictors or Signals/%s/Placebos not found. Please run the signals pipeline first.',
                 signal_folder, signal_folder))
  }

  signal_doc <- read.csv(signal_doc_path, stringsAsFactors = FALSE)

  # Expected = documented acronyms, excluding blank/NA rows and Cat.Signal == 'Drop'.
  # NA comparisons yield NA, so filter NAs explicitly rather than rely on sort() dropping them.
  keep <- !is.na(signal_doc$Acronym) & signal_doc$Acronym != '' &
    !is.na(signal_doc$Cat.Signal) & signal_doc$Cat.Signal != 'Drop'
  expected_signals <- sort(unique(signal_doc$Acronym[keep]))

  predictor_files <- tools::file_path_sans_ext(list.files(predictors_dir, pattern = '\\.csv$', ignore.case = TRUE))
  placebo_files <- tools::file_path_sans_ext(list.files(placebos_dir, pattern = '\\.csv$', ignore.case = TRUE))

  all_signal_files <- sort(unique(c(predictor_files, placebo_files)))
  missing_signals <- setdiff(expected_signals, all_signal_files)
  extra_files <- setdiff(all_signal_files, expected_signals)

  if (length(missing_signals) == 0) {
    message(sprintf('Signal completeness check: all expected signals found in %s/Predictors and %s/Placebos.',
                    signal_folder, signal_folder))
  } else {
    warning(sprintf('Signal completeness check: %d missing signal CSV(s) detected.', length(missing_signals)))
    for (signal_name in missing_signals) {
      message(sprintf('  - Missing CSV: %s', signal_name))
    }
  }

  if (length(extra_files) > 0) {
    message(sprintf('Found %d extra CSV(s) not documented in SignalDoc.csv:', length(extra_files)))
    for (signal_name in extra_files) {
      in_predictors <- signal_name %in% predictor_files
      in_placebos <- signal_name %in% placebo_files
      location <- paste(c(if (in_predictors) 'Predictors' else NULL,
                          if (in_placebos) 'Placebos' else NULL), collapse = ', ')
      message(sprintf('  - Extra CSV: %s (%s)', signal_name, location))
    }
  }

  invisible(length(missing_signals) == 0)
}
249+
195250

196251
### FUNCTION FOR STANDARD CSV EXPORT
197252
writestandard = function(df, path, filename){
@@ -446,4 +501,3 @@ loop_over_strategies = function(
446501

447502

448503
} # end function
449-

Portfolios/Code/12_SignalExhibits.R

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
# """
2+
# Inputs: relies on documentation tables and signal CSVs generated by the upstream Python pipeline (requires `alldocumentation`, `pathProject`, `pathPredictors`, `pathDataIntermediate`, `pathResults` in scope).
3+
# Outputs: writes `coverage.xlsx` and intermediate fst files; also produces correlation exhibits and related plots.
4+
# How to run:
5+
# Rscript 12_SignalExhibits.R
6+
# Example:
7+
# Rscript 12_SignalExhibits.R
8+
# """
9+
10+
# Ensure CRAN mirror is defined for non-interactive runs
11+
options(repos = c(CRAN = "https://cloud.r-project.org"))
12+
113
## exhibits that use only signals or documentation
214

315
### ENVIRONMENT ###
@@ -24,7 +36,7 @@ count.us = readdocumentation() %>%
2436
)
2537

2638
count.mp = read_csv(
27-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
39+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
2840
) %>%
2941
filter(metastudy == 'MP') %>%
3042
mutate(covered = ourname != '_missing_') %>%
@@ -35,7 +47,7 @@ count.mp = read_csv(
3547
mutate(pctcov = covered/n*100)
3648

3749
count.ghz = read_csv(
38-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
50+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
3951
) %>%
4052
filter(metastudy == 'GHZ') %>%
4153
mutate(covered = ourname != '_missing_') %>%
@@ -47,7 +59,7 @@ count.ghz = read_csv(
4759

4860
# for HXZ, we create a special category for alternative holding periods
4961
count.hxz = read_csv(
50-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
62+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
5163
) %>%
5264
filter(metastudy == 'HXZ') %>%
5365
mutate(covered = ourname != '_missing_') %>%
@@ -70,7 +82,7 @@ count.hxz = read_csv(
7082
# HLZ has its own csv since it's so different (not replication)
7183
# coverage then needs to be more judgmental
7284
count.hlz = read_csv(
73-
paste0(pathProject, 'Comparison_to_HLZ.csv')
85+
paste0(pathProject, 'Docs/Comparison_to_HLZ.csv')
7486
) %>%
7587
mutate(
7688
covered = Coverage != 'zz missing'

Portfolios/Code/21_PredictorExhibits.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ statsFull <- read_xlsx(paste0(pathDataPortfolios, "PredictorSummary.xlsx"),
471471
select(signalname, tstat, rbar)
472472

473473
mpSignals = read_csv(
474-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
474+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
475475
) %>%
476476
filter(
477477
metastudy == 'MP', ourname != '_missing_'

Portfolios/Code/41_PlaceboExhibits.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ statsFull <- read_xlsx(paste0(pathDataPortfolios, "PredictorSummary.xlsx"),
114114
)
115115

116116
mpSignals = read_csv(
117-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
117+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
118118
) %>%
119119
filter(metastudy == 'MP', ourname != '_missing_')
120120

@@ -208,12 +208,12 @@ df_merge %>% filter(inMP) %>% summarize(mean(rbar), sd(rbar), sum(tstat>1.5))
208208
# Replication rate vis-a-vis other studies --------------------------------
209209

210210
mpSignals = read_csv(
211-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
211+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
212212
) %>%
213213
filter(metastudy == 'MP', ourname != '_missing_')
214214

215215
hxzSignals = read_csv(
216-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
216+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
217217
) %>%
218218
filter(metastudy == 'HXZ', ourname != '_missing_')
219219

Portfolios/Code/50_DailyPredictorPorts.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ dir.create(pathDataDailyDecileVW)
4747
dir.create(pathDataDailyQuintile)
4848
dir.create(pathDataDailyQuintileVW)
4949

50+
# Ensure CRAN mirror is defined for non-interactive runs
51+
options(repos = c(CRAN = "https://cloud.r-project.org"))
52+
5053
# since no other script uses lme4, it should go here.
5154
install.packages(setdiff(c('lme4'), rownames(installed.packages())))
5255
library(lme4)

Portfolios/Code/master.R

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@
2727
rm(list = ls())
2828
# ENTER PROJECT PATH HERE (i.e. this should be the path to your local repo folder & location of SignalDoc.csv)
2929
# if using Rstudio, pathProject = paste0(getwd(), '/') should work
30-
pathProject = '/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/'
30+
pathProject = '~/Dropbox/oap-ac/CrossSection/'
3131

3232
SignalSource = "Python" # use "Stata" for legacy signals (Signals/Data/) or "Python" for new signals (Signals/pyData/)
3333

3434
quickrun = F # use T if you want to run quickly for testing
3535
quickrunlist = c('Accruals','AM') # list of signals to use for quickrun
36-
skipdaily = T # use T to skip daily CRSP which is very slow
36+
skipdaily = F # use T to skip daily CRSP which is very slow
3737
feed.verbose = F # use T if you want lots of feedback
3838

3939
# Check whether project path is set correctly
@@ -49,6 +49,24 @@ source('01_PortfolioFunction.R', echo=T)
4949

5050
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5151

52+
# Check if signals data is complete
53+
signals_complete <- check_signal_csvs(pathProject, SignalSource)
54+
if (!interactive()) {
55+
if (!signals_complete) {
56+
stop('Non-interactive run halted: signals not complete')
57+
}
58+
} else {
59+
user_response <- tolower(trimws(readline(prompt = 'Proceed with portfolio build? [y/N]: ')))
60+
if (!(user_response %in% c('y', 'yes'))) {
61+
stop('Aborting master.R execution at user request.')
62+
}
63+
if (!signals_complete) {
64+
message('Continuing despite missing signals based on user confirmation.')
65+
}
66+
}
67+
68+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69+
5270
# PREPARE INTERMEDIATE DATA ####
5371

5472
print('master: 10_DownloadCRSP.R')

README.md

Lines changed: 49 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,36 @@ The code is separated into three folders:
3838

3939
We separate the code so you can choose which parts you want to run. If you only want to create signals, you can run the files in `Signals/pyCode/` and then do your thing. If you just want to create portfolios, you can skip the signal generation by directly downloading its output via the [data page](https://www.openassetpricing.com/). The whole thing is about 15,000 lines, so you might want to pick your battles.
4040

41-
More details are below.
41+
More details are below
4242

43-
### 1. Signals/pyCode/
43+
### `Signals/pyCode` Instructions
44+
45+
**1. Set up for Creating Signals (Python and R)**
46+
47+
* Install Python dependencies:
48+
```bash
49+
cd Signals/pyCode/
50+
pip install -r requirements.txt
51+
```
52+
* Install required R packages. [tbc]
53+
* Copy `Signals/pyCode/dotenv.template` to `Signals/pyCode/.env` and add your WRDS and FRED credentials.
54+
- For FRED credentials, request an [API key from FRED](https://research.stlouisfed.org/docs/api/api_key.html)
55+
56+
**2. (Optional) Generate Prep Data**
57+
58+
This is only necessary for a handful of signals.
59+
60+
If you have bash:
61+
* from `Signals/pyCode/`
62+
- run `bash prep1_run_on_wrds.sh` to copy the prep scripts to the WRDS Cloud
63+
- wait about 5 hours
64+
- use qstat to check if it's still running
65+
- if impatient, check most recent file in `~/temp_prep/log/` on WRDS server.
66+
- run `bash prep2_dl_from_wrds.sh` to download the prep data from the WRDS Cloud to `Signals/pyData/Prep/`
67+
68+
You can alternatively upload to the WRDS Cloud manually, ssh into WRDS, run `qsub run_all_prep.sh`, and then manually download the prep data.
69+
70+
**3. Run the Signals Code**
4471

4572
`master.py` runs the end-to-end Python pipeline. It calls the staged scripts in:
4673

@@ -49,26 +76,16 @@ More details are below.
4976
* `Predictors/` constructs stock-level predictors and outputs to `Signals/pyData/Predictors/`
5077
* `Placebos/` constructs "not predictors" and "indirect evidence" signals and outputs to `Signals/pyData/Placebos/`
5178

52-
The orchestrator blocks are written to keep running even if a particular download fails (for example due to a missing subscription) so you get as much data as possible. You can track progress in `Signals/Logs/`.
53-
54-
#### Minimal Setup
55-
56-
1. From `Signals/pyCode/`, create a Python 3 virtual environment (e.g. `python3 -m venv .venv`) and install the requirements via `pip install -r requirements.txt` after activating the environment. `set_up_pyCode.py` automates these steps if you prefer.
57-
2. Copy `dotenv.template` to `.env` and populate credentials such as `WRDS_USERNAME`, `WRDS_PASSWORD`, and any other keys you need (e.g. `FRED_API_KEY`).
58-
3. Run the full pipeline with `python master.py` (from inside `Signals/pyCode/`). You can also run `01_DownloadData.py` and `02_CreatePredictors.py` individually if you just need part of the workflow.
59-
4. Outputs are written to `Signals/pyData/`, and detailed logs are saved under `Signals/Logs/`.
79+
**To run:**
80+
```bash
81+
cd Signals/pyCode/
82+
python master.py
83+
```
6084

61-
#### Optional Setup
62-
63-
The minimal setup produces the vast majority of signals. Thanks to exception handling, the pipeline will keep going even if a particular source is unavailable.
64-
65-
To reproduce every signal:
85+
The orchestrator blocks are written to keep running even if a particular download fails (for example due to a missing subscription) so you get as much data as possible. You can track progress in `Signals/Logs/`.
6686

67-
* For IBES, 13F, OptionMetrics, and bid-ask spread signals, run the helper scripts in `Signals/pyCode/PrepScripts/` (many are designed for WRDS Cloud) and place the resulting files in `Signals/pyData/Prep/`.
68-
* For signals that use the VIX, inflation, or broker-dealer leverage, request an [API key from FRED](https://research.stlouisfed.org/docs/api/api_key.html) and add `FRED_API_KEY` to `.env` before running the download scripts.
69-
* For signals that rely on patent citations, BEA input-output tables, or Compustat customer data, ensure that `Rscript` is available on your system because some helper scripts shell out to R.
7087

71-
### 2. Portfolios/Code/
88+
### `Portfolios/Code` Instructions
7289

7390
`master.R` runs everything. It:
7491

@@ -78,29 +95,32 @@ To reproduce every signal:
7895

7996
It also uses `SignalDoc.csv` as a guide for how to run the portfolios.
8097

81-
By default the code skips the daily portfolios (`skipdaily = T`), and takes about 8 hours, assuming you examine all 300 or so signals. However, the baseline portfolios (based on predictability results in the original papers) will be done in just 30 minutes. You can keep an eye on how it's going by checking the csvs outputted to `Portfolios/Data/Portfolios/`. Every 30 minutes or so the code should output another set of portfolios. Adding the daily portfolios (`skipdaily = F`) takes an additional 12ish hours.
82-
83-
#### Minimal Setup
98+
**To run:**
99+
* Option 1 - Command line:
100+
```bash
101+
cd Portfolios/Code/
102+
Rscript master.R
103+
```
104+
* Option 2 - RStudio: Open `master.R` in RStudio and click "Source" or press Ctrl+Shift+S (Cmd+Shift+S on Mac)
84105

85-
All you need to do is set `pathProject` in `master.R` to the project root directory (where `SignalDoc.csv` is). Then `master.R` will create portfolios for Price, Size, and STreversal in `Portfolios/Data/Portfolios/`.
106+
**Before running:** You must set `pathProject` in `master.R` (line 30) to your project root directory (where `SignalDoc.csv` is located). If using RStudio, `pathProject = paste0(getwd(), '/')` should work automatically.
86107

87-
#### Probable Setup
108+
By default the code skips the daily portfolios (`skipdaily = T`), and takes about 8 hours, assuming you examine all 300 or so signals. However, the baseline portfolios (based on predictability results in the original papers) will be done in just 30 minutes. You can keep an eye on how it's going by checking the csvs outputted to `Portfolios/Data/Portfolios/`. Every 30 minutes or so the code should output another set of portfolios. Adding the daily portfolios (`skipdaily = F`) takes an additional 12ish hours.
88109

89-
You probably want more than Price, Size, and STreversal portfolios, and so you probably want to set up more signal data before you run `master.R`.
110+
#### Minimal Setup
90111

112+
To get started quickly, `master.R` will create portfolios for Price, Size, and STreversal in `Portfolios/Data/Portfolios/`.
91113
There are a couple ways to set up this signal data:
92114

93115
* Run the code in `Signals/pyCode/` (see above).
94116
* Download `Firm Level Characteristics/Full Sets/PredictorsIndiv.zip` and `Firm Level Characteristics/Full Sets/PlacebosIndiv.zip` via the [data page](https://sites.google.com/site/chenandrewy/open-source-ap) and unzip to `Signals/Data/Predictors/` and `Signals/Data/Placebos/`.
95117
* Download only some selected csvs via the [data page](https://sites.google.com/site/chenandrewy/open-source-ap) and place in `Signals/Data/Predictors/` (e.g. just download `BM.csv`, `AssetGrowth.csv`, and `EarningsSurprise.csv` and put them in `Signals/Data/Predictors/`).
96118

97119

98-
### 3. Shipping/Code/
120+
### `Shipping/Code` Instructions
99121

100122
This code zips up the data, makes some quality checks, and copies files for uploading to Gdrive. You shouldn't need to use this but we keep it with the rest of the code for replicability.
101123

102-
----
103124

104-
## Contribute
105125

106-
Please let us know if you find typos in the code or think that we should add additional signals. You can let us know about any suggested changes via pull requests for this repo. We will keep the code up to date for other researchers to use it.
126+

0 commit comments

Comments
 (0)