Skip to content

Commit b4e911e

Browse files
authored
Merge pull request #202 from OpenSourceAP/CleaningSetup
Tidying full repo on a full run
2 parents 92033e6 + 164dbea commit b4e911e

39 files changed

+915
-23865
lines changed

.gitignore

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,13 @@ deleteme*
55
Signals/Debug/*
66
Signals/pyCode/Debug/*
77
Signals/zzz-archive/*
8-
Signals/Human/*
98
Signals/pyCode/dev/*
109

11-
# Testing exceptions
12-
!Signals/Logs/testout_placebos.md
13-
!Signals/Logs/testout_predictors.md
14-
1510
.Rproj.user
1611
.Rhistory
1712
.RData
1813
.Ruserdata
14+
.Rapp.history
1915

2016
*.Rprofile
2117
*.Rproj
@@ -30,6 +26,7 @@ Portfolios/Data/*
3026
Portfolios/pyData/*
3127
Signals/Data/*
3228
Signals/pyData/*
29+
Shipping/Data/*
3330

3431
# Credentials
3532
.env
@@ -45,4 +42,5 @@ __*__*
4542
# Other
4643
Portfolios/Code/Rplots.pdf
4744
.git
45+
AGENTS.md
4846

AGENTS.md

Lines changed: 0 additions & 36 deletions
This file was deleted.

Portfolios/Code/00_SettingsAndTools.R

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# """
2+
# Inputs: SignalDoc.csv, portfolio configuration globals, package dependencies
3+
# Outputs: Initialized paths, loaded documentation, and helper functions for downstream R scripts
4+
# How to run: source('Portfolios/Code/00_SettingsAndTools.R', echo = TRUE)
5+
# Example: source('Portfolios/Code/00_SettingsAndTools.R', echo = TRUE)
6+
# """
7+
18
#### GLOBAL SETTINGS
29

310
options(dplyr.summarise.inform = FALSE)
@@ -192,6 +199,54 @@ checkSignals = function(docs = alldocumentation, pathProj = pathProject) {
192199

193200
} # end function
194201

202+
# Check that every signal documented in SignalDoc.csv has a corresponding CSV
# in the signal output folders, and report any undocumented extras.
#
# Args:
#   path_proj     - project root containing SignalDoc.csv (default: global pathProject)
#   signal_source - 'Python' checks Signals/pyData/, anything else checks
#                   Signals/Data/ (default: global SignalSource)
#
# Side effects: message()s a summary, warning()s if expected CSVs are missing,
#   stop()s if SignalDoc.csv or the signal directories do not exist.
# Returns: invisibly, TRUE when no documented signal is missing, FALSE otherwise.
check_signal_csvs <- function(path_proj = pathProject, signal_source = SignalSource) {
  signal_doc_path <- file.path(path_proj, 'SignalDoc.csv')
  signal_folder <- if (identical(signal_source, 'Python')) 'pyData' else 'Data'
  predictors_dir <- file.path(path_proj, 'Signals', signal_folder, 'Predictors')
  placebos_dir <- file.path(path_proj, 'Signals', signal_folder, 'Placebos')

  if (!file.exists(signal_doc_path)) {
    stop('SignalDoc.csv not found. Please verify pathProject is set correctly.')
  }

  if (!dir.exists(predictors_dir) || !dir.exists(placebos_dir)) {
    # report the folder actually checked (pyData vs Data), not a hard-coded path
    stop(sprintf('Signals/%s/Predictors or Signals/%s/Placebos not found. Please run the signals pipeline first.',
                 signal_folder, signal_folder))
  }

  signal_doc <- read.csv(signal_doc_path, stringsAsFactors = FALSE)

  # Expected = documented acronyms, excluding blank/NA rows and Cat.Signal == 'Drop'.
  # NA comparisons yield NA, so filter NAs explicitly rather than rely on sort() dropping them.
  keep <- !is.na(signal_doc$Acronym) & signal_doc$Acronym != '' &
    !is.na(signal_doc$Cat.Signal) & signal_doc$Cat.Signal != 'Drop'
  expected_signals <- sort(unique(signal_doc$Acronym[keep]))

  predictor_files <- tools::file_path_sans_ext(list.files(predictors_dir, pattern = '\\.csv$', ignore.case = TRUE))
  placebo_files <- tools::file_path_sans_ext(list.files(placebos_dir, pattern = '\\.csv$', ignore.case = TRUE))

  all_signal_files <- sort(unique(c(predictor_files, placebo_files)))
  missing_signals <- setdiff(expected_signals, all_signal_files)
  extra_files <- setdiff(all_signal_files, expected_signals)

  if (length(missing_signals) == 0) {
    message(sprintf('Signal completeness check: all expected signals found in %s/Predictors and %s/Placebos.',
                    signal_folder, signal_folder))
  } else {
    warning(sprintf('Signal completeness check: %d missing signal CSV(s) detected.', length(missing_signals)))
    for (signal_name in missing_signals) {
      message(sprintf('  - Missing CSV: %s', signal_name))
    }
  }

  if (length(extra_files) > 0) {
    message(sprintf('Found %d extra CSV(s) not documented in SignalDoc.csv:', length(extra_files)))
    for (signal_name in extra_files) {
      in_predictors <- signal_name %in% predictor_files
      in_placebos <- signal_name %in% placebo_files
      location <- paste(c(if (in_predictors) 'Predictors' else NULL,
                          if (in_placebos) 'Placebos' else NULL), collapse = ', ')
      message(sprintf('  - Extra CSV: %s (%s)', signal_name, location))
    }
  }

  invisible(length(missing_signals) == 0)
}
249+
195250

196251
### FUNCTION FOR STANDARD CSV EXPORT
197252
writestandard = function(df, path, filename){
@@ -446,4 +501,3 @@ loop_over_strategies = function(
446501

447502

448503
} # end function
449-

Portfolios/Code/12_SignalExhibits.R

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
# """
2+
# Inputs: relies on documentation tables and signal CSVs generated by the upstream Python pipeline (requires `alldocumentation`, `pathProject`, `pathPredictors`, `pathDataIntermediate`, `pathResults` in scope).
3+
# Outputs: writes `coverage.xlsx` and intermediate fst files; also produces correlation exhibits and related plots.
4+
# How to run:
5+
# Rscript 12_SignalExhibits.R
6+
# Example:
7+
# Rscript 12_SignalExhibits.R
8+
# """
9+
10+
# Ensure CRAN mirror is defined for non-interactive runs
11+
options(repos = c(CRAN = "https://cloud.r-project.org"))
12+
113
## exhibits that use only signals or documentation
214

315
### ENVIRONMENT ###
@@ -24,7 +36,7 @@ count.us = readdocumentation() %>%
2436
)
2537

2638
count.mp = read_csv(
27-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
39+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
2840
) %>%
2941
filter(metastudy == 'MP') %>%
3042
mutate(covered = ourname != '_missing_') %>%
@@ -35,7 +47,7 @@ count.mp = read_csv(
3547
mutate(pctcov = covered/n*100)
3648

3749
count.ghz = read_csv(
38-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
50+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
3951
) %>%
4052
filter(metastudy == 'GHZ') %>%
4153
mutate(covered = ourname != '_missing_') %>%
@@ -47,7 +59,7 @@ count.ghz = read_csv(
4759

4860
# for HXZ, we create a special category for alternative holding periods
4961
count.hxz = read_csv(
50-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
62+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
5163
) %>%
5264
filter(metastudy == 'HXZ') %>%
5365
mutate(covered = ourname != '_missing_') %>%
@@ -70,7 +82,7 @@ count.hxz = read_csv(
7082
# HLZ has its own csv since it's so different (not replication)
7183
# coverage then needs to be more judgmental
7284
count.hlz = read_csv(
73-
paste0(pathProject, 'Comparison_to_HLZ.csv')
85+
paste0(pathProject, 'Docs/Comparison_to_HLZ.csv')
7486
) %>%
7587
mutate(
7688
covered = Coverage != 'zz missing'

Portfolios/Code/21_PredictorExhibits.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ statsFull <- read_xlsx(paste0(pathDataPortfolios, "PredictorSummary.xlsx"),
471471
select(signalname, tstat, rbar)
472472

473473
mpSignals = read_csv(
474-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
474+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
475475
) %>%
476476
filter(
477477
metastudy == 'MP', ourname != '_missing_'

Portfolios/Code/41_PlaceboExhibits.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ statsFull <- read_xlsx(paste0(pathDataPortfolios, "PredictorSummary.xlsx"),
114114
)
115115

116116
mpSignals = read_csv(
117-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
117+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
118118
) %>%
119119
filter(metastudy == 'MP', ourname != '_missing_')
120120

@@ -208,12 +208,12 @@ df_merge %>% filter(inMP) %>% summarize(mean(rbar), sd(rbar), sum(tstat>1.5))
208208
# Replication rate vis-a-vis other studies --------------------------------
209209

210210
mpSignals = read_csv(
211-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
211+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
212212
) %>%
213213
filter(metastudy == 'MP', ourname != '_missing_')
214214

215215
hxzSignals = read_csv(
216-
paste0(pathProject, 'Comparison_to_MetaReplications.csv')
216+
paste0(pathProject, 'Docs/Comparison_to_MetaReplications.csv')
217217
) %>%
218218
filter(metastudy == 'HXZ', ourname != '_missing_')
219219

Portfolios/Code/50_DailyPredictorPorts.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ dir.create(pathDataDailyDecileVW)
4747
dir.create(pathDataDailyQuintile)
4848
dir.create(pathDataDailyQuintileVW)
4949

50+
# Ensure CRAN mirror is defined for non-interactive runs
51+
options(repos = c(CRAN = "https://cloud.r-project.org"))
52+
5053
# since no other script uses lme4, it should go here.
5154
install.packages(setdiff(c('lme4'), rownames(installed.packages())))
5255
library(lme4)

Portfolios/Code/master.R

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@
2727
rm(list = ls())
2828
# ENTER PROJECT PATH HERE (i.e. this should be the path to your local repo folder & location of SignalDoc.csv)
2929
# if using Rstudio, pathProject = paste0(getwd(), '/') should work
30-
pathProject = '/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/'
30+
pathProject = '~/Dropbox/oap-ac/CrossSection/'
3131

3232
SignalSource = "Python" # use "Stata" for legacy signals (Signals/Data/) or "Python" for new signals (Signals/pyData/)
3333

3434
quickrun = F # use T if you want to run quickly for testing
3535
quickrunlist = c('Accruals','AM') # list of signals to use for quickrun
36-
skipdaily = T # use T to skip daily CRSP which is very slow
36+
skipdaily = F # use T to skip daily CRSP which is very slow
3737
feed.verbose = F # use T if you want lots of feedback
3838

3939
# Check whether project path is set correctly
@@ -49,6 +49,24 @@ source('01_PortfolioFunction.R', echo=T)
4949

5050
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5151

52+
# Check if signals data is complete
53+
signals_complete <- check_signal_csvs(pathProject, SignalSource)
54+
if (!interactive()) {
55+
if (!signals_complete) {
56+
stop('Non-interactive run halted: signals not complete')
57+
}
58+
} else {
59+
user_response <- tolower(trimws(readline(prompt = 'Proceed with portfolio build? [y/N]: ')))
60+
if (!(user_response %in% c('y', 'yes'))) {
61+
stop('Aborting master.R execution at user request.')
62+
}
63+
if (!signals_complete) {
64+
message('Continuing despite missing signals based on user confirmation.')
65+
}
66+
}
67+
68+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69+
5270
# PREPARE INTERMEDIATE DATA ####
5371

5472
print('master: 10_DownloadCRSP.R')

README.md

Lines changed: 49 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,36 @@ The code is separated into three folders:
3838

3939
We separate the code so you can choose which parts you want to run. If you only want to create signals, you can run the files in `Signals/pyCode/` and then do your thing. If you just want to create portfolios, you can skip the signal generation by directly downloading its output via the [data page](https://www.openassetpricing.com/). The whole thing is about 15,000 lines, so you might want to pick your battles.
4040

41-
More details are below.
41+
More details are below
4242

43-
### 1. Signals/pyCode/
43+
### `Signals/pyCode` Instructions
44+
45+
**1. Set up for Creating Signals (Python and R)**
46+
47+
* Install Python dependencies:
48+
```bash
49+
cd Signals/pyCode/
50+
pip install -r requirements.txt
51+
```
52+
* Install required R packages. [tbc]
53+
* Copy `Signals/pyCode/dotenv.template` to `Signals/pyCode/.env` and add your WRDS and FRED credentials.
54+
- For FRED credentials, request an [API key from FRED](https://research.stlouisfed.org/docs/api/api_key.html)
55+
56+
**2. (Optional) Generate Prep Data**
57+
58+
This is only necessary for a handful of signals.
59+
60+
If you have bash:
61+
* from `Signals/pyCode/`
62+
- run `bash prep1_run_on_wrds.sh` to copy the prep scripts to the WRDS Cloud
63+
- wait about 5 hours
64+
- use qstat to check if it's still running
65+
- if impatient, check most recent file in `~/temp_prep/log/` on WRDS server.
66+
- run `bash prep2_dl_from_wrds.sh` to download the prep data from the WRDS Cloud to `Signals/pyData/Prep/`
67+
68+
You can alternatively upload to the WRDS Cloud manually, ssh into WRDS, run `qsub run_all_prep.sh`, and then manually download the prep data.
69+
70+
**3. Run the Signals Code**
4471

4572
`master.py` runs the end-to-end Python pipeline. It calls the staged scripts in:
4673

@@ -49,26 +76,16 @@ More details are below.
4976
* `Predictors/` constructs stock-level predictors and outputs to `Signals/pyData/Predictors/`
5077
* `Placebos/` constructs "not predictors" and "indirect evidence" signals and outputs to `Signals/pyData/Placebos/`
5178

52-
The orchestrator blocks are written to keep running even if a particular download fails (for example due to a missing subscription) so you get as much data as possible. You can track progress in `Signals/Logs/`.
53-
54-
#### Minimal Setup
55-
56-
1. From `Signals/pyCode/`, create a Python 3 virtual environment (e.g. `python3 -m venv .venv`) and install the requirements via `pip install -r requirements.txt` after activating the environment. `set_up_pyCode.py` automates these steps if you prefer.
57-
2. Copy `dotenv.template` to `.env` and populate credentials such as `WRDS_USERNAME`, `WRDS_PASSWORD`, and any other keys you need (e.g. `FRED_API_KEY`).
58-
3. Run the full pipeline with `python master.py` (from inside `Signals/pyCode/`). You can also run `01_DownloadData.py` and `02_CreatePredictors.py` individually if you just need part of the workflow.
59-
4. Outputs are written to `Signals/pyData/`, and detailed logs are saved under `Signals/Logs/`.
79+
**To run:**
80+
```bash
81+
cd Signals/pyCode/
82+
python master.py
83+
```
6084

61-
#### Optional Setup
62-
63-
The minimal setup produces the vast majority of signals. Thanks to exception handling, the pipeline will keep going even if a particular source is unavailable.
64-
65-
To reproduce every signal:
85+
The orchestrator blocks are written to keep running even if a particular download fails (for example due to a missing subscription) so you get as much data as possible. You can track progress in `Signals/Logs/`.
6686

67-
* For IBES, 13F, OptionMetrics, and bid-ask spread signals, run the helper scripts in `Signals/pyCode/PrepScripts/` (many are designed for WRDS Cloud) and place the resulting files in `Signals/pyData/Prep/`.
68-
* For signals that use the VIX, inflation, or broker-dealer leverage, request an [API key from FRED](https://research.stlouisfed.org/docs/api/api_key.html) and add `FRED_API_KEY` to `.env` before running the download scripts.
69-
* For signals that rely on patent citations, BEA input-output tables, or Compustat customer data, ensure that `Rscript` is available on your system because some helper scripts shell out to R.
7087

71-
### 2. Portfolios/Code/
88+
### `Portfolios/Code` Instructions
7289

7390
`master.R` runs everything. It:
7491

@@ -78,29 +95,32 @@ To reproduce every signal:
7895

7996
It also uses `SignalDoc.csv` as a guide for how to run the portfolios.
8097

81-
By default the code skips the daily portfolios (`skipdaily = T`), and takes about 8 hours, assuming you examine all 300 or so signals. However, the baseline portfolios (based on predictability results in the original papers) will be done in just 30 minutes. You can keep an eye on how it's going by checking the csvs outputted to `Portfolios/Data/Portfolios/`. Every 30 minutes or so the code should output another set of portfolios. Adding the daily portfolios (`skipdaily = F`) takes an additional 12ish hours.
82-
83-
#### Minimal Setup
98+
**To run:**
99+
* Option 1 - Command line:
100+
```bash
101+
cd Portfolios/Code/
102+
Rscript master.R
103+
```
104+
* Option 2 - RStudio: Open `master.R` in RStudio and click "Source" or press Ctrl+Shift+S (Cmd+Shift+S on Mac)
84105

85-
All you need to do is set `pathProject` in `master.R` to the project root directory (where `SignalDoc.csv` is). Then `master.R` will create portfolios for Price, Size, and STreversal in `Portfolios/Data/Portfolios/`.
106+
**Before running:** You must set `pathProject` in `master.R` (line 30) to your project root directory (where `SignalDoc.csv` is located). If using RStudio, `pathProject = paste0(getwd(), '/')` should work automatically.
86107

87-
#### Probable Setup
108+
By default the code skips the daily portfolios (`skipdaily = T`), and takes about 8 hours, assuming you examine all 300 or so signals. However, the baseline portfolios (based on predictability results in the original papers) will be done in just 30 minutes. You can keep an eye on how it's going by checking the csvs outputted to `Portfolios/Data/Portfolios/`. Every 30 minutes or so the code should output another set of portfolios. Adding the daily portfolios (`skipdaily = F`) takes an additional 12ish hours.
88109

89-
You probably want more than Price, Size, and STreversal portfolios, and so you probably want to set up more signal data before you run `master.R`.
110+
#### Minimal Setup
90111

112+
To get started quickly, `master.R` will create portfolios for Price, Size, and STreversal in `Portfolios/Data/Portfolios/`.
91113
There are a couple ways to set up this signal data:
92114

93115
* Run the code in `Signals/pyCode/` (see above).
94116
* Download `Firm Level Characteristics/Full Sets/PredictorsIndiv.zip` and `Firm Level Characteristics/Full Sets/PlacebosIndiv.zip` via the [data page](https://sites.google.com/site/chenandrewy/open-source-ap) and unzip to `Signals/Data/Predictors/` and `Signals/Data/Placebos/`.
95117
* Download only some selected csvs via the [data page](https://sites.google.com/site/chenandrewy/open-source-ap) and place in `Signals/Data/Predictors/` (e.g. just download `BM.csv`, `AssetGrowth.csv`, and `EarningsSurprise.csv` and put them in `Signals/Data/Predictors/`).
96118

97119

98-
### 3. Shipping/Code/
120+
### `Shipping/Code` Instructions
99121

100122
This code zips up the data, makes some quality checks, and copies files for uploading to Gdrive. You shouldn't need to use this but we keep it with the rest of the code for replicability.
101123

102-
----
103124

104-
## Contribute
105125

106-
Please let us know if you find typos in the code or think that we should add additional signals. You can let us know about any suggested changes via pull requests for this repo. We will keep the code up to date for other researchers to use it.
126+

0 commit comments

Comments
 (0)