# CSR-ABM/flaringABM_postproc.R (repository nwillems94/CSR-ABM, branch main)

# Turn `name=jobID` command-line arguments into a named list of job IDs.
#
# args: character vector of arguments, each of the form `name=jobID`.
# Returns a list whose names are the text before the first "=" (with a literal
# backslash-n sequence converted to a real newline) and whose values are the
# text between the first and second "=".
# Stops with a descriptive error when an argument has no value after "=".
parse_job_args <- function(args) {
    pieces <- strsplit(args, "=")

    # fail with a readable message rather than "subscript out of bounds"
    malformed <- lengths(pieces) < 2
    if (any(malformed)) {
        stop("Arguments must have the form name=jobID; got: ",
            paste(args[malformed], collapse=", "), call.=FALSE)
    }

    ids <- lapply(pieces, `[[`, 2)
    names(ids) <- gsub("\\n", "\n",
        vapply(pieces, `[[`, character(1), 1), fixed=TRUE)
    ids
}

args <- commandArgs(trailingOnly=TRUE)
# ignore option-style arguments such as `--flag`
args <- args[substr(args, 1, 2)!="--"]

jobIDs <- parse_job_args(args)
print(jobIDs)

# guarantee a refID entry so later code can test it with is.na()
if (!("refID" %in% names(jobIDs))) {
    jobIDs <- c(jobIDs, "refID"=NA)
}

library(data.table)
library(DBI)
require(RSQLite)

# Post-process the raw outputs of one model job and load them into SQLite.
#
# Reads the job's parameter log and its market / lease / agent state CSVs,
# derives storage, market-coverage and per-firm production metrics, writes
# gzip-named CSVs for params, market states and agent states, and inserts
# integer-coded tables into `db` inside a single transaction.
#
# db          DBI connection to the SQLite database being assembled
# ID          named list of length one: the name is the model label and the
#             value is the job ID used in the input/output file names
# append_file FALSE: create the tables (first model written to `db`);
#             TRUE: append to tables created by an earlier call
#
# Also reads the global `jobIDs$refID`; when it is not NA the demand-function
# files of that reference job are used instead of this job's own.
# Returns the value of dbCommit(db).
process_outputs <- function(db, ID, append_file) {
    print(paste("Compiling model:", names(ID)))

    ## IMPORT FILES ##
    params_wide <- fread(sprintf("./logs/param_log_%s.csv", ID),
                    colClasses="character")
    params_wide[, "RunID":= .I]

    # the four time-varying parameters are stored one column per timestep;
    # reshape them into one row per (run, timestep)
    params <- melt(params_wide,
        measure.vars=patterns("market_prop_green","prob_m","SRoR", "Activism"),
        value.name=c("market_prop_green","prob_m","SRoR", "Activism"),
        variable="time", variable.factor=FALSE)
    # everything was read as character; all but the string columns are numeric
    params[,
        names(params[, -c("refID","strategy","reporting")]):=
            lapply(.SD[, -c("refID","strategy","reporting")], as.numeric)
    ]
    # shift the melted column index so that the first timestep equals t0
    params[, "time":= time - 1 + t0]
    params[refID=="", "refID":= NA]

    market_states <- fread(sprintf("./outputs/market_states_%s.csv", ID))
    lease_states <- fread(sprintf("./outputs/lease_states_%s.csv", ID))
    agent_states <- fread(sprintf("./outputs/agent_states_%s.csv", ID))

    params[, "model":= names(ID)]
    market_states[, "model":= names(ID)]
    lease_states[, "model":= names(ID)]
    agent_states[, "model":= names(ID)]

    ## CALCULATE VERIFICATION / VALIDATION METRICS ##
    # amount of gas deposited and withdrawn from storage due to excess supply
    # dw is +1 where market is "none", -1 otherwise (0 for NA), and is zeroed
    # for leases that are never in market "none". na.rm=TRUE keeps any() from
    # returning NA for a lease with an NA market and no "none" row, which
    # would otherwise propagate into net_dw and break the while() condition.
    lease_states[,
        "dw":= any(market=="none", na.rm=TRUE) *
                fifelse(market=="none", 1, -1, na=0),
        by=.(model, RunID, leaseID)
    ]
    lease_states[, "net_dw":= cumsum(dw), by=.(model, RunID, leaseID)]
    # the running balance may not go negative: per lease, cancel the earliest
    # withdrawal that has a negative balance, then recompute, until none is left
    while(min(lease_states$net_dw)<0) {
        lease_states[
            (dw<0) & (net_dw<0),
            "dw":= replace(dw, which.min(time), 0),
            by=.(model, RunID, leaseID)
        ]
        lease_states[, "net_dw":= cumsum(dw), by=.(model, RunID, leaseID)]
    }
    market_states[
        lease_states[
            dw!=0,
            sum((gas_MCF + csgd_MCF) * dw),
            by=.(model, RunID, time)
        ], on=c("model", "RunID", "time"),
        "q_stored":= V1
    ]

    # what fraction of the market is represented
    # explicit NA test instead of fcoalesce(), which is documented for
    # same-class vectors while refID may be a logical NA and ID is a list
    ref_id <- jobIDs$refID
    demand_file <- sprintf("./outputs/demand_function_%s-%%s.rds",
                        if (is.null(ref_id) || is.na(ref_id)) ID[[1]] else ref_id)
    market_states[,
        "frac":= median(readRDS(sprintf(demand_file, .BY)
            )$historical_market$frac),
        by=RunID]
    setcolorder(market_states, c("time", "p_grey", "p_green", "p_oil_mult",
                                "q_grey", "q_green", "q_stored", "q_oil",
                                "market_prop_green", "frac", "RunID", "model"))

    # add metrics to agent states
    agent_states[
        lease_states[
            (status=="producing"),
            .(sum(csgd_MCF[class=="underdeveloped"]) + sum(sopf_MCF),
                sum(oil_BBL), sum(gas_MCF)),
            by=.(model, RunID, time, firmID)
        ], on=c("model", "RunID", "time", "firmID"),
        c("gas_flared_calc", "oil_prod", "gas_prod"):= .(V1, V2, V3)]
    # firm-timesteps without any producing lease get zero rather than NA
    setnafill(agent_states,
        cols=c("oil_prod", "gas_prod", "gas_flared_calc"), fill=0)

    ## WRITE CSVs ##
    cat("\tWriting CSVs")
    fwrite(params,
        sprintf("./logs/params_%s.csv.gz", ID))
    fwrite(market_states,
        sprintf("./outputs/processed/market_states_%s.csv.gz", ID))
    fwrite(agent_states,
        sprintf("./outputs/processed/agent_states_%s.csv.gz", ID))

    ## PROCESS DB TABLES ##
    # begin SQLite transaction and turn off autocommit
    dbBegin(db)
    cat("\tUpdating lookup table")
    # save space by using lookup table for string definitions
    if (!append_file) {
        # create keyed tabled WITHOUT ROWID column
        dbExecute(db, "CREATE TABLE string_lookup
                        ( column_name TEXT,
                          string_key TEXT,
                          integer_key INTEGER,
                          PRIMARY KEY (column_name, string_key)
                        ) WITHOUT ROWID;")
        model_num <- 1L
    } else {
        # dbGetQuery returns a one-cell data.frame; pull out the scalar
        model_num <- as.integer(
            dbGetQuery(db, "SELECT MAX(integer_key) FROM string_lookup
                                    WHERE column_name='model'")[[1]] + 1)
    }

    # update model number
    dbExecute(db, "INSERT INTO string_lookup
                    (column_name, string_key, integer_key) VALUES (?, ?, ?);",
                   list("model", names(ID), model_num))
    dbExecute(db, "INSERT INTO string_lookup
                    (column_name, string_key, integer_key) VALUES (?, ?, ?);",
                   list("jobID", ID[[1]], model_num))

    # convert string columns to integers
    for (dt in c("agent_states", "lease_states", "market_states", "params")) {
        get(dt)[, "model":= NULL]
        get(dt)[, "model":= model_num]
    }

    for (col in c("strategy", "reporting", "activity", "behavior", "class", "status", "market", "area")) {
        dt <- if (col %in% c("strategy", "reporting")) "params" else
                if (col %in% c("activity", "behavior")) "agent_states" else
                    "lease_states"

        # ensure factor levels match existing records
        if (append_file) {
            factor_levels <- dbGetQuery(db,
                "SELECT string_key FROM string_lookup
                        WHERE column_name = ? ORDER BY integer_key",
                params=list(col))$string_key
        } else {
            factor_levels <- c()
        }
        get(dt)[is.na(get(col)), c(col):= ""]
        # previously stored levels come first so their integer codes are kept;
        # strings not seen before are appended after them
        get(dt)[,
            c(col):= factor(get(col),
                levels=c(factor_levels, setdiff(get(col), factor_levels)))
        ]

        # write string to lookup table
        # (existing keys are skipped by INSERT OR IGNORE; seq_along avoids the
        # c(1, 0) that seq() would produce for a column with no levels)
        col_levels <- levels(get(dt)[[col]])
        if (length(col_levels) > 0) {
            dbExecute(db, "INSERT OR IGNORE INTO string_lookup
                (column_name, string_key, integer_key)
                VALUES (:cname, :skey, :ikey);",
                params=list("cname"=rep(col, length(col_levels)),
                            "skey"=col_levels,
                            "ikey"=seq_along(col_levels)))
        }
        # cast string to integer
        get(dt)[, c(col):= as.integer(get(col))]
    }

    # store repeated columns of lease and agent states in separate table
    agent_info <- agent_states[, first(.SD), by=.(model, RunID, firmID),
                                .SDcols=patterns("^production_*")]
    agent_states[,
        setdiff(names(agent_info),
        c("model", "RunID", "firmID")):= NULL
    ]

    # one row per lease taken at its latest timestep, excluding the columns
    # matched by the pattern below (those stay in lease_states)
    lease_info <- lease_states[,
        .SD[which.max(time)],
        by=.(model, RunID, leaseID, area, DISTRICT_NO, OIL_GAS_CODE),
        .SDcols=-patterns("dw|^cost_*|^ERR_MCF$|^class$|^market$|^lifetime$")
    ]
    lease_states[,
        setdiff(names(lease_info),
        c("model", "RunID", "leaseID", "time", "status")):= NULL
    ]
    setnames(lease_info, "time", "t_last")

    ## WRITE DB TABLES ##
    cat("\tWriting data to db\n")
    for (dt in c("params", "market_states", "agent_info", "agent_states", "lease_info", "lease_states")) {
        # primary key: model and run, plus time for state tables and the
        # firm / lease identifier for agent / lease tables
        keys <- c("model", "RunID", if (!grepl("info", dt)) "time",
                    if (grepl("agent", dt)) "firmID" else
                        if (grepl("lease", dt)) "leaseID")
        if (!append_file) {
            # create keyed tabled WITHOUT ROWID column
            # (splice the PRIMARY KEY clause into the generated CREATE TABLE)
            sql <- sub("\n)\n$",
                    sprintf(",\n  \\PRIMARY KEY(%s ASC) \n) WITHOUT ROWID;\n",
                        paste(keys, collapse=" ASC, ")),
                    as.character(
                        sqlCreateTable(db, dt, get(dt), row.names=FALSE)))
            dbExecute(db, sql)
        } else {
            # order columns like existing sql table
            setcolorder(get(dt),
                dbGetQuery(db, sprintf("PRAGMA table_info(%s)", dt))$name)
        }

        # match order of primary keys to speed up write time
        setorderv(get(dt), cols=keys)

        dbExecute(db, paste0("INSERT INTO ", dt,
                    " VALUES(:", paste(names(get(dt)), collapse=", :"), ")"),
                get(dt))
        # drop the table that was just written; rm(dt) would only remove the
        # loop variable holding its name
        rm(list=dt)
    }
    if (!("demand_functions" %in% dbListTables(db))) {
        # store demand functions as blobs
        dbExecute(db, "CREATE TABLE demand_functions
                        ( RunID INTEGER,
                          fun_bin BLOB,
                          PRIMARY KEY (RunID)
                        ) WITHOUT ROWID;")
        demand_paths <- list.files(dirname(demand_file),
                            gsub("%s.*", "*", basename(demand_file)),
                            full.names=TRUE)
        # lapply always returns a list; sapply could simplify equal-length
        # results into a matrix and lose the per-file names
        demand_funs <- lapply(demand_paths, readRDS)
        # the text between the last "-" and ".rds" in the file name is the RunID
        names(demand_funs) <- gsub(".*-(.*).rds", "\\1", demand_paths)
        dbExecute(db,
            "INSERT INTO demand_functions (RunID, fun_bin) VALUES (?, ?);",
            list(names(demand_funs), lapply(demand_funs, serialize, NULL)))
    }
    dbCommit(db)
}


# positions of the model jobs, i.e. every argument except the refID entry
model_idx <- which(names(jobIDs)!="refID")

# The database is named after refID when one was given, otherwise after the
# model job IDs joined by "-". The IDs are selected explicitly because
# na.omit() returns a list unchanged, so it would leave the NA refID
# placeholder in the joined name.
db_label <- if (is.na(jobIDs[["refID"]])) {
    paste(unlist(jobIDs[model_idx]), collapse="-")
} else {
    jobIDs[["refID"]]
}

# the database lives under $WORK when that variable is set, otherwise under ".."
all_states <- dbConnect(RSQLite::SQLite(),
                sprintf("%s/CSR-ABM/outputs/processed/all_states_%s.sqlite",
                    Sys.getenv("WORK", unset=".."), db_label))

# turn off safety measures since all data is backed up in CSVs
dbExecute(all_states, "PRAGMA journal_mode = OFF;")
dbExecute(all_states, "PRAGMA synchronous = OFF;")

# do postproc
# the first model creates the tables and every later one appends; this is
# decided by position among the models so that supplying refID as the first
# argument does not make the first model skip table creation
lapply(seq_along(model_idx),
    function(k) process_outputs(all_states, jobIDs[model_idx[k]], k!=1))

# optimize later queries
dbExecute(all_states, "PRAGMA analysis_limit=1000;")
dbExecute(all_states, "PRAGMA optimize;")
dbDisconnect(all_states)

print(warnings())
print("Finished assembling databases")