illinoisdata · SumayT9 · Feb 11, 2024 · Feb 19, 2024 · Feb 22, 2024 · Feb 28, 2024
diff --git a/experiments/docker-compose.yml b/experiments/docker-compose.yml
@@ -7,7 +7,7 @@ services:
     stdin_open: true
     volumes:
       - ${PWD}:/pod
-      - neo4j_data:/neo4j_data/data
+      - /data/elastic-notebook/data:/pod/nbdata
     working_dir: /pod
     command: ["/bin/bash"]
 
@@ -30,21 +30,6 @@ services:
       - redis_data:/data
     command: ["redis-server", "--port", "6379"]
 
-  podneo4j:
-    # image: neo4j:5.15.0-community-bullseye
-    image: neo4j:5.15.0-enterprise-bullseye
-    hostname: podneo4j
-    tty: true
-    stdin_open: true
-    volumes:
-      - neo4j_data:/data
-    environment:
-      NEO4J_ACCEPT_LICENSE_AGREEMENT: "yes"  # Using Neo4j Developer License (https://neo4j.com/licensing/)
-      NEO4J_AUTH: neo4j/podneo4jPassword
-      NEO4J_server_memory_pagecache_size: 2G
-      NEO4J_server_memory_heap_max__size: 2G
-    command: ["neo4j"]
-
   podmongo:
     image: mongo:7.0.5-jammy
     hostname: podmongo
@@ -53,4 +38,3 @@ services:
 
 volumes:
   redis_data:
-  neo4j_data:
diff --git a/experiments/pod.Dockerfile b/experiments/pod.Dockerfile
@@ -36,6 +36,7 @@ RUN python -m pip install -r /pod/requirements.txt
 COPY ./pod /pod/pod
 COPY ./setup.py /pod/setup.py
 COPY ./README.md /pod/README.md
+
 RUN python -m pip install -e /pod/
 
 WORKDIR /

diff --git a/experiments/train.sh b/experiments/train.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Launch one background training run inside the `experiments-pod-1` container
+# for every (alpha, gamma) pair below. Each run's stdout and stderr are shown
+# on the terminal and also copied to per-run log files under eval_logs/.
+#
+# The stderr redirection uses process substitution, so this script needs bash.
+
+# Ensure the eval_logs directory exists before any run writes to it.
+mkdir -p eval_logs
+
+for alpha in 0.1; do
+    for gamma in 0.7; do
+        echo "alpha $alpha gamma $gamma"
+        stdout_file="eval_logs/stdout_alpha${alpha}_gamma${gamma}.log"
+        stderr_file="eval_logs/stderr_alpha${alpha}_gamma${gamma}.log"
+        # Run in a backgrounded subshell; tee copies each stream to its log
+        # file while still passing it through (stderr stays on stderr).
+        ( docker exec experiments-pod-1 python pod/train.py --gamma "$gamma" --alpha "$alpha" 2> >(tee "$stderr_file" >&2) | tee "$stdout_file" ) &
+    done
+done
+# The runs were started in the background and are not waited for here, so
+# this message only means that every run has been launched.
+echo "done with script"
diff --git a/pod/find_bench_size.py b/pod/find_bench_size.py
@@ -0,0 +1,124 @@
+import contextlib
+import gc
+import io
+import multiprocessing as mp
+import os
+import shutil
+import sys
+import time
+from dataclasses import dataclass
+from functools import partial
+from multiprocessing import Process, Queue
+from pathlib import Path
+from typing import List
+import json
+
+import numpy as np
+from loguru import logger
+from model import QLearningPoddingModel
+from tqdm import tqdm
+
+from pod.bench import BenchArgs, NotebookExecutor, Notebooks
+from pod.common import PodId
+from pod.feature import __FEATURE__
+from pod.pickling import ManualPodding, SnapshotPodPickling, StaticPodPickling
+from pod.stats import ExpStat
+from pod.storage import FilePodStorage
+
+
+@dataclass
+class TrainArgs:
+    """Pair of float settings named ``gamma`` and ``alpha``.
+
+    Field order matters for positional construction: ``TrainArgs(gamma, alpha)``.
+    """
+
+    # First positional field.
+    gamma: float
+    # Second positional field.
+    alpha: float
+
+
+def run_iter(nb_path, update_q: Queue):
+    """Execute one notebook cell by cell and report the cost of each snapshot.
+
+    After every step yielded by ``NotebookExecutor.iter()`` the current locals
+    are dumped through ``SnapshotPodPickling``. For each dump this records the
+    elapsed ``time.time()`` difference and how much ``sut.estimate_size()``
+    grew since the previous dump.
+
+    Args:
+        nb_path: Notebook path passed to ``BenchArgs``; it is also embedded in
+            the name of the temporary storage directory.
+        update_q: Queue that receives one dict with the keys ``"nb"``,
+            ``"sizes"``, ``"times"`` and ``"final_size"`` once the run ends.
+    """
+    args = BenchArgs(expname="", nb=nb_path, sut="snapshot")
+    # Load notebook.
+    logger.info(f"PID {os.getpid()}, {nb_path}")
+    nb_cells = Notebooks.nb(args=args)
+    nb_exec = NotebookExecutor(nb_cells)
+
+    # Remove any storage directory left at this path so it starts out empty.
+    pod_storage_path = Path(f"tmp/pod{nb_path}")
+    if pod_storage_path.exists():
+        shutil.rmtree(pod_storage_path)
+
+    # Initialize sut
+    sut = SnapshotPodPickling(pod_storage_path)
+
+    sizes = []
+    times = []
+    last_storage_size = 0
+    # Defined before the loop so that a notebook yielding no cells reports a
+    # final size of 0 instead of failing with NameError below.
+    cur_size = 0
+    for _cell, _the_globals, the_locals in nb_exec.iter():
+        # Dump current state.
+        dump_start_ts = time.time()
+        sut.dump(the_locals)
+        dump_stop_ts = time.time()
+
+        # Record measurements.
+        cur_size = sut.estimate_size()
+        times.append(dump_stop_ts - dump_start_ts)
+        # Per-cell size is the growth of the estimate since the last dump.
+        sizes.append(cur_size - last_storage_size)
+        last_storage_size = cur_size
+        # Reset environment to reduce noise.
+        gc.collect()
+
+    update_q.put({"nb": nb_path, "sizes" : sizes, "times" : times, "final_size" : cur_size})
+    print("DONE")
+    return
+
+
+def _next_result(update_q, procs, poll_seconds=1.0):
+    """Return the next queued result, or None once no worker can produce one."""
+    # multiprocessing.Queue.get raises queue.Empty when the timeout expires.
+    from queue import Empty
+
+    while True:
+        # Sample liveness *before* waiting: if every worker had already exited
+        # and the wait still times out, nothing further will arrive.
+        any_alive = any(p.is_alive() for p in procs)
+        try:
+            return update_q.get(timeout=poll_seconds)
+        except Empty:
+            if not any_alive:
+                return None
+
+
+def find_bench_size(nbs):
+    """Run ``run_iter`` on every notebook in its own process and gather results.
+
+    One worker process is started per entry of ``nbs``; all workers are started
+    before any result is collected.
+
+    Args:
+        nbs: Sized collection of notebook paths.
+
+    Returns:
+        Dict mapping each reported notebook path to a dict with the keys
+        ``"sizes"``, ``"times"`` and ``"final_size"``. Notebooks whose worker
+        exited without reporting are omitted. Returns ``None`` if a worker
+        process could not be started.
+    """
+    procs: List[Process] = []
+    update_q = Queue()
+    for nb_path in nbs:
+        p = Process(target=run_iter, args=(nb_path, update_q))
+        procs.append(p)
+        try:
+            print("STARTING PROC")
+            p.start()
+        except Exception:
+            # Not a bare except: Ctrl-C and SystemExit must still propagate.
+            logger.exception("ERROR STARTING PROCESS")
+            return
+
+    global_data = {}
+    for _ in range(len(nbs)):
+        print("GETTING FROM UPD")
+        data = _next_result(update_q, procs)
+        if data is None:
+            # A worker died before reporting; a plain blocking get() would
+            # wait forever here, so return what was collected instead.
+            logger.error("WORKERS EXITED BEFORE REPORTING ALL RESULTS")
+            break
+        global_data[data["nb"]] = {"sizes" : data["sizes"], "times" : data["times"], "final_size" : data["final_size"]}
+    for p in procs:
+        try:
+            p.join()
+        except Exception:
+            logger.exception("ERROR JOINING")
+    return global_data
+
+
+if __name__ == "__main__":
+    # logger.info(f"Arguments {sys.argv}")
+    # Notebooks to measure; the commented-out entry is skipped.
+    notebook_paths = [
+        "notebooks/it-s-that-time-of-the-year-again.ipynb",
+        "notebooks/better-xgb-baseline.ipynb",
+        "notebooks/fast-fourier-transform-denoising.ipynb",
+        "notebooks/cv19w3-2-v2-play-2-v3fix-sub-last6dayopt.ipynb",
+        # "notebooks/amex-dataset.ipynb",
+        "notebooks/denoising-with-direct-wavelet-transform.ipynb",
+        "notebooks/04_training_linear_models.ipynb",
+    ]
+    bench_data = find_bench_size(notebook_paths)
+    # Serialize first, so the output file is only opened once the JSON text
+    # has been produced.
+    serialized = json.dumps(bench_data, indent=4)
+    with open("benchdata.json", "w") as out_file:
+        print("WRITING")
+        out_file.write(serialized)
diff --git a/pod/inspect_qt.py b/pod/inspect_qt.py
@@ -0,0 +1,65 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from itertools import product
+from model import QLearningPoddingModel
+
+
+def idx_to_state(idx):
+    """Decode a flat index into one value from each of the lists below.
+
+    The index is read as a mixed-radix number in which the last list varies
+    fastest, i.e. the position of a tuple in the Cartesian product of the
+    lists taken in the order shown.
+
+    Returns:
+        Tuple with one element per list, in the same order as the lists.
+    """
+    # The lists from which the Cartesian product was created
+    axes = [
+        [True, False],
+        QLearningPoddingModel.SIZES,
+        QLearningPoddingModel.SIZES,
+        QLearningPoddingModel.PROBABILITIES,
+        QLearningPoddingModel.PROBABILITIES,
+        QLearningPoddingModel.TYPES,
+        QLearningPoddingModel.ACTION_CHOICES,
+    ]
+
+    decoded = []
+    # Peel off the fastest-varying (last) axis first, then move leftwards.
+    for axis in reversed(axes):
+        idx, offset = divmod(idx, len(axis))
+        decoded.append(axis[offset])
+    # Values were collected last-axis-first; restore the original axis order.
+    return tuple(reversed(decoded))
+
+
+def sort_by_max(arr):
+    """Order the rows of ``arr`` by their maximum along axis 1, largest first.
+
+    Returns:
+        ``(order, rows)`` where ``order`` holds the row indices in descending
+        order of row maximum and ``rows`` is ``arr`` indexed by ``order``.
+    """
+    # argsort is ascending, so reverse it to put the largest maximum first.
+    order = np.argsort(np.max(arr, axis=1))[::-1]
+    return order, arr[order]
+
+
+def inspect(qt_path, fresh_qt_path="qtables/EVAL.npy", out_path="differences.txt"):
+    """Write out the rows where two saved Q-tables disagree on the argmax.
+
+    Both tables are loaded with ``np.load`` and compared row by row. Rows are
+    visited in descending order of their maximum in the table at ``qt_path``,
+    after masking the values 10, 20 and 30. A row is reported when the column
+    holding its maximum differs between the two tables.
+
+    Args:
+        qt_path: Path of the ``.npy`` table to inspect.
+        fresh_qt_path: Path of the ``.npy`` table to compare against.
+        out_path: Text file that receives one line per differing row; it is
+            overwritten if it exists.
+    """
+    qt = np.load(qt_path)
+    fresh_qt = np.load(fresh_qt_path)
+    # Entries equal to 10, 20 or 30 are replaced by a large negative value so
+    # they cannot decide the row maximum used for ordering.
+    # NOTE(review): presumably these are the table's initial values - confirm.
+    used_values = np.where((qt == 10) | (qt == 20) | (qt == 30), -10000, qt)
+    sorted_used_idx, _ = sort_by_max(used_values)
+    with open(out_path, "w") as diff_file:
+        for idx in sorted_used_idx:
+            # Skip rows whose largest value is below 1e-8. The threshold must
+            # be applied to the row maximum; comparing element-wise first and
+            # then taking np.max skips any row with a single small entry.
+            if np.max(qt[idx]) < 1e-8:
+                continue
+            if np.argmax(qt[idx]) != np.argmax(fresh_qt[idx]):
+                diff_file.write(f"STATE {idx_to_state(idx)} VALUE IN USED {qt[idx]} VALUE IN FRESH {fresh_qt[idx]}\n")
+
+
+
+if __name__ == "__main__":
+    # Q-table file to inspect when this module is run as a script.
+    trained_qtable_path = "qtables/0-6&0-1.npy"
+    inspect(trained_qtable_path)