Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 1 addition & 17 deletions experiments/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ services:
stdin_open: true
volumes:
- ${PWD}:/pod
- neo4j_data:/neo4j_data/data
- /data/elastic-notebook/data:/pod/nbdata
working_dir: /pod
command: ["/bin/bash"]

Expand All @@ -30,21 +30,6 @@ services:
- redis_data:/data
command: ["redis-server", "--port", "6379"]

# Neo4j service definition. The enterprise image is the active one; the
# community image is left commented out as an alternative.
podneo4j:
# image: neo4j:5.15.0-community-bullseye
image: neo4j:5.15.0-enterprise-bullseye
hostname: podneo4j
tty: true
stdin_open: true
volumes:
# Named volume `neo4j_data` mounted at /data inside the container.
- neo4j_data:/data
environment:
NEO4J_ACCEPT_LICENSE_AGREEMENT: "yes" # Using Neo4j Developer License (https://neo4j.com/licensing/)
# NOTE(review): credentials are committed in plain text here — confirm this
# compose file is only used for local experiments.
NEO4J_AUTH: neo4j/podneo4jPassword
# Page cache and maximum heap are both set to 2G.
NEO4J_server_memory_pagecache_size: 2G
NEO4J_server_memory_heap_max__size: 2G
command: ["neo4j"]

podmongo:
image: mongo:7.0.5-jammy
hostname: podmongo
Expand All @@ -53,4 +38,3 @@ services:

volumes:
redis_data:
neo4j_data:
1 change: 1 addition & 0 deletions experiments/pod.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ RUN python -m pip install -r /pod/requirements.txt
COPY ./pod /pod/pod
COPY ./setup.py /pod/setup.py
COPY ./README.md /pod/README.md

RUN python -m pip install -e /pod/

WORKDIR /
Expand Down
12 changes: 12 additions & 0 deletions experiments/train.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Launch one training run per (alpha, gamma) pair inside the
# `experiments-pod-1` container. Each run is started in the background and its
# stdout/stderr are both shown on the terminal and copied to per-run log files
# under eval_logs/.
#
# bash is required: the stderr capture below uses process substitution,
# which plain POSIX sh does not support.

# Create the log directory once, before any run starts writing into it.
mkdir -p eval_logs

for alpha in 0.1; do
    for gamma in 0.7; do
        echo "alpha $alpha gamma $gamma"
        stdout_file="eval_logs/stdout_alpha${alpha}_gamma${gamma}.log"
        stderr_file="eval_logs/stderr_alpha${alpha}_gamma${gamma}.log"
        # stderr goes through its own tee (and back to stderr) so the two
        # streams end up in separate files; stdout is tee'd by the pipeline.
        ( docker exec experiments-pod-1 python pod/train.py --gamma "$gamma" --alpha "$alpha" 2> >(tee "$stderr_file" >&2) | tee "$stdout_file" ) &
    done
done

# Block until every background run has finished so the final message is
# accurate and the log files are complete when the script returns.
wait
echo "done with script"
124 changes: 124 additions & 0 deletions pod/find_bench_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import contextlib
import gc
import io
import multiprocessing as mp
import os
import shutil
import sys
import time
from dataclasses import dataclass
from functools import partial
from multiprocessing import Process, Queue
from pathlib import Path
from typing import List
import json

import numpy as np
from loguru import logger
from model import QLearningPoddingModel
from tqdm import tqdm

from pod.bench import BenchArgs, NotebookExecutor, Notebooks
from pod.common import PodId
from pod.feature import __FEATURE__
from pod.pickling import ManualPodding, SnapshotPodPickling, StaticPodPickling
from pod.stats import ExpStat
from pod.storage import FilePodStorage


@dataclass
class TrainArgs:
    """Pair of float settings, ``gamma`` and ``alpha``.

    Both fields are required; ``@dataclass`` generates the constructor,
    ``__repr__`` and ``__eq__``.
    """

    # presumably the discount factor of the Q-learning model — TODO confirm
    gamma: float
    # presumably the learning rate of the Q-learning model — TODO confirm
    alpha: float


def run_iter(nb_path, update_q: Queue):
    """Run one notebook under snapshot pickling and report per-cell costs.

    After every cell produced by the notebook executor, the cell's locals are
    dumped through ``SnapshotPodPickling`` and two measurements are recorded:
    the wall-clock time of the dump and the growth of the storage size
    estimate since the previous cell.

    Args:
        nb_path: Notebook path; also used to derive the storage directory
            ``tmp/pod<nb_path>``.
        update_q: Queue on which exactly one result dict is put, with keys
            ``"nb"``, ``"sizes"``, ``"times"`` and ``"final_size"``.
    """
    args = BenchArgs(expname="", nb=nb_path, sut="snapshot")
    logger.info(f"PID {os.getpid()}, {nb_path}")
    nb_cells = Notebooks.nb(args=args)
    nb_exec = NotebookExecutor(nb_cells)

    # Wipe any storage left by a previous run so sizes start from zero.
    pod_storage_path = Path(f"tmp/pod{nb_path}")
    if pod_storage_path.exists():
        shutil.rmtree(pod_storage_path)

    sut = SnapshotPodPickling(pod_storage_path)

    sizes = []
    times = []
    last_storage_size = 0
    # Bound before the loop so a notebook that yields no cells still reports
    # a result instead of raising NameError (which would leave the parent
    # waiting for a message that never arrives).
    cur_size = 0
    for _cell, _the_globals, the_locals in nb_exec.iter():
        # Dump current state.
        dump_start_ts = time.time()
        sut.dump(the_locals)
        dump_stop_ts = time.time()

        # Record measurements: dump duration and storage growth for this cell.
        cur_size = sut.estimate_size()
        times.append(dump_stop_ts - dump_start_ts)
        sizes.append(cur_size - last_storage_size)
        last_storage_size = cur_size

        # Reset environment to reduce noise.
        gc.collect()

    update_q.put({"nb": nb_path, "sizes": sizes, "times": times, "final_size": cur_size})
    print("DONE")


def find_bench_size(nbs):
    """Measure snapshot sizes and dump times for several notebooks in parallel.

    One worker process running ``run_iter`` is started per notebook; each
    worker reports a single result dict through a shared queue.

    Args:
        nbs: Sequence of notebook paths.

    Returns:
        A dict mapping each notebook path to
        ``{"sizes": ..., "times": ..., "final_size": ...}``. If some workers
        exit without reporting, the dict holds only the results received.
        ``None`` is returned when a worker process cannot be started.
    """
    # Local import: only this function needs the queue-timeout exception.
    from queue import Empty

    poll_seconds = 1.0
    procs = []
    update_q = Queue()
    for nb_path in nbs:
        p = Process(target=run_iter, args=(nb_path, update_q))
        try:
            print("STARTING PROC")
            p.start()
        except Exception:
            logger.exception("ERROR STARTING PROCESS")
            # Do not leave the workers that did start running unattended.
            for started in procs:
                started.terminate()
                started.join()
            return None
        procs.append(p)

    global_data = {}
    popped = 0
    while popped < len(nbs):
        print("GETTING FROM UPD")
        data = None
        # Poll with a timeout instead of blocking forever: a worker that
        # crashed before reporting would otherwise hang this loop.
        while data is None:
            try:
                data = update_q.get(timeout=poll_seconds)
            except Empty:
                if any(p.is_alive() for p in procs):
                    continue
                # Every worker has exited; take one last look in case a
                # result was flushed just before the final exit.
                try:
                    data = update_q.get(timeout=poll_seconds)
                except Empty:
                    break
        if data is None:
            logger.error(
                f"Workers exited after {popped} of {len(nbs)} results; returning partial data"
            )
            break
        popped += 1
        global_data[data["nb"]] = {
            "sizes": data["sizes"],
            "times": data["times"],
            "final_size": data["final_size"],
        }

    # The queue is drained before joining so no worker is blocked on a put.
    for p in procs:
        p.join()
    return global_data


if __name__ == "__main__":
    # Notebooks to measure; find_bench_size starts one process per entry.
    notebook_paths = [
        "notebooks/it-s-that-time-of-the-year-again.ipynb",
        "notebooks/better-xgb-baseline.ipynb",
        "notebooks/fast-fourier-transform-denoising.ipynb",
        "notebooks/cv19w3-2-v2-play-2-v3fix-sub-last6dayopt.ipynb",
        # "notebooks/amex-dataset.ipynb",
        "notebooks/denoising-with-direct-wavelet-transform.ipynb",
        "notebooks/04_training_linear_models.ipynb",
    ]
    bench_data = find_bench_size(notebook_paths)

    # Serialise first, then open the output file and write the text.
    serialized = json.dumps(bench_data, indent=4)
    with open("benchdata.json", "w") as out_file:
        print("WRITING")
        out_file.write(serialized)
65 changes: 65 additions & 0 deletions pod/inspect_qt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from model import QLearningPoddingModel


def idx_to_state(idx):
    """Decode a flat index into the tuple of values it stands for.

    ``idx`` is read as a mixed-radix number over the axes listed below, with
    the last axis as the least significant digit. The returned tuple holds
    one value per axis, in the order the axes are listed.
    """
    # The lists from which the Cartesian product was created.
    axes = [
        [True, False],
        QLearningPoddingModel.SIZES,
        QLearningPoddingModel.SIZES,
        QLearningPoddingModel.PROBABILITIES,
        QLearningPoddingModel.PROBABILITIES,
        QLearningPoddingModel.TYPES,
        QLearningPoddingModel.ACTION_CHOICES,
    ]

    remaining = idx
    decoded = []
    # Peel off one digit per axis, least significant (last axis) first.
    for axis in reversed(axes):
        remaining, position = divmod(remaining, len(axis))
        decoded.append(axis[position])
    return tuple(reversed(decoded))


def sort_by_max(arr):
    """Order the rows of ``arr`` by their maximum value, largest first.

    Returns:
        A pair ``(order, rows)``: ``order`` is the array of row indices in
        descending order of row maximum, and ``rows`` is ``arr`` indexed by
        that order.
    """
    row_peaks = np.max(arr, axis=1)
    # argsort is ascending, so flip it to get the largest maximum first.
    order = np.flip(np.argsort(row_peaks))
    return order, arr[order]


def inspect(qt_path, fresh_qt_path="qtables/EVAL.npy", diff_path="differences.txt"):
    """Report rows where two Q-tables disagree on the best column.

    Rows of the table at ``qt_path`` are visited in descending order of their
    maximum, computed after entries equal to 10, 20 or 30 are replaced by
    -10000 (presumably those are the untouched initial values — TODO
    confirm). For every row whose arg-max differs from the same row of the
    baseline table, one line is written to ``diff_path``.

    Args:
        qt_path: Path of the ``.npy`` table to inspect.
        fresh_qt_path: Path of the baseline ``.npy`` table to compare against.
        diff_path: Text file that receives the report; it is overwritten.
    """
    qt = np.load(qt_path)
    fresh_qt = np.load(fresh_qt_path)

    used_values = np.where((qt == 10) | (qt == 20) | (qt == 30), -10000, qt)
    sorted_used_idx, _ = sort_by_max(used_values)

    with open(diff_path, "w") as diff_file:
        for idx in sorted_used_idx:
            # Skip rows whose largest entry is effectively zero or negative.
            # The comparison must sit outside np.max: with it inside, the
            # test was "any entry below 1e-8" rather than "row maximum
            # below 1e-8".
            if np.max(qt[idx]) < 1e-8:
                continue
            if np.argmax(qt[idx]) != np.argmax(fresh_qt[idx]):
                diff_file.write(f"STATE {idx_to_state(idx)} VALUE IN USED {qt[idx]} VALUE IN FRESH {fresh_qt[idx]}\n")



if __name__ == "__main__":
    # Q-table file to compare against the baseline table.
    trained_table_path = "qtables/0-6&0-1.npy"
    inspect(trained_table_path)
Loading