From a83bfc0e6b9f662e83d9192c278892a3b1a6dbd7 Mon Sep 17 00:00:00 2001 From: Ataraxist Date: Thu, 17 Apr 2025 18:14:28 -0700 Subject: [PATCH 1/6] Renaming files to have semantic names. Working through issues. Found a critical flaw in the existing code for the training pipeline. --- tkyo-drift/package-lock.json | 28 ++++++------------- tkyo-drift/package.json | 2 +- tkyo-drift/tkyoDrift.js | 6 ++-- ...ftSetTraining.py => batchEmbController.py} | 10 +++---- ...pythonTrainingEmb.py => batchEmbWriter.py} | 4 +-- .../{pythonKMeans.py => batchMakeKMeans.py} | 0 ...tSetTrainingHook.js => batchPythonHook.js} | 2 +- ...edScalars.py => batchScalarWriteShared.py} | 2 +- ...adTrainingData.py => getHFTrainingData.py} | 0 .../util/{makeLogEntry.js => logMakeEntry.js} | 0 .../{makeErrorLogEntry.js => logMakeError.js} | 0 .../{printLogCLI.js => logPrintCosCLI.js} | 0 ...printScalarCLI.js => logPrintScalarCLI.js} | 4 +-- tkyo-drift/util/oneOffEmb.js | 8 +++--- .../util/{DriftModel.js => oneOffModel.js} | 0 ...calarMetrics.js => scalarCaptureShared.js} | 0 ...calarDistributions.js => scalarCompare.js} | 0 ...dScalarMetrics.js => scalarLoadMetrics.js} | 0 .../util/{pythonHNSW.py => sharedHNSW.py} | 0 19 files changed, 27 insertions(+), 39 deletions(-) rename tkyo-drift/util/{tkyoDriftSetTraining.py => batchEmbController.py} (91%) rename tkyo-drift/util/{pythonTrainingEmb.py => batchEmbWriter.py} (99%) rename tkyo-drift/util/{pythonKMeans.py => batchMakeKMeans.py} (100%) rename tkyo-drift/util/{tkyoDriftSetTrainingHook.js => batchPythonHook.js} (96%) rename tkyo-drift/util/{writeSharedScalars.py => batchScalarWriteShared.py} (98%) rename tkyo-drift/util/{downloadTrainingData.py => getHFTrainingData.py} (100%) rename tkyo-drift/util/{makeLogEntry.js => logMakeEntry.js} (100%) rename tkyo-drift/util/{makeErrorLogEntry.js => logMakeError.js} (100%) rename tkyo-drift/util/{printLogCLI.js => logPrintCosCLI.js} (100%) rename tkyo-drift/util/{printScalarCLI.js => logPrintScalarCLI.js} (97%) 
rename tkyo-drift/util/{DriftModel.js => oneOffModel.js} (100%) rename tkyo-drift/util/{captureSharedScalarMetrics.js => scalarCaptureShared.js} (100%) rename tkyo-drift/util/{compareScalarDistributions.js => scalarCompare.js} (100%) rename tkyo-drift/util/{loadScalarMetrics.js => scalarLoadMetrics.js} (100%) rename tkyo-drift/util/{pythonHNSW.py => sharedHNSW.py} (100%) diff --git a/tkyo-drift/package-lock.json b/tkyo-drift/package-lock.json index 725ce30..e8397e8 100644 --- a/tkyo-drift/package-lock.json +++ b/tkyo-drift/package-lock.json @@ -1,21 +1,23 @@ { - "name": "tkyodrifttest1", - "version": "1.0.0", + "name": "tkyodrift", + "version": "1.0.6", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "tkyodrifttest1", - "version": "1.0.0", - "license": "ISC", + "name": "tkyodrift", + "version": "1.0.6", + "license": "MIT", "dependencies": { "@xenova/transformers": "^2.17.2", "chalk": "^5.4.1", "cli-table3": "^0.6.5", "fs": "^0.0.1-security", "path": "^0.12.7", - "tkyodrifttest1": "^1.0.0", "uuid": "^11.1.0" + }, + "bin": { + "tkyo": "tkyoDrift.js" } }, "node_modules/@colors/colors": { @@ -938,20 +940,6 @@ "b4a": "^1.6.4" } }, - "node_modules/tkyodrifttest1": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/tkyodrifttest1/-/tkyodrifttest1-1.0.0.tgz", - "integrity": "sha512-475elQaD3QMC4zrVPba9k6783lVTBXm2wmjl0SGB7+S/zzw+fi+P2SmrJyzr9IDNA3DpwBNb+o6kamxFiscT+A==", - "license": "ISC", - "dependencies": { - "@xenova/transformers": "^2.17.2", - "chalk": "^5.4.1", - "cli-table3": "^0.6.5", - "fs": "^0.0.1-security", - "path": "^0.12.7", - "uuid": "^11.1.0" - } - }, "node_modules/tunnel-agent": { "version": "0.6.0", "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", diff --git a/tkyo-drift/package.json b/tkyo-drift/package.json index 40b16d2..94cfa55 100644 --- a/tkyo-drift/package.json +++ b/tkyo-drift/package.json @@ -3,7 +3,7 @@ "version": "1.0.6", "description": "Lightweight CLI tool and library for 
detecting AI model drift using embeddings and scalar metrics. Tracks semantic, conceptual, and lexical change over time.", "main": "./tkyoDrift.js", - "bin":{ + "bin": { "tkyo": "./tkyoDrift.js" }, "types": "./tkyo.d.ts", diff --git a/tkyo-drift/tkyoDrift.js b/tkyo-drift/tkyoDrift.js index eb25e72..5fcd558 100755 --- a/tkyo-drift/tkyoDrift.js +++ b/tkyo-drift/tkyoDrift.js @@ -42,9 +42,9 @@ @@@@@@@@@@@@@@@@@%+:--::=****=:..::-. ...... ...:::::.......................... . @%%%####******+++++++++=============------:::::............. ...............................::::::::::::::::::::::------=====+++++++*******#######%%%%%%@@@@@@@ @@@@@@@@@@@@@@@@@@%%%##############%%%%%%%%%%%%%%%%%%%%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/ -import tkyoDriftSetTrainingHook from './util/tkyoDriftSetTrainingHook.js'; -import printScalarCLI from './util/printScalarCLI.js'; -import printLogCLI from './util/printLogCLI.js'; +import tkyoDriftSetTrainingHook from './util/batchPythonHook.js'; +import printScalarCLI from './util/logPrintScalarCLI.js'; +import printLogCLI from './util/logPrintCosCLI.js'; import tkyoDrift from './util/oneOffEmb.js'; import chalk from 'chalk'; import path from 'path'; diff --git a/tkyo-drift/util/tkyoDriftSetTraining.py b/tkyo-drift/util/batchEmbController.py similarity index 91% rename from tkyo-drift/util/tkyoDriftSetTraining.py rename to tkyo-drift/util/batchEmbController.py index ce24a66..e5b0021 100644 --- a/tkyo-drift/util/tkyoDriftSetTraining.py +++ b/tkyo-drift/util/batchEmbController.py @@ -2,8 +2,8 @@ import sys sys.dont_write_bytecode = True # Import helper function to load and embed the data -import pythonTrainingEmb -from writeSharedScalars import write_shared_scalar_metrics +import batchEmbWriter +from batchScalarWriteShared import write_shared_scalar_metrics # Allows the use of time functions @@ -22,8 +22,8 @@ def 
tkyoDriftSetTraining(data_set_Path, io_type, io_type_name): MODELS = { # 't5': 'Xenova/sentence-t5-large', # 'bert': 'Xenova/sentence_bert', - 'mini': 'Xenova/all-MiniLM-L12-v2', - 'e5': 'Xenova/e5-base-v2', + 'mini': 'all-MiniLM-L12-v2', + 'e5': 'e5-base-v2', } @@ -32,7 +32,7 @@ def tkyoDriftSetTraining(data_set_Path, io_type, io_type_name): # Iterate through models dictionary for model_type, model_name in MODELS.items(): - pythonTrainingEmb.trainingEmb( + batchEmbWriter.trainingEmb( model_type=model_type, model_name=model_name, data_path=data_set_Path, diff --git a/tkyo-drift/util/pythonTrainingEmb.py b/tkyo-drift/util/batchEmbWriter.py similarity index 99% rename from tkyo-drift/util/pythonTrainingEmb.py rename to tkyo-drift/util/batchEmbWriter.py index 9a6ec73..98d4d6a 100644 --- a/tkyo-drift/util/pythonTrainingEmb.py +++ b/tkyo-drift/util/batchEmbWriter.py @@ -3,7 +3,7 @@ sys.dont_write_bytecode = True # Import helper function to create kmeans of data -import pythonKMeans +import batchMakeKMeans # This is good for vectors/matrices import numpy as np @@ -250,7 +250,7 @@ def chunk_text(text, tokenizer, max_length=512, stride=256): embeddings.astype(np.float32).tofile(f) else: print(f"You have >= 100000 {io_type} embeddings: Performing K Means analysis to filter embeddings.") - kMeansEmbedding = pythonKMeans.kMeansClustering(embeddings) + kMeansEmbedding = batchMakeKMeans.kMeansClustering(embeddings) # Assign the number of vectors for the training data num_vectors = kMeansEmbedding.shape[0] diff --git a/tkyo-drift/util/pythonKMeans.py b/tkyo-drift/util/batchMakeKMeans.py similarity index 100% rename from tkyo-drift/util/pythonKMeans.py rename to tkyo-drift/util/batchMakeKMeans.py diff --git a/tkyo-drift/util/tkyoDriftSetTrainingHook.js b/tkyo-drift/util/batchPythonHook.js similarity index 96% rename from tkyo-drift/util/tkyoDriftSetTrainingHook.js rename to tkyo-drift/util/batchPythonHook.js index 16c2482..51ca538 100644 --- 
a/tkyo-drift/util/tkyoDriftSetTrainingHook.js +++ b/tkyo-drift/util/batchPythonHook.js @@ -26,7 +26,7 @@ export default async function tkyoDriftSetTraining( ); } // Ensures we are running tkyoDriftSetTraining.py correctly - const scriptPath = path.join(__dirname, './tkyoDriftSetTraining.py'); + const scriptPath = path.join(__dirname, './batchEmbController.py'); const pyProg = spawn('python3', [ '-u', scriptPath, diff --git a/tkyo-drift/util/writeSharedScalars.py b/tkyo-drift/util/batchScalarWriteShared.py similarity index 98% rename from tkyo-drift/util/writeSharedScalars.py rename to tkyo-drift/util/batchScalarWriteShared.py index cd906b5..d76ce5f 100644 --- a/tkyo-drift/util/writeSharedScalars.py +++ b/tkyo-drift/util/batchScalarWriteShared.py @@ -4,7 +4,7 @@ import numpy as np import time from datetime import datetime -from pythonTrainingEmb import resolve_io_column +from batchEmbWriter import resolve_io_column # * Writes shared scalar metrics (like character length, entropy, etc.) for training data # * One file is created per metric (e.g., ioTypeName.characterLength.training.scalar.jsonl) diff --git a/tkyo-drift/util/downloadTrainingData.py b/tkyo-drift/util/getHFTrainingData.py similarity index 100% rename from tkyo-drift/util/downloadTrainingData.py rename to tkyo-drift/util/getHFTrainingData.py diff --git a/tkyo-drift/util/makeLogEntry.js b/tkyo-drift/util/logMakeEntry.js similarity index 100% rename from tkyo-drift/util/makeLogEntry.js rename to tkyo-drift/util/logMakeEntry.js diff --git a/tkyo-drift/util/makeErrorLogEntry.js b/tkyo-drift/util/logMakeError.js similarity index 100% rename from tkyo-drift/util/makeErrorLogEntry.js rename to tkyo-drift/util/logMakeError.js diff --git a/tkyo-drift/util/printLogCLI.js b/tkyo-drift/util/logPrintCosCLI.js similarity index 100% rename from tkyo-drift/util/printLogCLI.js rename to tkyo-drift/util/logPrintCosCLI.js diff --git a/tkyo-drift/util/printScalarCLI.js b/tkyo-drift/util/logPrintScalarCLI.js similarity index 
97% rename from tkyo-drift/util/printScalarCLI.js rename to tkyo-drift/util/logPrintScalarCLI.js index 1ed7ec0..14e8c43 100644 --- a/tkyo-drift/util/printScalarCLI.js +++ b/tkyo-drift/util/logPrintScalarCLI.js @@ -2,8 +2,8 @@ import fs from 'fs'; import path from 'path'; import chalk from 'chalk'; import Table from 'cli-table3'; -import { compareScalarDistributions } from './compareScalarDistributions.js'; -import { loadScalarMetrics } from './loadScalarMetrics.js'; +import { compareScalarDistributions } from './scalarCompare.js'; +import { loadScalarMetrics } from './scalarLoadMetrics.js'; import { OUTPUT_DIR } from './oneOffEmb.js'; export default async function printScalarCLI() { diff --git a/tkyo-drift/util/oneOffEmb.js b/tkyo-drift/util/oneOffEmb.js index 1af6b12..504957e 100644 --- a/tkyo-drift/util/oneOffEmb.js +++ b/tkyo-drift/util/oneOffEmb.js @@ -1,10 +1,10 @@ import fs from 'fs'; import path from 'path'; import { v4 } from 'uuid'; -import { DriftModel } from './DriftModel.js'; -import makeLogEntry from './makeLogEntry.js'; -import makeErrorLogEntry from './makeErrorLogEntry.js'; -import captureSharedScalarMetrics from './captureSharedScalarMetrics.js'; +import { DriftModel } from './oneOffModel.js'; +import makeLogEntry from './logMakeEntry.js'; +import makeErrorLogEntry from './logMakeError.js'; +import captureSharedScalarMetrics from './scalarCaptureShared.js'; // * Global Variables for the utilities // Embedding Models diff --git a/tkyo-drift/util/DriftModel.js b/tkyo-drift/util/oneOffModel.js similarity index 100% rename from tkyo-drift/util/DriftModel.js rename to tkyo-drift/util/oneOffModel.js diff --git a/tkyo-drift/util/captureSharedScalarMetrics.js b/tkyo-drift/util/scalarCaptureShared.js similarity index 100% rename from tkyo-drift/util/captureSharedScalarMetrics.js rename to tkyo-drift/util/scalarCaptureShared.js diff --git a/tkyo-drift/util/compareScalarDistributions.js b/tkyo-drift/util/scalarCompare.js similarity index 100% rename from 
tkyo-drift/util/compareScalarDistributions.js rename to tkyo-drift/util/scalarCompare.js diff --git a/tkyo-drift/util/loadScalarMetrics.js b/tkyo-drift/util/scalarLoadMetrics.js similarity index 100% rename from tkyo-drift/util/loadScalarMetrics.js rename to tkyo-drift/util/scalarLoadMetrics.js diff --git a/tkyo-drift/util/pythonHNSW.py b/tkyo-drift/util/sharedHNSW.py similarity index 100% rename from tkyo-drift/util/pythonHNSW.py rename to tkyo-drift/util/sharedHNSW.py From 2349009d95a2fe9019d468a4d698010b66d284a5 Mon Sep 17 00:00:00 2001 From: Ataraxist Date: Thu, 17 Apr 2025 19:53:40 -0700 Subject: [PATCH 2/6] Debugged and tested. Semantic names are working. --- tkyo-drift/README.md | 8 ++++++-- tkyo-drift/util/batchEmbController.py | 4 ++-- tkyo-drift/util/batchEmbWriter.py | 2 +- tkyo-drift/util/oneOffModel.js | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tkyo-drift/README.md b/tkyo-drift/README.md index 8367b5d..a8feaf6 100644 --- a/tkyo-drift/README.md +++ b/tkyo-drift/README.md @@ -115,9 +115,13 @@ tkyoDrift(userSubmission, 'input') 5. Enjoy the benefits of having drift detection: +```bash +npx tkyo cos +npx tkyo scalar +🏎️☁️☁️☁️ ← THAT GUY IS DRIFTING ``` -🏎️☁️☁️☁️ <- THAT GUY IS DRIFTING -``` + +This library will create a tkyoData folder at the project root! Don't forget to add it to your `.gitignore` as it may contain large files depending on your throughput. All logs, scalars, and binary files tkyoDrift needs to operate will be placed there. # How do you use this thing? 
diff --git a/tkyo-drift/util/batchEmbController.py b/tkyo-drift/util/batchEmbController.py index e5b0021..a5c1dae 100644 --- a/tkyo-drift/util/batchEmbController.py +++ b/tkyo-drift/util/batchEmbController.py @@ -22,8 +22,8 @@ def tkyoDriftSetTraining(data_set_Path, io_type, io_type_name): MODELS = { # 't5': 'Xenova/sentence-t5-large', # 'bert': 'Xenova/sentence_bert', - 'mini': 'all-MiniLM-L12-v2', - 'e5': 'e5-base-v2', + 'mini': 'sentence-transformers/all-MiniLM-L12-v2', + 'e5': 'intfloat/e5-base-v2', } diff --git a/tkyo-drift/util/batchEmbWriter.py b/tkyo-drift/util/batchEmbWriter.py index 98d4d6a..7fa659a 100644 --- a/tkyo-drift/util/batchEmbWriter.py +++ b/tkyo-drift/util/batchEmbWriter.py @@ -150,7 +150,7 @@ def chunk_text(text, tokenizer, max_length=512, stride=256): return chunks # Embed Data - print(f"Embedding {io_type}s using {model_name} for {model_type} knowledge...") + print(f"Embedding {io_type}s using {model_name}") # Initialize an empty list to store all input embeddings embeddings = [] # Set the number of examples to process at once (smaller = less memory, larger = faster) diff --git a/tkyo-drift/util/oneOffModel.js b/tkyo-drift/util/oneOffModel.js index 16e9822..4838025 100644 --- a/tkyo-drift/util/oneOffModel.js +++ b/tkyo-drift/util/oneOffModel.js @@ -294,7 +294,7 @@ export class DriftModel { ); } // Ensures we are running pythonHNSW.py correctly - const scriptPath = path.join(__dirname, 'pythonHNSW.py'); + const scriptPath = path.join(__dirname, 'sharedHNSW.py'); try { return new Promise((resolve, reject) => { From 3b1c6c6c94a53b06b655804d324061459b2b8f60 Mon Sep 17 00:00:00 2001 From: Ataraxist Date: Sat, 26 Apr 2025 23:44:32 -0700 Subject: [PATCH 3/6] Moved getHFTrainingData.py for loading Hugging Face datasets to the main folder as it is not a utility involved in batched or one off embeddings. Updated package.json to remove unused dependencies. 
Fiddled with some console log art --- tkyo-drift/{util => }/getHFTrainingData.py | 0 tkyo-drift/package.json | 3 --- tkyo-drift/tkyoDrift.js | 18 +++++++++--------- 3 files changed, 9 insertions(+), 12 deletions(-) rename tkyo-drift/{util => }/getHFTrainingData.py (100%) diff --git a/tkyo-drift/util/getHFTrainingData.py b/tkyo-drift/getHFTrainingData.py similarity index 100% rename from tkyo-drift/util/getHFTrainingData.py rename to tkyo-drift/getHFTrainingData.py diff --git a/tkyo-drift/package.json b/tkyo-drift/package.json index 39ab862..72d96db 100644 --- a/tkyo-drift/package.json +++ b/tkyo-drift/package.json @@ -16,9 +16,6 @@ "ai-monitoring", "embedding", "model-drift", - "semantic-drift", - "concept-drift", - "lexical-drift", "ai-evaluation", "machine-learning", "transformers", diff --git a/tkyo-drift/tkyoDrift.js b/tkyo-drift/tkyoDrift.js index 5fcd558..8d5f668 100755 --- a/tkyo-drift/tkyoDrift.js +++ b/tkyo-drift/tkyoDrift.js @@ -106,14 +106,14 @@ if (process.argv[1].endsWith('tkyo')) { default: console.log( chalk.gray(` -↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ ↑↑↑ ↗↓↓↓↗ ↓↓↓ ↓↓↓ ↓↓↓↓↓↓↓↓↓↓↓↓↖ - ↑↑↑ ↑↑↑ ↗↑↑↑ ↑↑↑ ↑↑↑ ↑↑↑↑ ↖↑↑ - ↑↑↑ ↑↑↑ ↗↑↑↑ ↑↑↑ ↑↑↑ ↑↑↑ ↖↑↑ - ↑↑↑ β†‘β†‘β†‘β†‘β†‘β†‘β†‘β†˜ ↑↑↑ ↑↑↑↑ ↑↑↑ ↖↑↑ - ↖↑↑ →↑↑ β†‘β†‘β†‘β†˜ ↑↑↑↑↑↑↑↑↑↑↑↑↑ ←↑↑ ↑↑↑↗ - ↑↑↑ ↑↑↑ β†‘β†‘β†‘β†˜ ↑↑↑ ↑↑↑ ↗↑↑↓ - ↑↑↑ ↑↑↑ β†‘β†‘β†‘β†˜ ↑↑↑ ↑↑↑↑ ↗↑↑↑ - ↑↑↑ ↑↑↑ β†‘β†‘β†‘β†˜ ↑↑↑ ↑↑↑↑↑↑↑↑↑↑↑↑↑↗ +↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ ↗↑↑ ↗↓↓↓↗ ↓↓↓ ↓↓↓ ↓↓↓↓↓↓↓↓↓↓↓↓↖ + ↗↑↑ ↗↑↑ ↗↑↑↑ ↗↑↑ ↗↑↑ ↗↑↑↑ ↖↑↑ + ↗↑↑ ↗↑↑ ↗↑↑↑ ↗↑↑ ↗↑↑ ↗↑↑ ↖↑↑ + ↗↑↑ β†‘β†‘β†‘β†‘β†‘β†‘β†‘β†˜ ↗↑↑ ↗↑↑↑ ↗↑↑ ↖↑↑ + ↖↑↑ →↑↑ β†‘β†‘β†‘β†˜ ↑↑↑↑↑↑↑↑↑↑↑↑↑ ←↑↑ ↗↑↑↓ + ↗↑↑ ↗↑↑ β†‘β†‘β†‘β†˜ ↑↑↑ ↖↑↑ ↗↑↑↓ + ↗↑↑ ↗↑↑ β†‘β†‘β†‘β†˜ ↑↑↑ ↖↑↑↑ ↗↑↑↗ + ↗↑↑ ↗↑↑ β†‘β†‘β†‘β†˜ ↑↑↑ ↖↑↑↑↑↑↑↑↑↑↑↑↑↗ Usage: ${chalk.yellowBright('tkyo')} ${chalk.white('cos')} ${chalk.blueBright( @@ -126,7 +126,7 @@ Usage: ' ' )} Embed dataset and update training baseline -Readme docs in the node package or at ${chalk.blueBright( +Readme docs are in the node package or at ${chalk.blueBright( 
'https://github.com/oslabs-beta/tkyo-drift' )} `) From 3aaacc91d350c581774cbe7df4a3a8ab748063d9 Mon Sep 17 00:00:00 2001 From: Ataraxist Date: Sun, 27 Apr 2025 01:11:01 -0700 Subject: [PATCH 4/6] First feature expansion of the TKYO Drift library to log input texts to their own CSV file for analysis and comparison. Notably there are some unanswered questions here regarding the storage of large text over time. Theoretically this could be a problem. Additionally, some workflows may EXPLICITLY not want to track inputs, but this feature is being implemented as a hardcoded "always do this". I will see about implementing a bash command to enable or disable this feature in the next commit. --- tkyo-drift/README.md | 7 +- tkyo-drift/getHFTrainingData.py | 32 +++++- tkyo-drift/package-lock.json | 4 +- tkyo-drift/package.json | 2 +- tkyo-drift/tkyoDrift.js | 37 +++++++ tkyo-drift/util/batchEmbController.py | 24 +++++ tkyo-drift/util/batchEmbWriter.py | 69 +++++++++++- tkyo-drift/util/batchMakeKMeans.py | 24 +++++ tkyo-drift/util/batchPythonHook.js | 18 ++++ tkyo-drift/util/batchScalarWriteShared.py | 30 ++++++ .../{logMakeEntry.js => logMakeDriftEntry.js} | 13 +++ tkyo-drift/util/logMakeError.js | 12 ++- tkyo-drift/util/logMakeInputEntry.js | 63 +++++++++++ tkyo-drift/util/logPrintCosCLI.js | 16 ++- tkyo-drift/util/logPrintScalarCLI.js | 39 ++++++- tkyo-drift/util/oneOffEmb.js | 63 +++++++++-- tkyo-drift/util/oneOffModel.js | 100 ++++++++++++++++-- tkyo-drift/util/scalarCaptureShared.js | 30 +++++- tkyo-drift/util/scalarCompare.js | 42 +++++++- tkyo-drift/util/scalarLoadMetrics.js | 20 +++- tkyo-drift/util/sharedHNSW.py | 48 +++++++++ 21 files changed, 651 insertions(+), 42 deletions(-) rename tkyo-drift/util/{logMakeEntry.js => logMakeDriftEntry.js} (80%) create mode 100644 tkyo-drift/util/logMakeInputEntry.js diff --git a/tkyo-drift/README.md b/tkyo-drift/README.md index baaebb2..8e961fa 100644 --- a/tkyo-drift/README.md +++ b/tkyo-drift/README.md @@ -24,7 +24,7 @@ In 
production, even minor changes to prompts, model weights, or input phrasing c And it’s not just the model: user language evolves too. New slang, trending phrases, or tone shifts may emerge that your model wasn't trained on and without observability, you'll miss them. -TKYO Drift embeds each message and compares it against a configurable baseline using **Cosine similarity**, **Euclidean distance**, and scalar features like **punctuation density**, **entropy**, and more. The result is a continuous record of how your model’s and users’ behavior changes over time. +TKYO Drift embeds each message and compares it against a configurable baseline using **Cosine similarity**, **Euclidean distance**, and scalar features like **punctuation density**, **entropy**, and more. The result is a continuous record of how your model's and users' behavior changes over time. Use it to answer questions like: @@ -326,7 +326,8 @@ ID, TIMESTAMP, I/O TYPE, SEMANTIC ROLLING EUC, SEMANTIC TRAINING EUC, CONCEPT RO - Cosine similarities and euclidean distances are recorded per model and baseline type. - Additional metadata like ioType, date and UUIDs are included for tracking. -- Neither the log, nor the binary files, contain your users input or AI outputs. This data is not necessary to calculate drift, and its exclusion is an intentional choice for data privacy. +- Text inputs are logged in a separate `text_log.csv` file for debugging and analysis purposes. This is separate from the drift calculation logs and binary files. +- The binary files contain only the embeddings and do not store the original text inputs or AI outputs. Note: if you add or remove model types to the tkyoDrift tracker, the log will break. Please ensure you clear any existing logs after altering the embedding model names. 
What we mean here, is that if you change your conceptual embedding model from "concept" to "vibes", when writing to the log the makeLogEntry method of the Drift Class would work, but the log parser would fail. @@ -505,7 +506,7 @@ The result is a value between -1 and 1. For normalized embedding vectors (as use - `1.0` β†’ Identical direction (no drift) - `0.0` β†’ Orthogonal (maximum drift) -Normalization ensures magnitude doesn’t influence the result, so only the _direction_ of the vector matters. Additionally, we are calculating the Euclidean Distance. This metric is not scale-invariant and is typically larger in magnitude. It’s useful in conjunction with cosine similarity to detect both directional and magnitude-based drift. +Normalization ensures magnitude doesn't influence the result, so only the _direction_ of the vector matters. Additionally, we are calculating the Euclidean Distance. This metric is not scale-invariant and is typically larger in magnitude. It's useful in conjunction with cosine similarity to detect both directional and magnitude-based drift. ## How we get the Baseline (B) diff --git a/tkyo-drift/getHFTrainingData.py b/tkyo-drift/getHFTrainingData.py index 5f4a692..0f26869 100644 --- a/tkyo-drift/getHFTrainingData.py +++ b/tkyo-drift/getHFTrainingData.py @@ -1,15 +1,39 @@ +""" +Utility module for downloading and loading datasets from Hugging Face. +This module provides functionality to download training data from Hugging Face +datasets and store them in the local cache. +""" + # Prevent _pycache_ creation, since these scripts only run on demand import sys sys.dont_write_bytecode = True from datasets import load_dataset -# ? 
If you are using a model on hugging face, you can use this utility to download the training data -# The data will be stored in you ~./cache folder +# Default dataset to load data_location = "SmallDoge/SmallThoughts" -def dataSetLoader (data_location): +def dataSetLoader(data_location): + """ + Load a dataset from Hugging Face and store it in the local cache. + + This function downloads the specified dataset from Hugging Face and + stores it in the user's ~/.cache folder. The dataset can then be used + for training or evaluation purposes. + + Args: + data_location (str): The Hugging Face dataset identifier (e.g., 'username/dataset-name') + + Returns: + Dataset: The loaded Hugging Face dataset object + + Example: + >>> dataset = dataSetLoader("SmallDoge/SmallThoughts") + >>> print(dataset) + """ dataset = load_dataset("SmallDoge/SmallThoughts") print(dataset) return dataset -dataSetLoader(data_location) \ No newline at end of file +# Load the default dataset when the script is run directly +if __name__ == "__main__": + dataSetLoader(data_location) \ No newline at end of file diff --git a/tkyo-drift/package-lock.json b/tkyo-drift/package-lock.json index e8397e8..40fec83 100644 --- a/tkyo-drift/package-lock.json +++ b/tkyo-drift/package-lock.json @@ -1,12 +1,12 @@ { "name": "tkyodrift", - "version": "1.0.6", + "version": "1.0.7", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "tkyodrift", - "version": "1.0.6", + "version": "1.0.7", "license": "MIT", "dependencies": { "@xenova/transformers": "^2.17.2", diff --git a/tkyo-drift/package.json b/tkyo-drift/package.json index 72d96db..27898a4 100644 --- a/tkyo-drift/package.json +++ b/tkyo-drift/package.json @@ -1,6 +1,6 @@ { "name": "tkyodrift", - "version": "1.0.7", + "version": "1.1.0", "description": "Lightweight CLI tool and library for detecting AI model drift using embeddings and scalar metrics. 
Tracks semantic, conceptual, and lexical change over time.", "main": "./tkyoDrift.js", "bin": { diff --git a/tkyo-drift/tkyoDrift.js b/tkyo-drift/tkyoDrift.js index 8d5f668..c3b8e88 100755 --- a/tkyo-drift/tkyoDrift.js +++ b/tkyo-drift/tkyoDrift.js @@ -1,4 +1,12 @@ #!/usr/bin/env node + +/** + * Main entry point for the TKYO Drift CLI tool. + * This module provides the command-line interface for drift analysis, + * including cosine similarity analysis, scalar metric comparison, + * and training data processing. + */ + /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*::::-%@@#:..-:..+@@@@@@@@@%#++==+*%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@%%%%%%%%##########********#######%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@+:#@@@@@@=.-=+#++:..:=+*##*=..*%%@@@@%#=:@@@@@@@@@@%**@@@@@@@@@@@%#+=-::..::-==+**######%%%%%%%@@@@@@@@@@@@@@@@@@@@@@@@@@@@%%#:.-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@@@@@@@@@@@@@@@@@@+%@@@@@@@@@@:.-:=#@@@@@@@@@@@@@%%=%@@@@@@%*.:-*%%%%*-*+=-:.:-=+*#%%######%%%%###***+++=====--------======+++++++****####%%%%@@%%#=--:+@@@@@@@@@@@@@@@@@@@@@@@=-@@@@@@@@@@@@@@@@@@@@@@@ @@ -42,6 +50,7 @@ @@@@@@@@@@@@@@@@@%+:--::=****=:..::-. ...... ...:::::.......................... . @%%%####******+++++++++=============------:::::............. 
...............................::::::::::::::::::::::------=====+++++++*******#######%%%%%%@@@@@@@ @@@@@@@@@@@@@@@@@@%%%##############%%%%%%%%%%%%%%%%%%%%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/ + import tkyoDriftSetTrainingHook from './util/batchPythonHook.js'; import printScalarCLI from './util/logPrintScalarCLI.js'; import printLogCLI from './util/logPrintCosCLI.js'; @@ -53,6 +62,24 @@ import fs from 'fs'; // Get the commands from the CLI (the first 2 are not commands) const [command, ...rest] = process.argv.slice(2); +/** + * Main CLI handler for TKYO Drift commands. + * Processes the following commands: + * - cos: Show cosine similarity drift logs + * - scalar: Show scalar metric drift comparison + * - train: Process training data and update baselines + * + * @example + * Show cosine similarity drift for last 30 days + * tkyo cos 30 + * + * Show scalar metric drift comparison + * tkyo scalar + * + * Process training data + * tkyo train ./data input input + */ + // Only run if the command is a "tkyo" command // if (process.argv[1] === new URL(import.meta.url).pathname) { // ! Alternative, ESM Based if (process.argv[1].endsWith('tkyo')) { @@ -134,4 +161,14 @@ Readme docs are in the node package or at ${chalk.blueBright( } } +/** + * Export the main drift analysis function for programmatic use. + * This allows the drift analysis functionality to be used as a library + * in addition to the CLI interface. + * + * @example + * import tkyoDrift from 'tkyodrift'; + * await tkyoDrift("Sample text", "input"); + */ + export default tkyoDrift; diff --git a/tkyo-drift/util/batchEmbController.py b/tkyo-drift/util/batchEmbController.py index 623747f..f4d7238 100644 --- a/tkyo-drift/util/batchEmbController.py +++ b/tkyo-drift/util/batchEmbController.py @@ -1,3 +1,9 @@ +""" +Controller module for batch processing of embeddings and scalar metrics. 
+This module coordinates the generation of embeddings and scalar metrics +for training data sets. +""" + # Prevent _pycache_ creation, since these scripts only run on demand import sys sys.dont_write_bytecode = True @@ -14,6 +20,24 @@ import traceback def tkyoDriftSetTraining(data_set_Path, io_type, io_type_name): + """ + Process a dataset to generate embeddings and scalar metrics for training. + + This function coordinates the generation of embeddings for multiple models + and computes scalar metrics for the given dataset. It handles both the + embedding generation and scalar metric computation in a single pass. + + Args: + data_set_Path (str): Path to the dataset directory + io_type (str): Type of input/output (e.g., 'input', 'output') + io_type_name (str): Name identifier for the I/O type + + Returns: + dict: A dictionary containing the status and message of the operation + + Raises: + Exception: If any error occurs during processing + """ # Starts the total function timer startTotal = time.perf_counter() diff --git a/tkyo-drift/util/batchEmbWriter.py b/tkyo-drift/util/batchEmbWriter.py index 7fa659a..640dc07 100644 --- a/tkyo-drift/util/batchEmbWriter.py +++ b/tkyo-drift/util/batchEmbWriter.py @@ -1,3 +1,9 @@ +""" +Module for batch processing of text embeddings using transformer models. +This module handles the generation of embeddings for both short and long texts, +with special handling for texts that exceed the model's maximum token length. +""" + # Prevent _pycache_ creation, since these scripts only run on demand import sys sys.dont_write_bytecode = True @@ -23,6 +29,23 @@ import gc def trainingEmb(model_type, model_name, data_path, io_type, io_type_name): + """ + Generate embeddings for a dataset using a specified transformer model. + + This function processes a dataset in batches, handling both short and long texts. + For long texts, it uses a chunking strategy to process them in smaller pieces. 
+ It also computes and saves model-specific scalar metrics for each embedding. + + Args: + model_type (str): Type of model (e.g., 'mini', 'e5') + model_name (str): Name of the transformer model to use + data_path (str): Path to the dataset directory + io_type (str): Type of input/output (e.g., 'input', 'output') + io_type_name (str): Name identifier for the I/O type + + Raises: + ValueError: If no .arrow files are found in the dataset directory + """ # Starts the total function timer startTotal = time.perf_counter() @@ -62,6 +85,16 @@ def trainingEmb(model_type, model_name, data_path, io_type, io_type_name): # When invoked, this will embed the current batch def embed_data(data): + """ + Embed a batch of texts, handling both short and long texts appropriately. + + Args: + data (list): List of text strings to embed + + Returns: + numpy.ndarray: Array of embeddings in the same order as input texts + """ + # Stores texts shorter than 512 tokens short_texts = [] # Stores short text positions in the batch @@ -121,6 +154,16 @@ def embed_data(data): # Handles the embeddings of a single long text def embed_long_text(text): + """ + Embed a single long text by chunking and averaging chunk embeddings. + + Args: + text (str): The text to embed + + Returns: + numpy.ndarray: The averaged embedding vector + """ + chunks = chunk_text(text, tokenizer) tokenized = tokenizer( chunks, @@ -136,6 +179,19 @@ def embed_long_text(text): # Breaks the text up into overlapping chunks def chunk_text(text, tokenizer, max_length=512, stride=256): + """ + Break a long text into overlapping chunks for processing. 
+ + Args: + text (str): The text to chunk + tokenizer: The tokenizer to use + max_length (int): Maximum length of each chunk + stride (int): Number of tokens to overlap between chunks + + Returns: + list: List of text chunks + """ + # Tokenizes each input tokens = tokenizer.encode(text, add_special_tokens=False) # Holds the tokenized chunks @@ -292,6 +348,17 @@ def chunk_text(text, tokenizer, max_length=512, stride=256): return def resolve_io_column(batch, io_type_name): + """ + Resolve the correct column name for I/O data in the batch. + + Args: + batch: The dataset batch + io_type_name (str): Name identifier for the I/O type + + Returns: + list: List of texts from the correct column + """ + try: # ------------------------------- # Case 1: Flat column access @@ -329,7 +396,7 @@ def resolve_io_column(batch, io_type_name): if val is not None: result.append(val) except (KeyError, IndexError, TypeError): - # If something goes wrong (e.g., path doesn’t exist, value is None), skip it + # If something goes wrong (e.g., path doesn't exist, value is None), skip it print(f"[WARN] Skipping row {i}: missing nested path in {io_type_name}") return result diff --git a/tkyo-drift/util/batchMakeKMeans.py b/tkyo-drift/util/batchMakeKMeans.py index 72f98f5..0a90365 100644 --- a/tkyo-drift/util/batchMakeKMeans.py +++ b/tkyo-drift/util/batchMakeKMeans.py @@ -1,3 +1,9 @@ +""" +Module for performing K-means clustering on embedding vectors. +This module provides functionality to cluster embedding vectors into groups +using the K-means algorithm, with automatic determination of optimal cluster count. +""" + # Prevent _pycache_ creation, since these scripts only run on demand import sys sys.dont_write_bytecode = True @@ -9,6 +15,24 @@ import time def kMeansClustering(embeddings): + """ + Perform K-means clustering on a set of embedding vectors. 
+ + This function automatically determines the optimal number of clusters + based on the number of vectors, then performs K-means clustering to + identify centroids that represent the main patterns in the data. + + Args: + embeddings (numpy.ndarray): Array of embedding vectors to cluster + + Returns: + numpy.ndarray: Array of cluster centroids + + Note: + The number of clusters is determined by the formula: sqrt(n/2) * 10, + where n is the number of vectors. This is a heuristic that balances + the granularity of clustering with computational efficiency. + """ # Starts the total function timer startTotal = time.perf_counter() diff --git a/tkyo-drift/util/batchPythonHook.js b/tkyo-drift/util/batchPythonHook.js index 51ca538..a9d4336 100644 --- a/tkyo-drift/util/batchPythonHook.js +++ b/tkyo-drift/util/batchPythonHook.js @@ -1,3 +1,9 @@ +/** + * Utility function to interface with Python batch processing scripts. + * This module provides a bridge between Node.js and Python for batch processing + * of training data and embeddings. + */ + import { spawn } from 'child_process'; import path from 'path'; import { fileURLToPath } from 'url'; @@ -8,6 +14,18 @@ const __filename = fileURLToPath(import.meta.url); // Directory containing the file (tkyo-drift) const __dirname = path.dirname(__filename); +/** + * Sets up and processes training data using Python batch processing scripts. + * This function spawns a Python process to handle batch embedding generation + * and training data setup. 
+ * + * @param {string} dataSetPath - Path to the dataset directory + * @param {string} ioType - Type of input/output (e.g., 'input', 'output') + * @param {string} ioTypeName - Name identifier for the I/O type + * @returns {Promise} The output from the Python process + * @throws {Error} If the dataset path doesn't exist or if Python process fails + */ + export default async function tkyoDriftSetTraining( dataSetPath, ioType, diff --git a/tkyo-drift/util/batchScalarWriteShared.py b/tkyo-drift/util/batchScalarWriteShared.py index d76ce5f..33699cd 100644 --- a/tkyo-drift/util/batchScalarWriteShared.py +++ b/tkyo-drift/util/batchScalarWriteShared.py @@ -1,3 +1,9 @@ +""" +Module for computing and writing shared scalar metrics for text data. +This module calculates various text-based metrics like character length, +entropy, word length, and punctuation density, then writes them to JSONL files. +""" + from datasets import Dataset, concatenate_datasets import os import json @@ -9,6 +15,30 @@ # * Writes shared scalar metrics (like character length, entropy, etc.) for training data # * One file is created per metric (e.g., ioTypeName.characterLength.training.scalar.jsonl) def write_shared_scalar_metrics(data_path, io_type, io_type_name): + """ + Compute and write shared scalar metrics for a dataset of texts. + + This function processes a dataset of texts and computes various scalar metrics + for each text, including: + - Character length + - Character entropy (measures repetition vs. 
diversity) + - Average word length + - Punctuation density + - Uppercase ratio + + The metrics are written to separate JSONL files, one per metric type, + in the format: ioType.metricName.training.scalar.jsonl + + Args: + data_path (str): Path to the dataset directory containing .arrow files + io_type (str): Type of input/output (e.g., 'input', 'output') + io_type_name (str): Name identifier for the I/O type + + Note: + The function includes progress tracking and estimated time remaining + for long-running operations. + """ + # Load all `.arrow` files from the provided dataset directory arrow_files = [ os.path.join(data_path, f) diff --git a/tkyo-drift/util/logMakeEntry.js b/tkyo-drift/util/logMakeDriftEntry.js similarity index 80% rename from tkyo-drift/util/logMakeEntry.js rename to tkyo-drift/util/logMakeDriftEntry.js index 950918e..433e41b 100644 --- a/tkyo-drift/util/logMakeEntry.js +++ b/tkyo-drift/util/logMakeDriftEntry.js @@ -1,7 +1,20 @@ +/** + * Utility function to log drift metrics (cosine similarity or euclidean distance) to a CSV file. + * This function handles both the creation of new log files and appending to existing ones. + * The log file structure is dynamic based on the models and baseline types being used. + */ + import fs from 'fs'; import path from 'path'; import { OUTPUT_DIR } from './oneOffEmb.js'; +/** + * Creates or appends a log entry for drift metrics to the appropriate CSV file. 
+ * + * @param {string} id - Unique identifier (UUID) for the drift analysis + * @param {Object} mathObject - Object containing drift metrics with keys in format "modelType.ioType.baselineType" + * @param {string} type - Type of drift metric, either 'COS' for cosine similarity or 'EUC' for euclidean distance + */ export default function makeLogEntry(id, mathObject, type) { let logPath = ''; // Construct the destination to the log in the data folder diff --git a/tkyo-drift/util/logMakeError.js b/tkyo-drift/util/logMakeError.js index 71d50c9..0c4e313 100644 --- a/tkyo-drift/util/logMakeError.js +++ b/tkyo-drift/util/logMakeError.js @@ -1,8 +1,18 @@ +/** + * Utility function to log errors that occur during drift analysis to a CSV file. + * This function handles both the creation of new error log files and appending to existing ones. + * Errors are logged with timestamps and error messages in a structured format. + */ + import fs from 'fs'; import path from 'path'; import { OUTPUT_DIR } from './oneOffEmb.js'; -// * Logs a structured error entry to a CSV in the data folder +/** + * Creates or appends an error log entry to the error log CSV file. + * + * @param {Error} error - The error object to be logged + */ export default function makeErrorLogEntry(error) { // Build path to error log const logPath = path.join(OUTPUT_DIR, 'logs', 'ERR_log.csv'); diff --git a/tkyo-drift/util/logMakeInputEntry.js b/tkyo-drift/util/logMakeInputEntry.js new file mode 100644 index 0000000..d486894 --- /dev/null +++ b/tkyo-drift/util/logMakeInputEntry.js @@ -0,0 +1,63 @@ +/** + * Utility function to log text inputs that are being analyzed for drift detection. + * This creates and maintains a CSV log file that tracks the text inputs along with their + * unique identifiers and timestamps. This log is separate from the drift metrics log + * to keep the input data distinct from the analysis results. 
+ */ + +import fs from 'fs'; +import path from 'path'; +import { OUTPUT_DIR } from './oneOffEmb.js'; + +/** + * Escapes a string for CSV format by: + * 1. Replacing any double quotes with two double quotes + * 2. Wrapping the entire string in double quotes + * + * @param {string} str - The string to escape + * @returns {string} - The escaped string + */ +function escapeCSV(str) { + // Replace any double quotes with two double quotes + const escaped = str.replace(/"/g, '""'); + // Wrap in double quotes + return `"${escaped}"`; +} + +/** + * Creates or appends a log entry for a text input being analyzed for drift. + * + * @param {string} id - Unique identifier (UUID) for the text input + * @param {string} text - The actual text content being analyzed + */ +export default function logMakeInputEntry(id, text) { + // Construct the path to the text log file in the logs directory + const logPath = path.join(OUTPUT_DIR, 'logs', 'text_log.csv'); + + // Generate an ISO timestamp for when this entry is being made + const timestamp = new Date().toISOString(); + + // Create the CSV row with ID, timestamp, and escaped text content + // ID and timestamp don't need escaping as they won't contain special characters + const row = [id, timestamp, escapeCSV(text)]; + const csvLine = row.join(',') + '\n'; + + // Check if the log file already exists + const fileExists = fs.existsSync(logPath); + + // Write to the log file + try { + if (!fileExists) { + // If the file doesn't exist, create it with headers + const headers = ['ID', 'TIMESTAMP', 'TEXT'].join(',') + '\n'; + fs.writeFileSync(logPath, headers + csvLine); + } else { + // If the file exists, append the new entry + fs.appendFileSync(logPath, csvLine); + } + } catch (error) { + // Log any errors that occur during file operations + // This could be due to permissions, disk space, or file locks + console.error('Failed to write text log entry:', error.message); + } +} \ No newline at end of file diff --git 
a/tkyo-drift/util/logPrintCosCLI.js b/tkyo-drift/util/logPrintCosCLI.js index 3ebdb2b..dd27b60 100644 --- a/tkyo-drift/util/logPrintCosCLI.js +++ b/tkyo-drift/util/logPrintCosCLI.js @@ -1,9 +1,23 @@ +/** + * Utility function to print cosine similarity drift metrics in a formatted CLI table. + * This function reads the cosine similarity log file, processes the data, and displays + * it in a color-coded table showing drift metrics over a specified time period. + */ + import fs from 'fs'; import path from 'path'; import chalk from 'chalk'; import Table from 'cli-table3'; import { MODELS, OUTPUT_DIR } from './oneOffEmb.js'; +/** + * Prints a formatted table of cosine similarity drift metrics to the console. + * The table shows average similarity scores and violation counts for each model, + * I/O type, and baseline combination over the specified time period. + * + * @param {string|number} arg - Number of days to look back for drift analysis (defaults to 30 if invalid) + * @throws {Error} If the log file doesn't exist or can't be parsed + */ export default async function printLogCLI(arg) { // Constants & CLI Args const logPath = path.join(OUTPUT_DIR, 'logs', 'COS_log.csv'); @@ -16,7 +30,7 @@ export default async function printLogCLI(arg) { throw new Error(`No log file not found at: ${logPath}`); } - // Declare header and row variables so they’re accessible later + // Declare header and row variables so they're accessible later let headers, rows; try { diff --git a/tkyo-drift/util/logPrintScalarCLI.js b/tkyo-drift/util/logPrintScalarCLI.js index 14e8c43..ae7209a 100644 --- a/tkyo-drift/util/logPrintScalarCLI.js +++ b/tkyo-drift/util/logPrintScalarCLI.js @@ -1,3 +1,9 @@ +/** + * Utility function to print scalar metric drift analysis in a formatted CLI table. + * This function reads scalar metric files, compares training and rolling distributions, + * and displays the results in a color-coded table showing drift metrics. 
+ */ + import fs from 'fs'; import path from 'path'; import chalk from 'chalk'; @@ -6,6 +12,14 @@ import { compareScalarDistributions } from './scalarCompare.js'; import { loadScalarMetrics } from './scalarLoadMetrics.js'; import { OUTPUT_DIR } from './oneOffEmb.js'; +/** + * Prints a formatted table of scalar metric drift analysis to the console. + * The table shows statistical comparisons between training and rolling data, + * including means, standard deviations, and Population Stability Index (PSI). + * + * @returns {Promise} + */ + export default async function printScalarCLI() { // Define the path to where scalar .jsonl files are stored const SCALAR_DIR = path.join(OUTPUT_DIR, 'scalars'); @@ -155,14 +169,27 @@ export default async function printScalarCLI() { } } - // Helper to color code regular values + /** + * Formats a numeric value with 2 decimal places. + * + * @param {number} val - The value to format + * @returns {string} - The formatted value in white + */ + function format(val) { if (typeof val !== 'number') return chalk.gray('n/a'); const formatted = val.toFixed(2); return chalk.white(formatted); } - // Helper to color code delta values by severity + /** + * Formats a delta value with color coding based on its z-score. + * + * @param {number} val - The delta value to format + * @param {number} std - The standard deviation to use for z-score calculation + * @returns {string} - The formatted value in green/yellow/red based on severity + */ + function formatDelta(val, std) { if (typeof val !== 'number') return chalk.gray('n/a'); const formatted = val.toFixed(2); @@ -174,7 +201,13 @@ export default async function printScalarCLI() { return chalk.red(formatted); // Drifted } - // Helper to color code PSI values by severity + /** + * Formats a PSI value with color coding based on drift severity. 
+ * + * @param {number} val - The PSI value to format + * @returns {string} - The formatted value in green/yellow/red based on severity + */ + function formatPSI(val) { if (typeof val !== 'number') return chalk.gray('n/a'); const formatted = val.toFixed(3); diff --git a/tkyo-drift/util/oneOffEmb.js b/tkyo-drift/util/oneOffEmb.js index 504957e..999889d 100644 --- a/tkyo-drift/util/oneOffEmb.js +++ b/tkyo-drift/util/oneOffEmb.js @@ -1,25 +1,69 @@ +/** + * Core module for drift analysis using transformer-based embeddings. + * This module provides the main pipeline for analyzing text drift using + * multiple models and metrics, including cosine similarity and euclidean distance. + */ + import fs from 'fs'; import path from 'path'; import { v4 } from 'uuid'; import { DriftModel } from './oneOffModel.js'; -import makeLogEntry from './logMakeEntry.js'; +import logMakeDriftEntry from './logMakeDriftEntry.js'; import makeErrorLogEntry from './logMakeError.js'; +import logMakeInputEntry from './logMakeInputEntry.js'; import captureSharedScalarMetrics from './scalarCaptureShared.js'; -// * Global Variables for the utilities -// Embedding Models +/** + * Available embedding models for drift analysis. + * @type {Object.} + */ + export const MODELS = { // t5: 'Xenova/sentence-t5-large', // bert: 'Xenova/sentence_bert', mini: 'Xenova/all-MiniLM-L12-v2', e5: 'Xenova/e5-base-v2', }; -// Log, Scalar, and Vector root output directory + +/** + * Root directory for all drift analysis data. + * Contains subdirectories for vectors, scalars, and logs. + * @type {string} + */ + export const OUTPUT_DIR = path.resolve('./tkyoData'); -// Cache of pipeline output results, to speed up model loading + +/** + * Cache for pipeline output results to speed up model loading. + * @type {Object} + */ + export const MODEL_CACHE = {}; -// * One Off Ingestion Pipeline Logic +/** + * Main function for performing drift analysis on a text input. 
+ * + * This function orchestrates the entire drift analysis pipeline: + * 1. Sets up necessary directories and validates model configuration + * 2. Initializes and loads transformer models + * 3. Generates embeddings for the input text + * 4. Computes scalar metrics (both shared and model-specific) + * 5. Saves embeddings and metrics to disk + * 6. Calculates drift metrics (cosine similarity and euclidean distance) + * 7. Logs all results + * + * @param {string} text - The text to analyze for drift + * @param {string} ioType - Type of input/output (e.g., 'input', 'output') + * @returns {Promise} + * + * @throws {Error} If model configuration is invalid + * @throws {Error} If there are issues with model construction or loading + * + * @example + * Analyze drift in an input text + * await tkyoDrift("Sample text to analyze", "input"); + */ + export default async function tkyoDrift(text, ioType) { // Stopwatch START 🏎️ // console.time('Drift Analyzer Full Run'); @@ -159,9 +203,10 @@ export default async function tkyoDrift(text, ioType) { // * Push the results to each log // Make shared ID and date for the cosine and Euclidean logs const sharedID = v4(); - makeLogEntry(sharedID, similarityResults, 'COS'); - makeLogEntry(sharedID, distanceResults, 'EUC'); - + logMakeDriftEntry(sharedID, similarityResults, 'COS'); + logMakeDriftEntry(sharedID, distanceResults, 'EUC'); + logMakeInputEntry(sharedID, text); + // ------------- << END try/catch Error Handling >> ------------- // * Push any errors to the error log // ! NOTE: This platform intentionally fails silently diff --git a/tkyo-drift/util/oneOffModel.js b/tkyo-drift/util/oneOffModel.js index 4838025..b1b73fc 100644 --- a/tkyo-drift/util/oneOffModel.js +++ b/tkyo-drift/util/oneOffModel.js @@ -1,13 +1,32 @@ +/** + * Core class for handling drift analysis using a single model. + * This class manages the lifecycle of a drift analysis model, including + * model loading, embedding generation, and drift metric computation. 
+ */ + import fs from 'fs'; import path from 'path'; import { error } from 'console'; +import { fileURLToPath } from 'url'; import fsPromises from 'fs/promises'; import { spawn } from 'child_process'; import { pipeline } from '@xenova/transformers'; import { OUTPUT_DIR, MODEL_CACHE } from './oneOffEmb.js'; -import { fileURLToPath } from 'url'; +/** + * Class representing a drift analysis model. + * Handles model initialization, embedding generation, and drift metric computation. + */ export class DriftModel { + /** + * Create a new DriftModel instance. + * + * @param {string} modelType - Type of model (e.g., 'mini', 'e5') + * @param {string} modelName - Name of the transformer model to use + * @param {string} ioType - Type of input/output (e.g., 'input', 'output') + * @param {string} baselineType - Type of baseline ('training' or 'rolling') + */ + constructor(modelType, modelName, ioType, baselineType) { this.baselineType = baselineType; this.modelType = modelType; @@ -25,7 +44,13 @@ export class DriftModel { this.embeddingFilePath = null; } - // * Function to set the file path + /** + * Set the file paths for embeddings and scalar metrics. + * Handles both regular and KMeans-based training files. + * + * @throws {Error} If there's an error setting the file paths + */ + setFilePaths() { try { // ?NOTE: training baselines may use KMeans files, which are handled inside the Python logic. @@ -68,7 +93,13 @@ export class DriftModel { } } - // * Function to load the embedding model + /** + * Load the embedding model using the Xenova transformer pipeline. + * Uses a global cache to avoid reloading the same model. + * + * @throws {Error} If there's an error loading the model + */ + async loadModel() { try { // Don't reload a model if it's loaded. @@ -93,7 +124,14 @@ export class DriftModel { } } - // * Function to make an embedding from an input/output pair + /** + * Generate an embedding for the given text. 
+ * Handles both short and long texts using chunking for texts that exceed token limits. + * + * @param {string} text - The text to generate an embedding for + * @throws {Error} If the text is invalid or embedding generation fails + */ + async makeEmbedding(text) { try { // Validate that the text is not null/undefined/empty @@ -184,7 +222,13 @@ export class DriftModel { } } - // * Function to Save Data to file path + /** + * Save the current embedding to a binary file. + * Only saves for rolling baselines, not training baselines. + * + * @throws {Error} If there's an error saving the embedding + */ + async saveToBin() { // Skip if training β€” this method is only for rolling baseline if (this.baselineType === 'training') return; @@ -273,7 +317,13 @@ export class DriftModel { } } - // * Function to read the contents of the Bins, Build an HNSW + /** + * Read embeddings from the binary file. + * Handles both regular and KMeans-based training files. + * + * @throws {Error} If there's an error reading the file + */ + async readFromBin() { // Full path to DriftModel.js const __filename = fileURLToPath(import.meta.url); @@ -357,7 +407,13 @@ export class DriftModel { } } - // * Function to get baseline value from vectorArray + /** + * Calculate the baseline embedding from the loaded vector array. + * Computes the mean of all vectors in the array. + * + * @throws {Error} If the vector array is not loaded or empty + */ + getBaseline() { try { // Check to make sure the vectorArray was correctly set in readFromBin @@ -407,7 +463,13 @@ export class DriftModel { } } - // * Function to get cosine similarity between baseline and embedding + /** + * Calculate the cosine similarity between the current embedding and baseline. 
+ * + * @returns {number} The cosine similarity score + * @throws {Error} If embeddings are not available + */ + getCosineSimilarity() { try { // Validate the embedding and baselines both exist @@ -448,7 +510,13 @@ export class DriftModel { } } - // * Function to calculate the euclidean distance from the baseline + /** + * Calculate the euclidean distance between the current embedding and baseline. + * + * @returns {number} The euclidean distance + * @throws {Error} If embeddings are not available + */ + getEuclideanDistance() { try { // Validate that the embedding and baselines exist @@ -485,7 +553,12 @@ export class DriftModel { } } - // * Function to siphon PSI distribution metrics + /** + * Capture model-specific scalar metrics for the current text. + * + * @param {string} text - The text to analyze + */ + captureModelSpecificScalarMetrics(text) { try { // Skip if training β€” this method is only for rolling baseline @@ -508,7 +581,12 @@ export class DriftModel { } } - // * Function to write model-specific scalar metrics to separate files + /** + * Save the captured scalar metrics to a JSONL file. + * + * @throws {Error} If there's an error saving the metrics + */ + async saveScalarMetrics() { // Skip if training β€” this method is only for rolling baseline if (this.baselineType === 'training') return; diff --git a/tkyo-drift/util/scalarCaptureShared.js b/tkyo-drift/util/scalarCaptureShared.js index 8300696..8c665c5 100644 --- a/tkyo-drift/util/scalarCaptureShared.js +++ b/tkyo-drift/util/scalarCaptureShared.js @@ -1,9 +1,24 @@ +/** + * Utility functions for capturing and computing shared scalar metrics for text analysis. + * These metrics include character length, entropy, word length, punctuation density, + * and uppercase ratio, which are stored in JSONL files for drift analysis. 
+ */ + import fsPromises from 'fs/promises'; import fs from 'fs'; import path from 'path'; import { OUTPUT_DIR } from './oneOffEmb.js'; -// Calculates the shared scalar values for a given input/output pair +/** + * Captures and stores shared scalar metrics for a given text input. + * The metrics are written to JSONL files in the scalars directory, + * with separate files for each metric type. + * + * @param {string} text - The text to analyze + * @param {string} ioType - The type of input/output (e.g., 'input', 'output') + * @returns {Promise} + */ + export default async function captureSharedScalarMetrics(text, ioType) { const timestamp = new Date().toISOString(); @@ -31,7 +46,18 @@ export default async function captureSharedScalarMetrics(text, ioType) { ); } -// Internal helper to calculate scalar metrics for a given string +/** + * Computes various scalar metrics for a given text string. + * + * @param {string} text - The text to analyze + * @returns {Object} An object containing the following metrics: + * - characterLength: Total number of characters + * - characterEntropy: Shannon entropy of character distribution + * - avgWordLength: Average length of words + * - punctuationDensity: Ratio of punctuation characters + * - uppercaseRatio: Ratio of uppercase letters + */ + function computeMetrics(text) { const metrics = {}; diff --git a/tkyo-drift/util/scalarCompare.js b/tkyo-drift/util/scalarCompare.js index b12eee5..ade1b26 100644 --- a/tkyo-drift/util/scalarCompare.js +++ b/tkyo-drift/util/scalarCompare.js @@ -1,4 +1,18 @@ -// * Function that compares the scalar distributions between rolling and training +/** + * Utility functions for comparing statistical distributions between training and rolling data. + * These functions calculate means, standard deviations, and Population Stability Index (PSI) + * to detect drift in scalar metrics. + */ + +/** + * Compares statistical distributions between training and rolling data sets. 
+ * For each shared metric, calculates means, standard deviations, and PSI. + * + * @param {Object} trainingMetrics - Object containing arrays of training data for each metric + * @param {Object} rollingMetrics - Object containing arrays of rolling data for each metric + * @returns {Object} Object containing statistical comparisons for each shared metric + */ + export function compareScalarDistributions(trainingMetrics, rollingMetrics) { const result = {}; @@ -37,12 +51,24 @@ export function compareScalarDistributions(trainingMetrics, rollingMetrics) { return result; } -// Helper: Mean +/** + * Calculates the arithmetic mean of an array of numbers. + * + * @param {number[]} arr - Array of numbers + * @returns {number} The mean value + */ + function mean(arr) { return arr.reduce((sum, val) => sum + val, 0) / arr.length; } -// Helper: Standard Deviation +/** + * Calculates the standard deviation of an array of numbers. + * + * @param {number[]} arr - Array of numbers + * @returns {number} The standard deviation + */ + function stddev(arr) { const avg = mean(arr); const variance = @@ -50,6 +76,16 @@ function stddev(arr) { return Math.sqrt(variance); } +/** + * Calculates the Population Stability Index (PSI) between two distributions. + * PSI measures the difference between two probability distributions. + * + * @param {number[]} train - Array of training data values + * @param {number[]} roll - Array of rolling data values + * @param {number} bins - Number of bins to use for distribution comparison (default: 10) + * @returns {number|null} The PSI value, or null if input is invalid + */ + function calculatePSI(train, roll, bins = 10) { if ( !Array.isArray(train) || diff --git a/tkyo-drift/util/scalarLoadMetrics.js b/tkyo-drift/util/scalarLoadMetrics.js index 164e5ef..34598a6 100644 --- a/tkyo-drift/util/scalarLoadMetrics.js +++ b/tkyo-drift/util/scalarLoadMetrics.js @@ -1,9 +1,27 @@ +/** + * Utility function for loading scalar metrics from JSONL files. 
+ * This function reads metric data from files, handles both model-specific and model-agnostic metrics, + * and supports hybrid mode for training data. + */ + import fs from 'fs'; import readline from 'readline'; import path from 'path'; import { OUTPUT_DIR } from './oneOffEmb.js'; -// * Function to read scalar metrics from the scalar jsonl files and group them by metric name +/** + * Loads scalar metrics from JSONL files and groups them by metric name. + * Supports both model-specific and model-agnostic metrics, and can operate in hybrid mode + * where training data is derived from rolling data. + * + * @param {string[]} metricNames - Array of metric names to load + * @param {string} ioType - Type of input/output (e.g., 'input', 'output') + * @param {string} baselineType - Type of baseline ('training' or 'rolling') + * @param {string|null} modelType - Optional model type for model-specific metrics + * @param {boolean} hybridMode - If true, uses rolling data as training data + * @returns {Promise} Object containing arrays of metric values keyed by metric name + */ + export async function loadScalarMetrics( metricNames, ioType, diff --git a/tkyo-drift/util/sharedHNSW.py b/tkyo-drift/util/sharedHNSW.py index 02b6e2e..dc4bc65 100644 --- a/tkyo-drift/util/sharedHNSW.py +++ b/tkyo-drift/util/sharedHNSW.py @@ -1,3 +1,9 @@ +""" +Module for performing nearest neighbor search using HNSW (Hierarchical Navigable Small World) algorithm. +This module provides functionality to find similar vectors in a dataset using approximate nearest neighbor search, +with special handling for both training and rolling baselines. +""" + # Numerical operations package import numpy as np # HNSW nearest neighbor search package @@ -16,6 +22,28 @@ import traceback def HNSW(io_type, model_type, query, baseline_type, file_path): + """ + Perform nearest neighbor search using HNSW algorithm. + + This function loads embeddings from a binary file and finds the k nearest neighbors + to the query vector. 
It handles both training and rolling baselines differently, + with special considerations for small datasets and KMeans centroids. + + Args: + io_type (str): Type of input/output (e.g., 'input', 'output') + model_type (str): Type of model (e.g., 'mini', 'e5') + query (str): JSON string containing the query vector + baseline_type (str): Type of baseline ('training' or 'rolling') + file_path (str): Path to the binary file containing embeddings + + Returns: + dict: A dictionary containing: + - centroids: List of nearest neighbor vectors + - distances: List of distances to nearest neighbors (None for small datasets) + + Raises: + ValueError: If query format is invalid or data size mismatch + """ # Parse the JSON query string into a numpy array try: @@ -24,6 +52,26 @@ def HNSW(io_type, model_type, query, baseline_type, file_path): raise ValueError("Invalid query format - must be JSON string") def load_embeddings(filename): + """ + Load embeddings from a binary file with header information. + + The binary file format is: + - First 8 bytes: Header containing num_vectors (4 bytes) and dims (4 bytes) + - Remaining bytes: Float32 array of embeddings + + Args: + filename (str): Path to the binary file + + Returns: + tuple: (reshaped_data, num_vectors, dims) + - reshaped_data: numpy array of shape (num_vectors, dims) + - num_vectors: Number of vectors in the file + - dims: Dimension of each vector + + Raises: + ValueError: If data size doesn't match header information + """ + # Loads the embeddings from the binary file with the header with open(filename, "rb") as f: # Read and parse header containing num_vector and dims From ace23c016495d1dd34172296a503c5b1061e0ae6 Mon Sep 17 00:00:00 2001 From: Ataraxist Date: Sun, 27 Apr 2025 16:10:53 -0700 Subject: [PATCH 5/6] In order to have a config setting that is toggleable, we need to modify a static file that contains the property. 
I tried to achieve this with a config.js file, but as soon as the context window closes, the toggle goes back to whatever value it had before. As a result, I tried creating a config JSON so that config settings are modifiable using the command line interface. This backfired as import type assertions for JSON files are not working in my environment, and I don't want to risk this being broken for other people in their deployments. As a result, the function is always enabled, but the config.js file will read env variables for text logging instead. --- tkyo-drift/config.js | 23 ++++ tkyo-drift/tkyoDrift.js | 126 ++++++++++-------- tkyo-drift/util/batchPythonHook.js | 4 +- tkyo-drift/util/logMakeDriftEntry.js | 8 +- .../{logMakeError.js => logMakeErrorEntry.js} | 6 +- tkyo-drift/util/logMakeInputEntry.js | 12 +- tkyo-drift/util/logPrintCosCLI.js | 6 +- tkyo-drift/util/logPrintScalarCLI.js | 8 +- tkyo-drift/util/oneOffEmb.js | 40 ++---- tkyo-drift/util/oneOffModel.js | 13 +- tkyo-drift/util/scalarCaptureShared.js | 6 +- tkyo-drift/util/scalarLoadMetrics.js | 19 ++- 12 files changed, 142 insertions(+), 129 deletions(-) create mode 100644 tkyo-drift/config.js rename tkyo-drift/util/{logMakeError.js => logMakeErrorEntry.js} (88%) diff --git a/tkyo-drift/config.js b/tkyo-drift/config.js new file mode 100644 index 0000000..effbd3a --- /dev/null +++ b/tkyo-drift/config.js @@ -0,0 +1,23 @@ +import path from 'path'; + +// TKYO Drift configuration file +// +// You can override the following settings using environment variables: +// - TEXT_LOGGING: Set to 'false' to disable text input logging (default: true) +// - OUTPUT_DIR: Set the output directory for all drift data (default: './tkyoData') +// +// The models object is static. To add or change models, edit this file directly. + +export const config = { + // List of transformer models to use for drift analysis. Edit this object to add/remove models. 
+ models: { + mini: 'Xenova/all-MiniLM-L12-v2', + e5: 'Xenova/e5-base-v2' + }, + + // Enable or disable logging of input text. Set TEXT_LOGGING=false in your environment to disable. + enableTextLogging: process.env.TEXT_LOGGING === 'false' ? false : true, + + // Output directory for all drift data. Set OUTPUT_DIR in your environment to override. + outputDir: path.resolve(process.env.OUTPUT_DIR || './tkyoData') +}; \ No newline at end of file diff --git a/tkyo-drift/tkyoDrift.js b/tkyo-drift/tkyoDrift.js index c3b8e88..7b4880e 100755 --- a/tkyo-drift/tkyoDrift.js +++ b/tkyo-drift/tkyoDrift.js @@ -11,7 +11,7 @@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@+:#@@@@@@=.-=+#++:..:=+*##*=..*%%@@@@%#=:@@@@@@@@@@%**@@@@@@@@@@@%#+=-::..::-==+**######%%%%%%%@@@@@@@@@@@@@@@@@@@@@@@@@@@@%%#:.-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@@@@@@@@@@@@@@@@@@+%@@@@@@@@@@:.-:=#@@@@@@@@@@@@@%%=%@@@@@@%*.:-*%%%%*-*+=-:.:-=+*#%%######%%%%###***+++=====--------======+++++++****####%%%%@@%%#=--:+@@@@@@@@@@@@@@@@@@@@@@@=-@@@@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@+*@@@@@@@@@@@@@@@@@@@@@%%%@@@@@@%-%%%+..=#%%#####*+=--=+**##%%%%%#*++=--::.........................................:%@%.....:...:::-=+*#%@@@@@@@@@**:@@@@@@@@@@@@@@@@@@@@@@@ -@@@@@@@@@#%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@%%%@@@@@%#%#:-%####%%#**+=-:. -%#. :.....::::::::::::::::::-*.@@@@@@@@@@@@@@@@@@@@@@@ +@@@@@#%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@%%%@@@@@%#%#:-%####%%#**+=-:. -%#. :.....::::::::::::::::::-*.@@@@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@@.#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@%%@@%@@@%%=:%%*.. .%@:..:.....::::::::::::::::=-:@@@@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@@@=-@@*@@@@@@@@@@@@@@@@@@@@@%*==-:.-#@@@@@@@@@@@@@@@@@@@@@%:#%+.. .************=:*****:.=****+-+****===*####=--+#%@@@@%#*::::...........%%:..-. :@@@@@@@@@@@@@@@==-@@@@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@@@@=--@@@@@@@@@@@@@@@@@@@@@@@@@@@@@%+.*@@@@@@@@@@@@@@@@@@=-%%.. 
......:@@@@@@@@@@@@%.%@@@@=+@@@@%:.*@@@@#:.#@@@@+.%@@@@@@@@@@@@:.... --.=.#@-..:: +@@@@@@@@@@@@@+=+@@@@@@@@@@@@@@@@@@@@@@@ @@ -68,6 +68,7 @@ const [command, ...rest] = process.argv.slice(2); * - cos: Show cosine similarity drift logs * - scalar: Show scalar metric drift comparison * - train: Process training data and update baselines + * - inputs: Toggle text input logging * * @example * Show cosine similarity drift for last 30 days @@ -78,61 +79,64 @@ const [command, ...rest] = process.argv.slice(2); * * Process training data * tkyo train ./data input input + * + * Toggle text input logging + * tkyo inputs */ -// Only run if the command is a "tkyo" command -// if (process.argv[1] === new URL(import.meta.url).pathname) { // ! Alternative, ESM Based -if (process.argv[1].endsWith('tkyo')) { - // switch case to determine which file to invoke - switch (command) { - // ? tkyo cos - case 'cos': { - const dayArgument = rest[0] || '30'; - process.argv = ['node', 'printLogCLI.js', dayArgument]; - await printLogCLI(dayArgument); - break; - } - - // ? tkyo scalar - case 'scalar': { - await printScalarCLI(); - break; - } +async function main() { + // Only run if the command is a "tkyo" command + if (process.argv[1].endsWith('tkyo')) { + // switch case to determine which file to invoke + switch (command) { + // ? tkyo cos + case 'cos': { + const dayArgument = rest[0] || '30'; + process.argv = ['node', 'printLogCLI.js', dayArgument]; + await printLogCLI(dayArgument); + break; + } - // ? tkyo train - case 'train': { - const [pathToData, columnName, ioType] = rest; + // ? tkyo scalar + case 'scalar': { + await printScalarCLI(); + break; + } - // Error handle when - if (!pathToData || !columnName || !ioType) { - console.error( - chalk.blueBright( - 'Usage: tkyo train ' - ) + // ? 
tkyo train + case 'train': { + const [pathToData, columnName, ioType] = rest; + + // Error handle when the user doesn't provide the correct arguments + if (!pathToData || !columnName || !ioType) { + console.error( + chalk.blueBright( + 'Usage: tkyo train ' + ) + ); + process.exit(1); + } + + // If someone calls the train command, we normalize the path. + const normalizedPath = path.resolve( + process.cwd(), + pathToData.replace(/\\/g, '/') ); - process.exit(1); - } - // If someone calls the train command, we normalize the path. - const normalizedPath = path.resolve( - process.cwd(), - pathToData.replace(/\\/g, '/') - ); + // Error handle when the path does not exist. + if (!fs.existsSync(normalizedPath)) { + console.error(chalk.red(`The dataSetPath provided does not exist.`)); + } - // Error handle when the path does not exist. - if (!fs.existsSync(normalizedPath)) { - console.error(chalk.red(`The dataSetPath provided does not exist.`)); + await tkyoDriftSetTrainingHook(normalizedPath, columnName, ioType); + console.log(chalk.green("Job's done.")); + break; } - await tkyoDriftSetTrainingHook(normalizedPath, columnName, ioType); - console.log(chalk.green("Job's done.")); - break; - } - - // ? help commands - default: - console.log( - chalk.gray(` + // ? 
help commands + default: + console.log( + chalk.gray(` ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ ↗↑↑ ↗↓↓↓↗ ↓↓↓ ↓↓↓ ↓↓↓↓↓↓↓↓↓↓↓↓↖ ↗↑↑ ↗↑↑ ↗↑↑↑ ↗↑↑ ↗↑↑ ↗↑↑↑ ↖↑↑ ↗↑↑ ↗↑↑ ↗↑↑↑ ↗↑↑ ↗↑↑ ↗↑↑ ↖↑↑ @@ -144,23 +148,31 @@ if (process.argv[1].endsWith('tkyo')) { Usage: ${chalk.yellowBright('tkyo')} ${chalk.white('cos')} ${chalk.blueBright( - '' - )} Show COS Drift logs for last N days + '' + )} Show COS Drift logs for last N days ${chalk.yellowBright('tkyo')} ${chalk.white( - 'scalar' - )} Show scalar drift comparison + 'scalar' + )} Show scalar drift comparison ${chalk.yellowBright('tkyo')} ${chalk.white('train')} ${chalk.blueBright( - ' ' - )} Embed dataset and update training baseline + ' ' + )} Embed dataset and update training baseline + +${chalk.cyanBright('Environment variables:')} + ${chalk.white('TEXT_LOGGING')} Set to 'false' to disable text input logging (default: true) + ${chalk.white('OUTPUT_DIR')} Set the output directory for all drift data (default: ./tkyoData) + ${chalk.white('You can also update the embedding models in the config.js file')} Readme docs are in the node package or at ${chalk.blueBright( - 'https://github.com/oslabs-beta/tkyo-drift' - )} - `) - ); + 'https://github.com/oslabs-beta/tkyo-drift' + )} + `) + ); + } } } +main(); + /** * Export the main drift analysis function for programmatic use. * This allows the drift analysis functionality to be used as a library diff --git a/tkyo-drift/util/batchPythonHook.js b/tkyo-drift/util/batchPythonHook.js index a9d4336..58ee4ef 100644 --- a/tkyo-drift/util/batchPythonHook.js +++ b/tkyo-drift/util/batchPythonHook.js @@ -4,10 +4,10 @@ * of training data and embeddings. 
*/ -import { spawn } from 'child_process'; +import fs from 'fs'; import path from 'path'; import { fileURLToPath } from 'url'; -import fs from 'fs'; +import { spawn } from 'child_process'; // Full path to tkyoDriftSetTrainingHook.js const __filename = fileURLToPath(import.meta.url); diff --git a/tkyo-drift/util/logMakeDriftEntry.js b/tkyo-drift/util/logMakeDriftEntry.js index 433e41b..d83d89b 100644 --- a/tkyo-drift/util/logMakeDriftEntry.js +++ b/tkyo-drift/util/logMakeDriftEntry.js @@ -6,7 +6,7 @@ import fs from 'fs'; import path from 'path'; -import { OUTPUT_DIR } from './oneOffEmb.js'; +import { config } from '../config.js'; /** * Creates or appends a log entry for drift metrics to the appropriate CSV file. @@ -19,9 +19,9 @@ export default function makeLogEntry(id, mathObject, type) { let logPath = ''; // Construct the destination to the log in the data folder if (type === 'COS') { - logPath = path.join(OUTPUT_DIR, 'logs', 'COS_log.csv'); - } else { - logPath = path.join(OUTPUT_DIR, 'logs', 'EUC_log.csv'); + logPath = path.join(config.outputDir, 'logs', 'COS_log.csv'); + } else if (type === 'EUC') { + logPath = path.join(config.outputDir, 'logs', 'EUC_log.csv'); } // Create a timestamp diff --git a/tkyo-drift/util/logMakeError.js b/tkyo-drift/util/logMakeErrorEntry.js similarity index 88% rename from tkyo-drift/util/logMakeError.js rename to tkyo-drift/util/logMakeErrorEntry.js index 0c4e313..29890e5 100644 --- a/tkyo-drift/util/logMakeError.js +++ b/tkyo-drift/util/logMakeErrorEntry.js @@ -6,7 +6,7 @@ import fs from 'fs'; import path from 'path'; -import { OUTPUT_DIR } from './oneOffEmb.js'; +import { config } from '../config.js'; /** * Creates or appends an error log entry to the error log CSV file. 
@@ -14,8 +14,8 @@ import { OUTPUT_DIR } from './oneOffEmb.js'; * @param {Error} error - The error object to be logged */ export default function makeErrorLogEntry(error) { - // Build path to error log - const logPath = path.join(OUTPUT_DIR, 'logs', 'ERR_log.csv'); + // Construct the path to the error log file + const logPath = path.join(config.outputDir, 'logs', 'ERR_log.csv'); // Create a timestamp for when the error occurred const timestamp = new Date().toISOString(); diff --git a/tkyo-drift/util/logMakeInputEntry.js b/tkyo-drift/util/logMakeInputEntry.js index d486894..fd65363 100644 --- a/tkyo-drift/util/logMakeInputEntry.js +++ b/tkyo-drift/util/logMakeInputEntry.js @@ -7,7 +7,7 @@ import fs from 'fs'; import path from 'path'; -import { OUTPUT_DIR } from './oneOffEmb.js'; +import { config } from '../config.js'; /** * Escapes a string for CSV format by: @@ -31,21 +31,13 @@ function escapeCSV(str) { * @param {string} text - The actual text content being analyzed */ export default function logMakeInputEntry(id, text) { - // Construct the path to the text log file in the logs directory - const logPath = path.join(OUTPUT_DIR, 'logs', 'text_log.csv'); - - // Generate an ISO timestamp for when this entry is being made + const logPath = path.join(config.outputDir, 'logs', 'text_log.csv'); const timestamp = new Date().toISOString(); - - // Create the CSV row with ID, timestamp, and escaped text content - // ID and timestamp don't need escaping as they won't contain special characters const row = [id, timestamp, escapeCSV(text)]; const csvLine = row.join(',') + '\n'; - // Check if the log file already exists const fileExists = fs.existsSync(logPath); - // Write to the log file try { if (!fileExists) { // If the file doesn't exist, create it with headers diff --git a/tkyo-drift/util/logPrintCosCLI.js b/tkyo-drift/util/logPrintCosCLI.js index dd27b60..0bc1a26 100644 --- a/tkyo-drift/util/logPrintCosCLI.js +++ b/tkyo-drift/util/logPrintCosCLI.js @@ -8,7 +8,7 @@ import fs 
from 'fs'; import path from 'path'; import chalk from 'chalk'; import Table from 'cli-table3'; -import { MODELS, OUTPUT_DIR } from './oneOffEmb.js'; +import { config } from '../config.js'; /** * Prints a formatted table of cosine similarity drift metrics to the console. @@ -20,7 +20,7 @@ import { MODELS, OUTPUT_DIR } from './oneOffEmb.js'; */ export default async function printLogCLI(arg) { // Constants & CLI Args - const logPath = path.join(OUTPUT_DIR, 'logs', 'COS_log.csv'); + const logPath = path.join(config.outputDir, 'logs', 'COS_log.csv'); const days = isNaN(parseInt(arg)) ? 30 : parseInt(arg); const driftThreshold = 0.8; const startTime = Date.now() - days * 86400000; // milliseconds in a day @@ -98,7 +98,7 @@ export default async function printLogCLI(arg) { // Build the table rows by model type, io type, and baseline type for (const ioType of ioTypes) { - for (const [modelType] of Object.entries(MODELS)) { + for (const [modelType] of Object.entries(config.models)) { for (const baselineType of baselineTypes) { const columnHeader = `${modelType.toUpperCase()} ${baselineType.toUpperCase()} COS`; const colIndex = headers.indexOf(columnHeader); diff --git a/tkyo-drift/util/logPrintScalarCLI.js b/tkyo-drift/util/logPrintScalarCLI.js index ae7209a..f23f788 100644 --- a/tkyo-drift/util/logPrintScalarCLI.js +++ b/tkyo-drift/util/logPrintScalarCLI.js @@ -8,9 +8,9 @@ import fs from 'fs'; import path from 'path'; import chalk from 'chalk'; import Table from 'cli-table3'; -import { compareScalarDistributions } from './scalarCompare.js'; +import { config } from '../config.js'; import { loadScalarMetrics } from './scalarLoadMetrics.js'; -import { OUTPUT_DIR } from './oneOffEmb.js'; +import { compareScalarDistributions } from './scalarCompare.js'; /** * Prints a formatted table of scalar metric drift analysis to the console. 
@@ -21,8 +21,8 @@ import { OUTPUT_DIR } from './oneOffEmb.js'; */ export default async function printScalarCLI() { - // Define the path to where scalar .jsonl files are stored - const SCALAR_DIR = path.join(OUTPUT_DIR, 'scalars'); + // Construct the path to the scalar metrics directory + const SCALAR_DIR = path.join(config.outputDir, 'scalars'); // Define warning boolean to console log a warning if we are in hybrid mode let warn = false; diff --git a/tkyo-drift/util/oneOffEmb.js b/tkyo-drift/util/oneOffEmb.js index 999889d..d3a50ae 100644 --- a/tkyo-drift/util/oneOffEmb.js +++ b/tkyo-drift/util/oneOffEmb.js @@ -7,34 +7,16 @@ import fs from 'fs'; import path from 'path'; import { v4 } from 'uuid'; +import { config } from '../config.js'; import { DriftModel } from './oneOffModel.js'; +import makeErrorLogEntry from './logMakeErrorEntry.js'; import logMakeDriftEntry from './logMakeDriftEntry.js'; -import makeErrorLogEntry from './logMakeError.js'; import logMakeInputEntry from './logMakeInputEntry.js'; import captureSharedScalarMetrics from './scalarCaptureShared.js'; -/** - * Available embedding models for drift analysis. - * @type {Object.} - */ - -export const MODELS = { - // t5: 'Xenova/sentence-t5-large', - // bert: 'Xenova/sentence_bert', - mini: 'Xenova/all-MiniLM-L12-v2', - e5: 'Xenova/e5-base-v2', -}; - -/** - * Root directory for all drift analysis data. - * Contains subdirectories for vectors, scalars, and logs. - * @type {string} - */ - -export const OUTPUT_DIR = path.resolve('./tkyoData'); - /** * Cache for pipeline output results to speed up model loading. + * This is used to prevent reloading models on each request in warm environments. * @type {Object} */ @@ -78,20 +60,20 @@ export default async function tkyoDrift(text, ioType) { try { // ------------- << Make Directories >> ------------- // Check if directory exists, if not, make it. 
- if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); + if (!fs.existsSync(config.outputDir)) { + fs.mkdirSync(config.outputDir, { recursive: true }); } // Create subdirectories for vectors, scalars, and logs for (const dir of subdirectories) { - const subdirPath = path.join(OUTPUT_DIR, dir); + const subdirPath = path.join(config.outputDir, dir); if (!fs.existsSync(subdirPath)) { fs.mkdirSync(subdirPath, { recursive: true }); } } // Validate model config (we need the / and it's gotta be a string) - for (const [type, name] of Object.entries(MODELS)) { + for (const [type, name] of Object.entries(config.models)) { if (typeof name !== 'string' || !name.includes('/')) { throw new Error( `Invalid or missing model ID for "${type}" model: "${name}"` @@ -102,7 +84,7 @@ export default async function tkyoDrift(text, ioType) { // ------------- << Construct Model Combinations >> ------------- try { // * For each model, for each baselineType, make a model and assign to driftModels object - for (const [modelType, modelName] of Object.entries(MODELS)) { + for (const [modelType, modelName] of Object.entries(config.models)) { for (const baselineType of baselineTypes) { const key = `${modelType}.${ioType}.${baselineType}`; driftModels[key] = new DriftModel( @@ -205,7 +187,11 @@ export default async function tkyoDrift(text, ioType) { const sharedID = v4(); logMakeDriftEntry(sharedID, similarityResults, 'COS'); logMakeDriftEntry(sharedID, distanceResults, 'EUC'); - logMakeInputEntry(sharedID, text); + + // Log the input text if logging is enabled + if (config.enableTextLogging) { + logMakeInputEntry(sharedID, text); + } // ------------- << END try/catch Error Handling >> ------------- // * Push any errors to the error log diff --git a/tkyo-drift/util/oneOffModel.js b/tkyo-drift/util/oneOffModel.js index b1b73fc..2fede7e 100644 --- a/tkyo-drift/util/oneOffModel.js +++ b/tkyo-drift/util/oneOffModel.js @@ -9,9 +9,10 @@ import path from 'path'; import { error 
} from 'console'; import { fileURLToPath } from 'url'; import fsPromises from 'fs/promises'; +import { config } from '../config.js'; import { spawn } from 'child_process'; +import { MODEL_CACHE } from './oneOffEmb.js'; import { pipeline } from '@xenova/transformers'; -import { OUTPUT_DIR, MODEL_CACHE } from './oneOffEmb.js'; /** * Class representing a drift analysis model. @@ -60,14 +61,14 @@ export class DriftModel { const baseName = `${this.modelType}.${this.ioType}.${this.baselineType}`; // Assemble the embedding file path (.bin file) - const vectorPath = path.join(OUTPUT_DIR, 'vectors', `${baseName}.bin`); + const vectorPath = path.join(config.outputDir, 'vectors', `${baseName}.bin`); const vectorKmeansPath = path.join( - OUTPUT_DIR, + config.outputDir, 'vectors', `${baseName}.kmeans.bin` ); const fallbackPath = path.join( - OUTPUT_DIR, + config.outputDir, 'vectors', `${this.modelType}.${this.ioType}.rolling.bin` ); @@ -82,7 +83,7 @@ export class DriftModel { // Scalar metric path (.scalar.jsonl) this.scalarFilePath = path.join( - OUTPUT_DIR, + config.outputDir, 'scalars', `${baseName}.scalar.jsonl` ); @@ -605,7 +606,7 @@ export class DriftModel { // Construct the file path using: ioType.metric.modelType.baselineType.scalar.jsonl // Example: input.norm.semantic.rolling.scalar.jsonl const filePath = path.join( - OUTPUT_DIR, + config.outputDir, 'scalars', `${this.ioType}.${metric}.${this.modelType}.rolling.scalar.jsonl` ); diff --git a/tkyo-drift/util/scalarCaptureShared.js b/tkyo-drift/util/scalarCaptureShared.js index 8c665c5..9a4718e 100644 --- a/tkyo-drift/util/scalarCaptureShared.js +++ b/tkyo-drift/util/scalarCaptureShared.js @@ -4,10 +4,10 @@ * and uppercase ratio, which are stored in JSONL files for drift analysis. 
*/ -import fsPromises from 'fs/promises'; import fs from 'fs'; import path from 'path'; -import { OUTPUT_DIR } from './oneOffEmb.js'; +import fsPromises from 'fs/promises'; +import { config } from '../config.js'; /** * Captures and stores shared scalar metrics for a given text input. @@ -28,7 +28,7 @@ export default async function captureSharedScalarMetrics(text, ioType) { Object.entries(metricSet).map(([metric, value]) => { // Construct the file path const filePath = path.join( - OUTPUT_DIR, + config.outputDir, 'scalars', `${ioType}.${metric}.rolling.scalar.jsonl` ); diff --git a/tkyo-drift/util/scalarLoadMetrics.js b/tkyo-drift/util/scalarLoadMetrics.js index 34598a6..d308dff 100644 --- a/tkyo-drift/util/scalarLoadMetrics.js +++ b/tkyo-drift/util/scalarLoadMetrics.js @@ -5,9 +5,9 @@ */ import fs from 'fs'; -import readline from 'readline'; import path from 'path'; -import { OUTPUT_DIR } from './oneOffEmb.js'; +import readline from 'readline'; +import { config } from '../config.js'; /** * Loads scalar metrics from JSONL files and groups them by metric name. @@ -32,22 +32,23 @@ export async function loadScalarMetrics( ) { const metrics = {}; // this will hold the final merged metric data + // Use config.outputDir instead of OUTPUT_DIR + const scalarDir = path.join(config.outputDir, 'scalars'); + for (const metric of metricNames) { let filePath; // Configure file path based on model type first if (modelType) { filePath = path.join( - OUTPUT_DIR, - 'scalars', // ? If the scalar metric is model specific, this will catch it (when this function gets invoked with a model value) + scalarDir, `${ioType}.${metric}.${modelType}.${baselineType}.scalar.jsonl` ); } else { filePath = path.join( - OUTPUT_DIR, - 'scalars', // ? 
Otherwise, the scalar metric will come from a model agnostic file + scalarDir, `${ioType}.${metric}.${baselineType}.scalar.jsonl` ); } @@ -56,16 +57,14 @@ export async function loadScalarMetrics( if (hybridMode) { if (modelType) { filePath = path.join( - OUTPUT_DIR, - 'scalars', // ? If the scalar metric is model specific, this will catch it (when this function gets invoked with a model value) + scalarDir, `${ioType}.${metric}.${modelType}.rolling.scalar.jsonl` ); } else { filePath = path.join( - OUTPUT_DIR, - 'scalars', // ? Otherwise, the scalar metric will come from a model agnostic file + scalarDir, `${ioType}.${metric}.rolling.scalar.jsonl` ); } From 8eac69741befa151651fb9ff1158164436dac75f Mon Sep 17 00:00:00 2001 From: Ataraxist Date: Sun, 27 Apr 2025 16:20:24 -0700 Subject: [PATCH 6/6] Moved config to root, and updated the readme with what env variables are pre-supported. --- tkyo-drift/README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tkyo-drift/README.md b/tkyo-drift/README.md index 8e961fa..1ecca99 100644 --- a/tkyo-drift/README.md +++ b/tkyo-drift/README.md @@ -136,6 +136,22 @@ You can interact with this library in a couple ways; There is also a small training file downloader script in the util folder called downloadTrainingData.py that you can run to grab the training data from hugging face if you happen to be using a model for your workflow from there. +## Configuration via Environment Variables + +TKYO Drift supports configuration via environment variables for deployment flexibility. You can set the following variables: + +- `TEXT_LOGGING`: Set to `false` to disable logging of input text. Default is `true`. +- `OUTPUT_DIR`: Set the output directory for all drift data. Default is `./tkyoData`. + +Example usage (in your shell or `.env` file): + +```bash +export TEXT_LOGGING=false +export OUTPUT_DIR=/custom/path/for/tkyoData +``` + +If not set, the defaults in `config.js` will be used. 
+ ## One-off Ingestion Usage: Add `tkyoDrift.js(text, ioType)` in your file, along with an import statement. @@ -308,7 +324,7 @@ Again, the second argument is the key for the object you would like to embed and ## Logging -Results are stored in two CSV files (`COS_log.csv` & `EUC_log.csv`) with dynamic headers. Each one-off run appends one row to each file. Keep in mind that training data is not added to the log, as the assumption is that your training baseline is what we compare against to measure drift. +Results are stored in three CSV files (`COS_log.csv`, `EUC_log.csv` & `text_log.csv`) with dynamic headers. Each one-off run appends one row to each file. Keep in mind that training data is not added to the log, as the assumption is that your training baseline is what we compare against to measure drift. ### Format @@ -324,6 +340,12 @@ For the euclidean distance log: ID, TIMESTAMP, I/O TYPE, SEMANTIC ROLLING EUC, SEMANTIC TRAINING EUC, CONCEPT ROLLING EUC... ``` +For the text input log: + +``` +ID, TIMESTAMP, TEXT +``` + - Cosine similarities and euclidean distances are recorded per model and baseline type. - Additional metadata like ioType, date and UUIDs are included for tracking. - Text inputs are logged in a separate `text_log.csv` file for debugging and analysis purposes. This is separate from the drift calculation logs and binary files.