1
1
/*
2
- We initially just implement some very simple rate limitations to prevent very
3
- blatant abuse.
4
-
5
- - at most $10^5$ tokens per signed in user per hour \(that's \$0.20\); that allows for major usage...
6
- but if somebody tried to do something really abusive, it would stop it. Nobody
7
- would hit this in practice unless they are really trying to abuse cocalc...
8
- WRONG: it's very easy to hit this due to large inputs, e.g., analyzing a paper.
9
- - at most $10^6$ tokens per hour across all users \-\- that's \$2/hour. That would
10
- come out to a bit more if sustained than my budget, but allows for bursts.
11
-
12
- See https://help.openai.com/en/articles/7039783-chatgpt-api-faq for the upstream rate limits,
13
- where they limit per minute, not per hour (like below):
14
-
15
- What's the rate limits for the ChatGPT API?
16
-
17
- Free trial users: 20 RPM 40000 TPM
18
- Pay-as-you-go users (first 48 hours): 60 RPM 60000 TPM
19
- Pay-as-you-go users (after 48 hours): 3500 RPM 90000 TPM
20
-
21
- RPM = requests per minute
22
- TPM = tokens per minute
2
+ This is a basic rate limitation for free and metered usage of LLMs.
3
+ - any call must be identified by an account (we previously identified calls by a token, but that got abused)
4
+ - There is a distinction between "cocalc.com" and "on-prem":
5
+ - cocalc.com has some models (the more expensive ones) which are metered per token and some which are free
6
+ - on-prem: there is only rate limiting, no metered usage
7
+ - quotas are adjustable
8
+ - at its core, this should limit individual users from too much free usage, and overall cap the usage
9
+ - monitoring as necessary, to give feedback for tweaking the parameters
23
10
*/
24
11
25
- import { newCounter , newHistogram } from "@cocalc/backend/metrics" ;
12
+ import { isObject } from "lodash" ;
13
+
14
+ import { newCounter , newGauge } from "@cocalc/backend/metrics" ;
26
15
import { process_env_int } from "@cocalc/backend/misc" ;
27
16
import getPool , { CacheTime } from "@cocalc/database/pool" ;
28
17
import { getServerSettings } from "@cocalc/database/settings" ;
@@ -41,7 +30,6 @@ import {
41
30
} from "@cocalc/util/db-schema/llm-utils" ;
42
31
import { KUCALC_COCALC_COM } from "@cocalc/util/db-schema/site-defaults" ;
43
32
import { isValidUUID } from "@cocalc/util/misc" ;
44
- import { isObject } from "lodash" ;
45
33
46
34
// These are tokens over a given period of time – summed by account/analytics_cookie or global.
47
35
const QUOTAS = {
@@ -50,18 +38,11 @@ const QUOTAS = {
50
38
global : process_env_int ( "COCALC_LLM_QUOTA_GLOBAL" , 10 ** 6 ) ,
51
39
} as const ;
52
40
53
- const prom_quotas = newHistogram (
41
+ const prom_quotas = newGauge (
54
42
"llm" ,
55
- "abuse_usage" ,
56
- "Language model abuse usage" ,
57
- {
58
- buckets :
59
- // 10 buckets evenly spaced from 0 to QUOTAS.global
60
- Array . from ( { length : 10 } , ( _ , i ) =>
61
- Math . floor ( ( i * QUOTAS . global ) / 10 ) ,
62
- ) ,
63
- labels : [ "usage" ] ,
64
- } ,
43
+ "abuse_usage_pct" ,
44
+ "Language model abuse, 0 to 100 percent of limit" ,
45
+ [ "quota" ] ,
65
46
) ;
66
47
67
48
const prom_rejected = newCounter (
@@ -122,7 +103,7 @@ export async function checkForAbuse({
122
103
analytics_cookie,
123
104
} ) ;
124
105
125
- prom_quotas . labels ( "recent " ) . observe ( usage ) ;
106
+ prom_quotas . labels ( "account " ) . set ( 100 * ( usage / QUOTAS . account ) ) ;
126
107
127
108
// console.log("usage = ", usage);
128
109
if ( account_id ) {
@@ -146,7 +127,7 @@ export async function checkForAbuse({
146
127
// Prevent more sophisticated abuse, e.g., changing analytics_cookie or account frequently,
147
128
// or just a general huge surge in usage.
148
129
const overallUsage = await recentUsage ( { cache : "long" , period : "1 hour" } ) ;
149
- prom_quotas . labels ( "global" ) . observe ( overallUsage ) ;
130
+ prom_quotas . labels ( "global" ) . set ( 100 * ( overallUsage / QUOTAS . global ) ) ;
150
131
// console.log("overallUsage = ", usage);
151
132
if ( overallUsage > QUOTAS . global ) {
152
133
prom_rejected . labels ( "global" ) . inc ( ) ;
0 commit comments