
Commit dce07fe

Merge pull request #7643 from sagemathinc/llm-tweak-abuse
llm/abuse improvements
2 parents 4e2f8c0 + 51e77b9 commit dce07fe


src/packages/server/llm/abuse.ts

Lines changed: 30 additions & 43 deletions
@@ -1,28 +1,17 @@
 /*
-We initially just implement some very simple rate limitations to prevent very
-blatant abuse.
-
-- at most 10^5 tokens per signed in user per hour (that's $0.20); that allows for major usage...
-  but if somebody tried to do something really abusive, it would stop it. Nobody
-  would hit this in practice unless they are really trying to abuse cocalc...
-  WRONG: it's very easy to hit this due to large inputs, e.g., analyzing a paper.
-- at most 10^6 tokens per hour across all users -- that's $2/hour. That would
-  come out to a bit more if sustained than my budget, but allows for bursts.
-
-See https://help.openai.com/en/articles/7039783-chatgpt-api-faq for the upstream rate limits,
-where they limit per minute, not per hour (like below):
-
-What's the rate limits for the ChatGPT API?
-
-Free trial users: 20 RPM 40000 TPM
-Pay-as-you-go users (first 48 hours): 60 RPM 60000 TPM
-Pay-as-you-go users (after 48 hours): 3500 RPM 90000 TPM
-
-RPM = requests per minute
-TPM = tokens per minute
+This is a basic rate limitation for free and metered usage of LLMs.
+- any call must be identified by an account (we had just a cookie ID, but it got abused, hence noAccount=0)
+- There is a distinction between "cocalc.com" and "on-prem":
+  - cocalc.com has some models (the more expensive ones) which are metered per token, and some which are free
+  - on-prem: there is only rate limiting, no metered usage
+- quotas are adjustable
+- at its core, this should limit individual users from too much free usage, and overall cap the usage
+- monitoring as necessary, to give feedback for tweaking the parameters
 */

-import { newCounter, newHistogram } from "@cocalc/backend/metrics";
+import { isObject } from "lodash";
+
+import { newCounter, newGauge, newHistogram } from "@cocalc/backend/metrics";
 import { process_env_int } from "@cocalc/backend/misc";
 import getPool, { CacheTime } from "@cocalc/database/pool";
 import { getServerSettings } from "@cocalc/database/settings";
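The "quotas are adjustable" bullet refers to environment overrides such as COCALC_LLM_QUOTA_GLOBAL in the QUOTAS definition below. As a rough sketch of what a helper like process_env_int presumably does (the real implementation lives in @cocalc/backend/misc and may differ in details):

    // Sketch only: read an integer override from the environment, else use the default.
    function process_env_int(name: string, fallback: number): number {
      const raw = process.env[name];
      if (raw == null) return fallback;
      const parsed = parseInt(raw, 10);
      return Number.isFinite(parsed) ? parsed : fallback;
    }

    // e.g. COCALC_LLM_QUOTA_GLOBAL=2000000 would double the default global cap of 10 ** 6 tokens/hour.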
@@ -41,7 +30,7 @@ import {
 } from "@cocalc/util/db-schema/llm-utils";
 import { KUCALC_COCALC_COM } from "@cocalc/util/db-schema/site-defaults";
 import { isValidUUID } from "@cocalc/util/misc";
-import { isObject } from "lodash";
+import isValidAccount from "../accounts/is-valid-account";

 // These are tokens over a given period of time – summed by account/analytics_cookie or global.
 const QUOTAS = {
@@ -50,18 +39,18 @@ const QUOTAS = {
   global: process_env_int("COCALC_LLM_QUOTA_GLOBAL", 10 ** 6),
 } as const;

-const prom_quotas = newHistogram(
+const prom_quota_global = newGauge(
+  "llm",
+  "abuse_usage_global_pct",
+  "Language model abuse limit, global, 0 to 100 percent of limit, rounded",
+  ["quota"],
+);
+
+const prom_quota_per_account = newHistogram(
   "llm",
-  "abuse_usage",
-  "Language model abuse usage",
-  {
-    buckets:
-      // 10 buckets evenly spaced from 0 to QUOTAS.global
-      Array.from({ length: 10 }, (_, i) =>
-        Math.floor((i * QUOTAS.global) / 10),
-      ),
-    labels: ["usage"],
-  },
+  "abuse_usage_account_pct",
+  "Language model usage per account, to see if users reach certain thresholds for their account usage.",
+  { buckets: [25, 50, 75, 100, 110] },
 );

 const prom_rejected = newCounter(
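For readers less familiar with Prometheus metric types: the rewrite splits one histogram into a gauge (a single current value, suited to "how close is global usage to the cap right now") and a percent-bucket histogram (a distribution, suited to "how often do accounts reach 25/50/75/100/110% of their quota"). Expressed directly with prom-client, which the newGauge/newHistogram helpers presumably wrap (the metric names and wiring here are assumptions):

    import { Gauge, Histogram } from "prom-client";

    // Gauge: one current value, the global usage as a rounded percent of the cap.
    const quotaGlobal = new Gauge({
      name: "llm_abuse_usage_global_pct",
      help: "Language model abuse limit, global, 0 to 100 percent of limit, rounded",
      labelNames: ["quota"],
    });

    // Histogram: counts observations per bucket, i.e., how often accounts sit at
    // up to 25%, 50%, 75%, 100% of their quota, plus a 110% overflow bucket.
    const quotaPerAccount = new Histogram({
      name: "llm_abuse_usage_account_pct",
      help: "Language model usage per account, percent of per-account quota",
      buckets: [25, 50, 75, 100, 110],
    });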
@@ -104,7 +93,6 @@ export async function checkForAbuse({
     (await getServerSettings()).kucalc === KUCALC_COCALC_COM;

   if (!isFreeModel(model, is_cocalc_com)) {
-    // we exclude Ollama (string), because it is free.
     const service = model2service(model) as LanguageServiceCore;
     // This is a for-pay product, so let's make sure user can purchase it.
     await assertPurchaseAllowed({ account_id, service });
@@ -122,7 +110,9 @@ export async function checkForAbuse({
     analytics_cookie,
   });

-  prom_quotas.labels("recent").observe(usage);
+  // this fluctuates for each account, so we tally up how often users end up in certain
+  // usage buckets; that's more explicit than a histogram over raw token counts
+  prom_quota_per_account.observe(100 * (usage / QUOTAS.account));

   // console.log("usage = ", usage);
   if (account_id) {
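Since Prometheus histogram buckets are cumulative upper bounds, each observation increments every bucket at or above it. A worked example of the arithmetic, assuming the per-account quota default of 10 ** 5 tokens mentioned in the removed header comment:

    const QUOTAS = { account: 10 ** 5, global: 10 ** 6 } as const; // assumed defaults

    const usage = 60_000; // tokens this account used in the current window
    const pct = 100 * (usage / QUOTAS.account); // 60
    // prom_quota_per_account.observe(pct) increments the cumulative buckets
    // le=75, le=100, le=110, and +Inf, but not le=25 or le=50.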
@@ -146,8 +136,9 @@ export async function checkForAbuse({
   // Prevent more sophisticated abuse, e.g., changing analytics_cookie or account frequently,
   // or just a general huge surge in usage.
   const overallUsage = await recentUsage({ cache: "long", period: "1 hour" });
-  prom_quotas.labels("global").observe(overallUsage);
-  // console.log("overallUsage = ", usage);
+  prom_quota_global
+    .labels("global")
+    .set(Math.round(100 * (overallUsage / QUOTAS.global)));
   if (overallUsage > QUOTAS.global) {
     prom_rejected.labels("global").inc();
     throw new Error(
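The switch from observe() to set() is the substantive change here: the removed code fed a fluctuating hourly token sum into histogram buckets spaced over 0..QUOTAS.global, which mostly recorded where the sum happened to sit at each check, whereas the gauge reports the current percentage of the cap directly. Illustrative numbers:

    // With overallUsage = 420_000 tokens in the past hour and QUOTAS.global = 10 ** 6:
    Math.round(100 * (420_000 / 10 ** 6)); // 42, so the gauge reads 42 (percent of the cap)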
@@ -175,11 +166,7 @@ async function recentUsage({
   const pool = getPool(cache);
   let query, args;
   if (account_id) {
-    const { rows } = await pool.query(
-      "SELECT COUNT(*) FROM accounts WHERE account_id=$1",
-      [account_id],
-    );
-    if (rows.length == 0) {
+    if (!(await isValidAccount(account_id))) {
       throw Error(`invalid account_id ${account_id}`);
     }
     query = `SELECT SUM(total_tokens) AS usage FROM openai_chatgpt_log WHERE account_id=$1 AND time >= NOW() - INTERVAL '${period}'`;
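The inline COUNT(*) query is replaced by the shared isValidAccount helper; note the removed check was also broken, since SELECT COUNT(*) always returns exactly one row, so rows.length == 0 could never fire. Judging from the import path, the helper is presumably shaped roughly like this (a sketch, not the actual module, which may also cache lookups):

    // Hypothetical sketch of ../accounts/is-valid-account
    import getPool from "@cocalc/database/pool";

    export default async function isValidAccount(
      account_id: string,
    ): Promise<boolean> {
      const pool = getPool();
      const { rows } = await pool.query(
        "SELECT COUNT(*) AS count FROM accounts WHERE account_id=$1",
        [account_id],
      );
      return parseInt(rows[0].count) > 0; // pg returns COUNT(*) as a string
    }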
