Initial inference documentation pass (#3330)

olliestanley · web-flow · commit 4a397e029885 · 2023-06-09T14:11:02.000+02:00
In the recent meeting it was raised that the codebase is not documented
and hard to understand as a result. This adds some initial
documentation. For now it is quite basic but hopefully will allow a bit
more understanding of some parts of the code "at a glance", reducing the
friction somewhat.
diff --git a/inference/safety/main.py b/inference/safety/main.py
@@ -1,4 +1,8 @@
-# A FastAPI server to run the safety pipeline
+"""
+A simple FastAPI server which serves a `blade2blade` safety model.
+
+See https://github.com/LAION-AI/blade2blade for context.
+"""
 
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
@@ -40,6 +44,7 @@ async def load_pipeline():
 
 
 async def async_predict(pipeline: Blade2Blade, inputs: str):
+    """Run predictions in a separate thread for a small server parallelism benefit."""
     return await asyncio.get_event_loop().run_in_executor(executor, pipeline.predict, inputs)
 
 
diff --git a/inference/safety/settings.py b/inference/safety/settings.py
@@ -2,6 +2,7 @@
 
 
 class Settings(pydantic.BaseSettings):
+    # HuggingFace model ID for the model to load in blade2blade
     safety_model_name: str = "shahules786/blade2blade-t5-base"
 
 
diff --git a/inference/server/export.py b/inference/server/export.py
@@ -1,3 +1,5 @@
+"""Script to facilitate exporting chat data from the server database."""
+
 import argparse
 import asyncio
 import contextlib
diff --git a/inference/server/main.py b/inference/server/main.py
@@ -56,6 +56,7 @@ def terminate_server(signum, frame):
 
 @app.on_event("startup")
 async def alembic_upgrade():
+    """Upgrades database schema based on Alembic migration scripts."""
     signal.signal(signal.SIGINT, terminate_server)
     if not settings.update_alembic:
         logger.warning("Skipping alembic upgrade on startup (update_alembic is False)")
@@ -113,7 +114,7 @@ async def maybe_add_debug_api_keys():
 app.include_router(workers.router)
 app.include_router(configs.router)
 
-# mount plugins
+# mount builtin plugins to be hosted on this server
 for app_prefix, sub_app in plugins.plugin_apps.items():
     app.mount(path=settings.plugins_path_prefix + app_prefix, app=sub_app)
 
diff --git a/inference/server/oasst_inference_server/admin.py b/inference/server/oasst_inference_server/admin.py
@@ -1,3 +1,5 @@
+"""Logic related to admin actions."""
+
 import fastapi
 from loguru import logger
 from oasst_inference_server import database, models
diff --git a/inference/server/oasst_inference_server/auth.py b/inference/server/oasst_inference_server/auth.py
@@ -1,3 +1,5 @@
+"""Logic related to authorization actions."""
+
 import hashlib
 import json
 from datetime import datetime, timedelta
diff --git a/inference/server/oasst_inference_server/chat_repository.py b/inference/server/oasst_inference_server/chat_repository.py
@@ -12,6 +12,8 @@
 
 
 class ChatRepository(pydantic.BaseModel):
+    """Wrapper around a database session providing functionality relating to chats."""
+
     session: database.AsyncSession
 
     class Config:
@@ -38,6 +40,10 @@ async def get_prompter_message_by_id(self, message_id: str) -> models.DbMessage:
     async def start_work(
         self, *, message_id: str, worker_id: str, worker_config: inference.WorkerConfig
     ) -> models.DbMessage:
+        """
+        Update an assistant message in the database to be allocated to a specific worker.
+        The message must be in `pending` state. An exception is raised if the message has timed out or was cancelled.
+        """
         logger.debug(f"Starting work on message {message_id}")
         message = await self.get_assistant_message_by_id(message_id)
 
@@ -65,6 +71,10 @@ async def start_work(
         return message
 
     async def reset_work(self, message_id: str) -> models.DbMessage:
+        """
+        Update an assistant message in the database which has already been allocated to a worker to remove the
+        allocation and reset the message state to `pending`.
+        """
         logger.warning(f"Resetting work on message {message_id}")
         message = await self.get_assistant_message_by_id(message_id)
         message.state = inference.MessageState.pending
@@ -78,6 +88,7 @@ async def reset_work(self, message_id: str) -> models.DbMessage:
         return message
 
     async def abort_work(self, message_id: str, reason: str) -> models.DbMessage:
+        """Update an assistant message in the database to mark it as having been aborted by the allocated worker."""
         logger.warning(f"Aborting work on message {message_id}")
         message = await self.get_assistant_message_by_id(message_id)
         message.state = inference.MessageState.aborted_by_worker
@@ -88,7 +99,13 @@ async def abort_work(self, message_id: str, reason: str) -> models.DbMessage:
         await self.session.refresh(message)
         return message
 
-    async def complete_work(self, message_id: str, content: str, used_plugin: inference.PluginUsed) -> models.DbMessage:
+    async def complete_work(
+        self, message_id: str, content: str, used_plugin: inference.PluginUsed | None
+    ) -> models.DbMessage:
+        """
+        Update an assistant message in the database to mark it as having been completed with the given content, also
+        updating the used plugin if one is specified.
+        """
         logger.debug(f"Completing work on message {message_id}")
         message = await self.get_assistant_message_by_id(message_id)
         message.state = inference.MessageState.complete
diff --git a/inference/server/oasst_inference_server/chat_utils.py b/inference/server/oasst_inference_server/chat_utils.py
@@ -3,6 +3,7 @@
 
 
 def get_model_config(model_config_name: str) -> model_configs.ModelConfig:
+    """Get a `ModelConfig` by its name. See `oasst_shared.model_configs`."""
     if settings.allowed_model_config_names != "*":
         if model_config_name not in settings.allowed_model_config_names_list:
             raise ValueError(f"Model {model_config_name} not in allowed models: {settings.allowed_model_config_names}")
diff --git a/inference/server/oasst_inference_server/compliance.py b/inference/server/oasst_inference_server/compliance.py
@@ -1,3 +1,5 @@
+"""Logic related to worker compliance checks, which seek to ensure workers do not produce malicious responses."""
+
 import datetime
 from typing import cast
 
@@ -14,6 +16,10 @@
 async def find_compliance_work_request_message(
     session: database.AsyncSession, worker_config: inference.WorkerConfig, worker_id: str
 ) -> models.DbMessage | None:
+    """
+    Find a suitable assistant message to carry out a worker compliance check for the given worker. Such a message must
+    have been generated by a different worker, but one with the same compatibility hash as the given worker.
+    """
     compat_hash = worker_config.compat_hash
     query = (
         sqlmodel.select(models.DbMessage)
@@ -30,6 +36,10 @@ async def find_compliance_work_request_message(
 
 
 async def should_do_compliance_check(session: database.AsyncSession, worker_id: str) -> bool:
+    """
+    Check whether we should carry out a compliance check for the given worker, based on time since last check.
+    Trusted workers are excluded.
+    """
     worker = await worker_utils.get_worker(worker_id, session)
     if worker.trusted:
         return False
@@ -43,6 +53,13 @@ async def should_do_compliance_check(session: database.AsyncSession, worker_id:
 
 
 async def run_compliance_check(websocket: fastapi.WebSocket, worker_id: str, worker_config: inference.WorkerConfig):
+    """
+    Run a compliance check for the given worker:
+    - Find a suitable compliance check assistant message
+    - Task the worker with generating a response with the same context
+    - Compare the response against the existing completed message
+    - Update the database with the outcome
+    """
     async with deps.manual_create_session() as session:
         try:
             worker = await worker_utils.get_worker(worker_id, session)
diff --git a/inference/server/oasst_inference_server/database.py b/inference/server/oasst_inference_server/database.py
@@ -75,6 +75,7 @@ async def get_async_session(autoflush=True):
 
 
 def alembic_upgrade(connection):
+    """Upgrades database schema based on Alembic migration scripts."""
     alembic_ini_path = Path(__file__).parent.parent / "alembic.ini"
     alembic_cfg = alembic.config.Config(str(alembic_ini_path))
     alembic_cfg.set_main_option("sqlalchemy.url", settings.database_uri)
diff --git a/inference/server/oasst_inference_server/plugin_utils.py b/inference/server/oasst_inference_server/plugin_utils.py
@@ -10,6 +10,7 @@
 
 
 async def attempt_fetch_plugin(session: aiohttp.ClientSession, url: str, timeout: float = 5.0):
+    """Attempt to fetch a plugin specification from the given URL once."""
     async with session.get(url, timeout=timeout) as response:
         content_type = response.headers.get("Content-Type")
 
@@ -41,6 +42,7 @@ async def attempt_fetch_plugin(session: aiohttp.ClientSession, url: str, timeout
 
 
 async def fetch_plugin(url: str, retries: int = 3, timeout: float = 5.0) -> inference.PluginConfig:
+    """Fetch a plugin specification from the given URL, with retries using exponential backoff."""
     async with aiohttp.ClientSession() as session:
         for attempt in range(retries):
             try:
diff --git a/inference/server/oasst_inference_server/user_chat_repository.py b/inference/server/oasst_inference_server/user_chat_repository.py
@@ -9,6 +9,8 @@
 
 
 class UserChatRepository(pydantic.BaseModel):
+    """Wrapper around a database session providing user-specific functionality relating to chats."""
+
     session: database.AsyncSession
     user_id: str = pydantic.Field(..., min_length=1)
 
diff --git a/inference/server/oasst_inference_server/worker_utils.py b/inference/server/oasst_inference_server/worker_utils.py
@@ -54,6 +54,7 @@ async def get_worker_id(
     api_key: str = Depends(get_api_key),
     protocol_version: str = Depends(get_protocol_version),
 ) -> models.DbWorker:
+    """Get the ID of a worker from its API key and protocol version."""
     logger.info(f"get_worker: {api_key=}, {protocol_version=}")
     query = sqlmodel.select(models.DbWorker).where(models.DbWorker.api_key == api_key)
     async with deps.manual_create_session() as session:
@@ -107,6 +108,11 @@ async def build_work_request(
     session: database.AsyncSession,
     message_id: str,
 ) -> inference.WorkRequest:
+    """
+    Build a work request based on the assistant message associated with the given ID in the database.
+    This will build a chat history based on the parents of the assistant message which will form the work request along
+    with the work parameters associated with the assistant message.
+    """
     query = (
         sqlmodel.select(models.DbMessage)
         .options(
diff --git a/inference/worker/basic_hf_server.py b/inference/worker/basic_hf_server.py
@@ -1,4 +1,7 @@
-# a basic fastapi server to run generation on HF models
+"""
+Basic FastAPI server to serve models using HuggingFace Transformers library.
+This is an alternative to running the HuggingFace `text-generation-inference` (tgi) server.
+"""
 
 import sys
 import threading
@@ -48,6 +51,7 @@ async def log_exceptions(request: fastapi.Request, call_next):
 
 
 def model_thread():
+    """Continually obtain new work requests from the model input queue and work on them."""
     model: transformers.PreTrainedModel
     tokenizer: transformers.PreTrainedTokenizer
     model, tokenizer, decode_token = load_models()
diff --git a/inference/worker/hf_langchain_inference.py b/inference/worker/hf_langchain_inference.py
@@ -4,6 +4,8 @@
 
 
 class HFInference(LLM):
+    """LangChain LLM implementation which uses the HF inference server configured in the worker settings."""
+
     max_new_tokens: int = 512
     top_k: int | None = None
     top_p: float | None = None
diff --git a/inference/worker/hf_stopping.py b/inference/worker/hf_stopping.py
@@ -4,6 +4,8 @@
 
 
 class SequenceStoppingCriteria(StoppingCriteria):
+    """Enables automatic stopping of model text generation when specific text sequences are generated."""
+
     def __init__(
         self,
         tokenizer: Tokenizer,
diff --git a/inference/worker/utils.py b/inference/worker/utils.py
@@ -20,6 +20,14 @@
 
 
 class TokenBuffer:
+    """
+    A buffer for storing and managing tokens based on various conditions including stop sequences.
+
+    The TokenBuffer class accumulates tokens while keeping track of the length and manages the tokens based on the stop
+    sequences provided during initialization. Tokens can be added to the buffer and, once generation finishes, iterated
+    over in a way that depends on the finish reason.
+    """
+
     def __init__(self, stop_sequences: list[str]) -> None:
         self.stop_sequences = stop_sequences
         self.longest_stop_len = max((len(stop) for stop in stop_sequences), default=1)
@@ -65,6 +73,7 @@ def finish(self, reason: Literal["length", "eos_token", "stop_sequence"]) -> Ite
 
 
 def get_max_input_length(worker_config: inference.WorkerConfig, plugin_used: bool):
+    """Get the maximum possible input length based on the worker config and whether a plugin is in use."""
     max_input_length = worker_config.model_config.max_input_length
     if plugin_used:
         max_input_length = max_input_length - 1
@@ -78,6 +87,13 @@ def truncate_prompt(
     prompt: str,
     plugin_used: bool,
 ):
+    """
+    Truncate a prompt to ensure it does not exceed the maximum input length. Regardless of truncation, the system
+    prompt is always retained if it is present. If truncation removes the final prompter prefix, a new one is added.
+
+    The stream generation parameters are also updated with a maximum new tokens value which will not cause the total
+    length to exceed the maximum specified in the worker's model config.
+    """
     with shared_tokenizer_lock:
         ids = tokenizer.encode(prompt)
         prompter_prefix_id = tokenizer.convert_tokens_to_ids(V2_PROMPTER_PREFIX)
@@ -123,6 +139,7 @@ def truncate_prompt(
 
 
 def wait_for_inference_server(http: "HttpClient", timeout: int = 600):
+    """Wait for the "health" endpoint of the inference server to return status 200."""
     time_limit = time.time() + timeout
     while True:
         try:
@@ -139,7 +156,13 @@ def wait_for_inference_server(http: "HttpClient", timeout: int = 600):
             break
 
 
-def text_to_events(text: str, seed: int | None = None, pause: float = 0.0):
+def text_to_events(
+    text: str, seed: int | None = None, pause: float = 0.0
+) -> Iterable[interface.GenerateStreamResponse]:
+    """
+    Iterate over stream generation "events" derived from the given text, where each word in the text is treated as a
+    generated "token".
+    """
     tokens = text.split()
     for token in tokens[:-1]:
         yield interface.GenerateStreamResponse(
@@ -184,6 +207,8 @@ def send_response(
 
 
 class HttpClient(pydantic.BaseModel):
+    """Basic HTTP client built around `requests`. Supports simple authentication."""
+
     base_url: str
     basic_auth_username: str | None = None
     basic_auth_password: str | None = None
@@ -212,7 +237,10 @@ def post(self, path: str, **kwargs):
         return requests.post(self.base_url + path, auth=self.auth, **kwargs)
 
 
-def get_inference_server_stream_events(request: interface.GenerateStreamRequest):
+def get_inference_server_stream_events(
+    request: interface.GenerateStreamRequest,
+) -> Iterable[interface.GenerateStreamResponse]:
+    """Query the model inference server specified in the worker settings and stream the generation events."""
     http = HttpClient(
         base_url=settings.inference_server_url,
         basic_auth_username=settings.basic_auth_username,
diff --git a/inference/worker/work.py b/inference/worker/work.py
@@ -27,6 +27,7 @@ def make_prompt_and_parameters(
     tokenizer: transformers.PreTrainedTokenizer,
     work_request: inference.WorkRequest,
 ) -> tuple[str, interface.GenerateStreamParameters]:
+    """Prepare a formatted prompt and stream generation parameters based on a work request."""
     if settings.oa_protocol_version != "v2":
         raise RuntimeError(f"Unsupported oa protocol version: {settings.oa_protocol_version}")
 
@@ -36,13 +37,15 @@ def _prepare_message(message: inference.MessageRead) -> str:
         prefix = V2_ASST_PREFIX if message.is_assistant else V2_PROMPTER_PREFIX
         return prefix + message.content + eos_token
 
-    # construct prompt
+    # Construct prompt
     messages = [_prepare_message(message) for message in work_request.thread.messages]
 
+    # Prepend system prompt if it was specified in work parameters
     if work_request.parameters.system_prompt:
         pre_prompt = V2_SYSTEM_PREFIX + work_request.parameters.system_prompt + eos_token
         messages = [pre_prompt] + messages
 
+    # Stringify and append assistant prefix to signify start of generation
     prompt = "".join(messages) + V2_ASST_PREFIX
 
     parameters = interface.GenerateStreamParameters.from_work_parameters(work_request.parameters)
@@ -61,17 +64,23 @@ def _prepare_message(message: inference.MessageRead) -> str:
 
 
 def prepare_safe_prompt(prompt: str, label: str, rots: str) -> str:
+    """Given a prompt, safety label, and safety rule of thumb, prepare a 'safe prompt' to replace the prompt."""
     pre_prompt = f"Answer the following request with {label} as responsible chatbot that believes that {rots}: "
     input_list = prompt.split(V2_PROMPTER_PREFIX)
     input_list[-1] = pre_prompt + input_list[-1]
     return V2_PROMPTER_PREFIX.join(input_list)
 
 
 def is_safety_triggered(safety_label: str, safety_level: int) -> bool:
+    """
+    Determines whether to trigger the safe prompt based on the configured safety level and severity label from the
+    safety classifier.
+    """
     return ("caution" in safety_label and safety_level > 1) or ("intervention" in safety_label and safety_level > 0)
 
 
 def parse_safety_response(safety_opinion: str) -> tuple[str, str]:
+    """Parse the response from the safety model into a separate label and rule of thumb."""
     safety_opinion = re.sub(r"<pad>|</s>", "", safety_opinion).split("<sep>")
     label, rots = safety_opinion[0], "and".join([x.strip(".") for x in safety_opinion[1:]])
     label = label.replace("<pad>", "").strip()
@@ -84,6 +93,7 @@ def handle_work_request(
     work_request: inference.WorkRequest,
     worker_config: inference.WorkerConfig,
 ):
+    """Handle a work request from end-to-end. Handles plugins and safety if enabled."""
     parameters = interface.GenerateStreamParameters.from_work_parameters(work_request.parameters)
     prompt = ""
     used_plugin = None
@@ -209,6 +219,7 @@ def handle_work_request(
 
 
 def get_safety_server_response(request: inference.SafetyRequest) -> inference.SafetyResponse:
+    """Query the safety server URL configured in the worker settings."""
     http = utils.HttpClient(base_url=settings.safety_server_url)
     response = http.post("/safety", json=request.dict())
     try:

Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,7 @@`
`2`	`2`
`3`	`3`
`4`	`4`	`class Settings(pydantic.BaseSettings):`
	`5`	`+ # HuggingFace model ID for the model to load in blade2blade`
`5`	`6`	`safety_model_name: str = "shahules786/blade2blade-t5-base"`
`6`	`7`
`7`	`8`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+"""Script to facilitate exporting chat data from the server database."""`
	`2`	`+`
`1`	`3`	`import argparse`
`2`	`4`	`import asyncio`
`3`	`5`	`import contextlib`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+"""Logic related to admin actions."""`
	`2`	`+`
`1`	`3`	`import fastapi`
`2`	`4`	`from loguru import logger`
`3`	`5`	`from oasst_inference_server import database, models`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+"""Logic related to authorization actions."""`
	`2`	`+`
`1`	`3`	`import hashlib`
`2`	`4`	`import json`
`3`	`5`	`from datetime import datetime, timedelta`