linagora
diff --git a/‎.hydra_config/config.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.hydra_config/config.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/content/docs/documentation/API.mdx‎
Lines changed: 12 additions & 0 deletions b/‎docs/content/docs/documentation/API.mdx‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docs/content/docs/documentation/milvus_migration.md‎
Lines changed: 63 additions & 19 deletions b/‎docs/content/docs/documentation/milvus_migration.md‎
Lines changed: 63 additions & 19 deletions
diff --git a/‎docs/content/docs/documentation/temporality.md‎
Lines changed: 138 additions & 0 deletions b/‎docs/content/docs/documentation/temporality.md‎
Lines changed: 138 additions & 0 deletions
diff --git a/‎extern/indexer-ui‎ b/‎extern/indexer-ui‎
diff --git a/‎openrag/components/indexer/utils/files.py‎
Lines changed: 26 additions & 1 deletion b/‎openrag/components/indexer/utils/files.py‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎openrag/components/indexer/utils/test_files.py‎
Lines changed: 29 additions & 2 deletions b/‎openrag/components/indexer/utils/test_files.py‎
Lines changed: 29 additions & 2 deletions
@@ -40,6 +40,7 @@ vectordb:
   collection_name: ${oc.env:VDB_COLLECTION_NAME, vdb_test}
   hybrid_search: ${oc.env:VDB_HYBRID_SEARCH, true}
   enable: true
+  schema_version: 1  # Increment when the collection schema changes and a migration is required
 
 rdb:
   host: ${oc.env:POSTGRES_HOST, rdb}
 
@@ -78,6 +78,18 @@ Upload a new file to a specific partition for indexing.
 - `201 Created`: Returns task status URL
 - `409 Conflict`: File already exists in partition
 
+##### Temporal Filtering
+OpenRAG supports temporal filtering to retrieve documents from specific time periods.
+The client can include the temporal field to allow temporal-aware search in search endpoints.
+
+* `created_at`: ISO 8601 format date of when the file was created
+
+:::info
+`created_at` is provided by the client in the metadata of the file during upload.
+This is a first iteration — additional temporal fields (e.g. `updated_at`) may be added in future releases as needed.
+:::
+
+
 ##### Upload files while modeling relations between them
 
 OpenRAG supports document relationships to enable context-aware retrieval.
 
@@ -41,24 +41,6 @@ results = client.query(
 > * `PT3H` = 3 hours
 > * `P2DT6H` = 2 days and 6 hours.
 
-## Current State
-
-:::info
-Temporal fields are currently stored as **strings**, not **`TIMESTAMPTZ`**. Migrating to `TIMESTAMPTZ` requires a schema and index change, and Milvus doesn't support migrations on schema and index changes: it has to be handled manually.
-
-Until a Milvus schema & index migration strategy is defined, filtering still works via **lexicographic string comparison** on ISO 8601 strings:
-```python
-expr = "tsz != '2025-01-03T00:00:00+08:00'"  # No ISO/INTERVAL keywords
-results = client.query(
-    collection_name,
-    filter=expr,
-    output_fields=["id", "tsz"],
-    limit=10
-)
-```
-Full `TIMESTAMPTZ` support will be activated in a future release once the migration is established.
-:::
-
 ## Milvus version upgrade Steps
 :::danger[Before running Milvus Version Migration]
 These steps must be performed on a deployment running OpenRAG **prior to version 1.1.6** (Milvus 2.5.4) before switching to the newest version of OpenRAG.
@@ -129,4 +111,66 @@ docker inspect milvus-standalone --format '{{ .Config.Image }}'
 # Expected: milvusdb/milvus:v2.6.11
 ```
 
-Now you can switch to the newest release of OpenRAG and it should work fine.
+Now you can switch to the newest release of OpenRAG and it should work fine.
+
+## Schema Migration — Add Temporal Fields
+
+:::info
+This migration adds a `TIMESTAMPTZ` field, `created_at`, and a `STL_SORT` index on it to an existing collection.
+
+Existing documents will have `null` for that field; new documents will have it populated at index time.
+:::
+
+:::danger[OpenRAG must be stopped]
+Stop the OpenRAG application before running this migration.
+:::
+
+### Step 1 — Start only the Milvus container
+
+```bash
+docker compose up -d milvus
+```
+
+Wait until Milvus is healthy:
+
+```bash
+docker compose ps milvus
+```
+
+### Step 2 — Dry-run (inspect, no changes)
+
+```bash
+docker compose run --no-deps --rm --build --entrypoint "" openrag \
+    uv run python scripts/migrations/milvus/1.add_temporal_fields.py --dry-run
+```
+
+Review the output to confirm which fields and indexes are missing.
+
+### Step 3 — Apply the migration
+
+```bash
+docker compose run --no-deps --rm --build --entrypoint "" openrag \
+    uv run python scripts/migrations/milvus/1.add_temporal_fields.py
+```
+
+The script will:
+1. Add any missing `TIMESTAMPTZ` fields (nullable)
+2. Create `STL_SORT` indexes for each field
+3. Stamp the collection with `schema_version=1` so OpenRAG no longer reports a migration error on startup
+
+### Step 4 — Restart OpenRAG
+
+```bash
+docker compose up --build -d
+```
+
+### Rollback
+
+Milvus does not yet support dropping fields. The rollback only removes the indexes and resets the version stamp — the fields remain in the schema but are unused:
+
+```bash
+docker compose run --no-deps --rm --build --entrypoint "" openrag \
+    uv run python scripts/migrations/milvus/1.add_temporal_fields.py --downgrade
+```
+
+To fully remove the fields you would need to recreate the collection from scratch.
@@ -0,0 +1,138 @@
+---
+title: Temporality 
+---
+
+# Milvus representation
+
+* As scalar field
+
+Scalar fields store primitive, structured values—commonly referred to as metadata—such as numbers, strings, or dates.
+
+They allow you to narrow search results based on specific attributes, like limiting documents to a particular category or a defined **time range**.
+
+    * You can set nullable=True for TIMESTAMPTZ fields to allow missing values.
+    * You can specify a default timestamp value using the default_value attribute in ISO 8601 format.
+
+* format: timestamp (ISO 8601 format)
+    * All temporal fields are stored in ISO 8601 format
+
+* **Automatic date extraction**
+
+# Operation
+## Add a TIMESTAMPTZ field that allows null values
+* schema.add_field("tsz", DataType.TIMESTAMPTZ, nullable=True)
+* You can specify a default timestamp value using the **`default_value`** attribute in **`ISO 8601` format**.
+
+
+## Filtering operations
+
+Compatible with Milvus 2.6.6+
+
+* **`TIMESTAMPTZ`** supports scalar comparisons, interval arithmetic, and extraction of time components.
+
+* **Comparison and filtering**: All filtering and ordering operations are performed in UTC, ensuring consistent and predictable results across different time zones.
+
+* Query with timestamp filtering
+    * Use comparison operators like ==, !=, <, >, <=, >=. For the full list of operators available in Milvus, refer to [Basic Operators](https://milvus.io/docs/basic-operators.md)
+
+    * timestamp filtering
+
+        ```python
+        expr = "tsz != ISO '2025-01-03T00:00:00+08:00'"
+
+        results = client.query(
+            collection_name=collection_name,
+            filter=expr,
+            output_fields=["id", "tsz"],
+            limit=10
+        )
+
+        print("Query result: ", results)
+        ```
+
+    * Interval operations
+        * You can perform arithmetic on TIMESTAMPTZ fields using INTERVAL values in the ISO 8601 duration format. This allows you to add or subtract durations, such as days, hours, or minutes, from a timestamp when filtering data.
+        
+        ```python
+        expr = "tsz + INTERVAL 'P0D' != ISO '2025-01-03T00:00:00+08:00'"
+
+        results = client.query(
+            collection_name, 
+            filter=expr, 
+            output_fields=["id", "tsz"], 
+            limit=10
+        )
+
+        print("Query result: ", results)
+        ```
+
+        * **`INTERVAL`** values follow the **`ISO 8601` duration** syntax. For example:
+            * P1D → 1 day
+            * PT3H → 3 hours
+            * P2DT6H → 2 days and 6 hours
+        
+        * You can use **`INTERVAL`** arithmetic directly in filter expressions, such as:
+            * tsz + INTERVAL 'P3D' → Adds 3 days
+            * tsz - INTERVAL 'PT2H' → Subtracts 2 hours
+    
+    * Search with timestamp filtering
+        * You can combine **`TIMESTAMPTZ`** filtering with vector similarity search to narrow results by both time and similarity.
+
+
+
+--------
+
+* Migration from Milvus v2.5.4 to v2.6.11
+    * TIMESTAMPTZ is compatible with Milvus 2.6.6+
+
+    * Migration according to the release notes for Milvus Standalone: https://milvus.io/docs/upgrade_milvus_standalone-docker.md 
+        * `You must upgrade to v2.5.16 or later before upgrading to v2.6.11.`
+    
+        * Steps for upgrading: https://milvus.io/docs/upgrade_milvus_standalone-docker.md#Upgrade-process
+
+        * Issue: after moving from Milvus 2.5.4 to 2.6.11 following https://milvus.io/docs/upgrade_milvus_standalone-docker.md, collections created in 2.5.4 can't be loaded: the load operation runs forever.
+
+        * https://github.com/milvus-io/milvus/issues/43295
+
+        * https://www.perplexity.ai/search/i-ve-moved-from-milvs-2-5-4-to-CDHCle5hQl.qsUa_nw4WHQ
+
+
+
+
+* Done successfully   
+
+-----
+
+* Setting `datatype=DataType.TIMESTAMPTZ` for the field `created_at`
+
+* Search
+    * search_params for search https://milvus.io/api-reference/pymilvus/v2.6.x/MilvusClient/Vector/search.md#Request-syntax
+    * param via AnnSearchRequest: https://milvus.io/api-reference/pymilvus/v2.6.x/MilvusClient/Vector/hybrid_search.md#Request-Syntax
+
+
+-----
+
+* Finally, I managed to make it work by following the migration steps
+
+* Logical operators
+    * Logical operators are used to combine multiple conditions into a more complex filter expression. These include AND, OR, and NOT.
+
+* Range operators
+    * https://milvus.io/docs/basic-operators.md#Range-operators
+    * Supported Range Operators:
+        * IN: Used to match values within a specific set or range.
+        * LIKE: Used to match a pattern (mostly for text fields). Milvus allows you to build an NGRAM index on VARCHAR or JSON fields to accelerate text queries. For details, refer to [NGRAM](https://milvus.io/docs/ngram.md).
+    
+
+## Time
+
+Time fields
+
+* datetime
+* modified_at
+* created_at
+==> Added
+* indexed_at
+
+
+# Reorder
@@ -1,12 +1,13 @@
 import re
 import secrets
 import time
+from datetime import UTC, datetime
 from pathlib import Path
 
 import aiofiles
 import consts
 from components.utils import load_config
-from fastapi import UploadFile
+from fastapi import HTTPException, UploadFile, status
 
 config = load_config()
 SERIALIZE_TIMEOUT = config.ray.indexer.get("serialize_timeout", 3600)
@@ -84,3 +85,27 @@ async def serialize_file(task_id: str, path: str, metadata: dict | None = None):
         timeout=SERIALIZE_TIMEOUT,
         task_description=f"Serialization task {task_id}",
     )
+
+
+def extract_temporal_fields(metadata: dict, temporal_fields: list) -> dict:
+    """Validate and normalise the temporal fields found in ``metadata``.
+
+    Args:
+        metadata: Metadata dict to read the temporal values from.
+        temporal_fields: Names of the metadata keys to treat as datetimes.
+
+    Returns:
+        A dict mapping every requested field that is present in ``metadata``
+        with a non-``None`` value to its ISO 8601 string. Naive datetimes are
+        assumed to be UTC; values that already carry an offset keep it.
+
+    Raises:
+        HTTPException: 400 Bad Request when a value cannot be parsed as an
+            ISO 8601 datetime.
+    """
+    result = {}
+
+    for field in temporal_fields:
+        # Missing or null fields are left out of the result; no fallback
+        # value is derived for them.
+        if field not in metadata or metadata[field] is None:
+            continue
+
+        datetime_str = metadata[field]
+        try:
+            # Parsing doubles as validation of the provided value.
+            d = datetime.fromisoformat(datetime_str)
+        except (TypeError, ValueError) as err:
+            # TypeError: the value is not a string; ValueError: not ISO 8601.
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Invalid ISO 8601 datetime field ({datetime_str}) for field '{field}'.",
+            ) from err
+
+        if d.tzinfo is None:
+            d = d.replace(tzinfo=UTC)
+        result[field] = d.isoformat()
+
+    return result
@@ -2,9 +2,9 @@
 from pathlib import Path
 
 import pytest
-from fastapi import UploadFile
+from fastapi import HTTPException, UploadFile
 
-from .files import sanitize_filename, save_file_to_disk
+from .files import extract_temporal_fields, sanitize_filename, save_file_to_disk
 
 
 @pytest.mark.asyncio
@@ -83,3 +83,30 @@ def fake_make_unique_filename(filename: str) -> str:
 )
 def test_sanitize_filename(input_name, expected):
     assert sanitize_filename(input_name) == expected
+
+
+# --- extract_temporal_fields ---
+
+
+def test_extract_temporal_fields_field_not_in_metadata():
+    """A requested field that is absent from the metadata yields an empty result."""
+    extracted = extract_temporal_fields({}, ["created_at"])
+    assert extracted == {}
+
+
+def test_extract_temporal_fields_naive_datetime_defaults_to_utc():
+    """A datetime without an offset is returned with a +00:00 offset."""
+    expected = {"created_at": "2024-06-15T12:30:00+00:00"}
+    extracted = extract_temporal_fields({"created_at": "2024-06-15T12:30:00"}, ["created_at"])
+    assert extracted == expected
+
+
+def test_extract_temporal_fields_with_timezone():
+    """A datetime that already carries an offset is returned unchanged."""
+    expected = {"created_at": "2024-06-15T12:30:00+02:00"}
+    extracted = extract_temporal_fields({"created_at": "2024-06-15T12:30:00+02:00"}, ["created_at"])
+    assert extracted == expected
+
+
+def test_extract_temporal_fields_invalid_datetime_raises_400():
+    """An unparsable value is rejected with a 400 naming both the value and the field."""
+    bad_metadata = {"created_at": "not-a-date"}
+    with pytest.raises(HTTPException) as raised:
+        extract_temporal_fields(bad_metadata, ["created_at"])
+    error = raised.value
+    assert error.status_code == 400
+    assert "not-a-date" in error.detail
+    assert "created_at" in error.detail