id: using hashlib implemented

GorkiPower · GorkiPower · commit bca088ffe1c8 · 2025-12-10T14:57:15.000+01:00
diff --git a/backend/datamodel.py b/backend/datamodel.py
@@ -1,19 +1,42 @@
-from datetime import datetime
 
-from pydantic import BaseModel, Field
+from datetime import datetime
+from typing import Optional, Dict, Any
+from pydantic import BaseModel, Field, root_validator
+import hashlib
+import json
 
 
 class SchemaDefinition(BaseModel):
-    id: str = Field(..., description="Unique identifier for the schema")
-    name: str | None = Field(
+    id: str = Field(..., description="Unique identifier for the schema (content hash)")
+    name: Optional[str] = Field(
         None,
         description="Human-readable name of the schema",
         min_length=3,
     )
-    version: str | None = Field(None, description="Version of the schema")
-    content: dict | None = Field(
+    version: Optional[str] = Field(None, description="Version of the schema")
+    content: Optional[Dict[str, Any]] = Field(
         None, description="The actual schema content as a dictionary"
     )
-    updated_at: datetime | None = Field(
+    updated_at: Optional[datetime] = Field(
         None, description="Timestamp of the last update"
     )
+
+    @staticmethod
+    def _compute_hash(name: Optional[str], version: Optional[str], content: Optional[Dict[str, Any]]) -> str:
+        """Return the SHA-256 hex digest of the canonical JSON encoding of {name, version, content}."""
+        # Sorted keys and fixed separators make the serialized text independent of dict ordering.
+        fields = {"name": name, "version": version, "content": content}
+        serialized = json.dumps(fields, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
+        digest = hashlib.sha256(serialized.encode("utf-8"))
+        return digest.hexdigest()
+
+    @root_validator(pre=True)
+    def assign_id_from_content(cls, values):
+        """
+        Overwrite `id` with the hash of {name, version, content} and return the values.
+        Any caller-supplied `id` is discarded; missing fields are hashed as None.
+        """
+        values["id"] = cls._compute_hash(
+            values.get("name"), values.get("version"), values.get("content")
+        )
+        return values  # pydantic uses the validator's return value as the model input
diff --git a/backend/main.py b/backend/main.py
@@ -1,13 +1,14 @@
+
 from datetime import datetime
-from typing import Any, Optional
-from uuid import uuid4
+from typing import Any, List, Dict, Optional
 
 from database import schemas_collection
 from datamodel import SchemaDefinition
 from fastapi import FastAPI, HTTPException
 from fastapi.encoders import jsonable_encoder
 from fastapi.middleware.cors import CORSMiddleware
 
+
 # ---- FastAPI app & CORS ----
 app = FastAPI()
 app.add_middleware(
@@ -19,91 +20,130 @@
 )
 
 
+def _compute_hash_from_doc(doc: Dict[str, Any]) -> str:
+    """
+    Compute the content hash of a raw stored document.
+
+    Delegates to SchemaDefinition._compute_hash with the document's
+    `name`, `version` and `content` values (missing keys count as None),
+    so no model instance has to be built.
+    """
+    # Function-local import; the hashing logic itself lives only in SchemaDefinition
+    from datamodel import SchemaDefinition
+    fields = (doc.get(key) for key in ("name", "version", "content"))
+    return SchemaDefinition._compute_hash(*fields)
+
+
 # ---- Routes ----
-@app.get("/schemas", response_model=list[SchemaDefinition])
-async def get_all_schemas() -> list[dict[str, Any]]:
+@app.get("/schemas", response_model=List[SchemaDefinition])
+async def get_all_schemas() -> List[Dict[str, Any]]:
     """
-    Retrieve all schemas. Ensures each document has `id` as a string and a valid `updated_at`.
+    Retrieve all schemas. Ensures each document has `id` equal to its content hash
+    and a non-None `updated_at`. A missing/mismatched stored `id` or a missing
+    `updated_at` is corrected in the response and written back to the collection.
     """
     docs = list(schemas_collection.find())
-    normalized: list[dict[str, Any]] = []
-
+    normalized: List[Dict[str, Any]] = []
     for d in docs:
-        # Ensure `id` exists and is a non-empty string
-        if not isinstance(d.get("id"), str) or not d["id"].strip():
-            d["id"] = str(uuid4())
-
-        # Ensure `updated_at` is present (optional safeguard)
+        # Recompute the hash from the stored name/version/content
+        computed_id = _compute_hash_from_doc(d)
+        if d.get("id") != computed_id:
+            # Stored id is missing or stale: fix it in the response and in the collection
+            d["id"] = computed_id
+            # Only `id` is written here; updated_at is left untouched by this repair
+            schemas_collection.update_one({"_id": d["_id"]}, {"$set": {"id": computed_id}})
+
+        # Backfill a missing updated_at with the current UTC time and persist it
         if d.get("updated_at") is None:
             d["updated_at"] = datetime.utcnow()
+            schemas_collection.update_one({"_id": d["_id"]}, {"$set": {"updated_at": d["updated_at"]}})
 
+        # Drop MongoDB's internal _id so it is not part of the returned JSON
+        d.pop("_id", None)
         normalized.append(d)
 
-    # No BSON present; plain JSON encoding is fine
     return jsonable_encoder(normalized)
 
 
 @app.post("/schemas", response_model=SchemaDefinition)
-async def add_schema(schema: SchemaDefinition) -> dict[str, Any]:
+async def add_schema(schema: SchemaDefinition) -> Dict[str, Any]:
     """
-    Add a new schema. If `id` is missing/empty, generate one; always refresh `updated_at`.
+    Add a new schema. `id` is deterministically computed from {name, version, content}.
+    The server sets `updated_at` to the current UTC time.
     """
-    doc = schema.dict()
-
-    # Guarantee a usable id
-    if not isinstance(doc.get("id"), str) or not doc["id"].strip():
-        doc["id"] = str(uuid4())
-
-    # Server-side timestamp
+    # Rebuild the model from name/version/content only, so the id is derived from content
+    model = SchemaDefinition(
+        name=schema.name,
+        version=schema.version,
+        content=schema.content,
+        updated_at=None,
+        id="ignored"  # placeholder: the model's pre root validator overwrites it
+    )
+    doc = model.dict()
     doc["updated_at"] = datetime.utcnow()
 
-    # Insert as-is (no ObjectId conversions)
+    # Insert as-is. NOTE(review): insert_one likely adds an ObjectId `_id` to `doc` - confirm the encoder copes
     schemas_collection.insert_one(doc)
 
     # Return exactly what we stored
     return jsonable_encoder(doc)
 
 
-@app.put("/schemas/{id}", response_model=dict[str, str])
-async def update_schema(id: str, update: SchemaDefinition) -> dict[str, str]:
+@app.put("/schemas/{id}", response_model=Dict[str, str])
+async def update_schema(id: str, update: SchemaDefinition) -> Dict[str, str]:
     """
-    Update schema by `id`. Ignores `id` field in the payload (primary key is immutable).
-    Only non-None fields are updated; `updated_at` is refreshed automatically.
+    Update schema by `id`. Because `id` is a content hash, any change in
+    {name, version, content} will produce a new `id`. This endpoint:
+      1) Finds the existing document by the current `id`.
+      2) Merges provided fields (ignores `None` and any `id` supplied).
+      3) Recomputes `id` from merged content.
+      4) Replaces the document and returns the (possibly new) `id`.
     """
     if not isinstance(id, str) or not id.strip():
-        raise HTTPException(
-            status_code=400, detail="Invalid schema id (must be a non-empty string)"
-        )
+        raise HTTPException(status_code=400, detail="Invalid schema id (must be a non-empty string)")
 
-    # Ignore None values and prevent changing the primary key
-    update_fields = {
-        k: v for k, v in update.dict().items() if v is not None and k != "id"
-    }
+    existing = schemas_collection.find_one({"id": id})
+    if not existing:
+        raise HTTPException(status_code=404, detail="Schema not found")
 
-    if update_fields:
-        update_fields["updated_at"] = datetime.utcnow()
+    # Merge: a payload field left as None keeps the stored value; a client-sent `id` is never used.
+    # update.dict() always has every key, so .get(key, default) alone would never fall back.
+    payload = update.dict()
+    merged = {
+        key: existing.get(key) if payload.get(key) is None else payload[key]
+        for key in ("name", "version", "content")
+    }
+    # Compute new hash-based id using SchemaDefinition logic
+    new_model = SchemaDefinition(**merged, id="ignored", updated_at=None)
+    new_id = new_model.id
+
+    # Build final doc to store
+    final_doc = {
+        "id": new_id,
+        "name": merged["name"],
+        "version": merged["version"],
+        "content": merged["content"],
+        "updated_at": datetime.utcnow(),
+    }
 
-    result = schemas_collection.update_one(
-        {"id": id}, {"$set": update_fields} if update_fields else {}
-    )
-    if result.matched_count == 0:
-        raise HTTPException(status_code=404, detail="Schema not found")
+    # Replace the existing document (matched by the previous id)
+    replace_result = schemas_collection.replace_one({"id": id}, final_doc)
+    if replace_result.matched_count == 0:
+        raise HTTPException(status_code=404, detail="Schema not found during update")
 
-    return {"message": "Schema updated"}
+    # If id changed, the caller now has to reference the new id
+    return {"message": "Schema updated", "id": new_id}
 
 
-@app.delete("/schemas/{id}", response_model=dict[str, str])
-async def delete_schema(id: str) -> dict[str, str]:
+@app.delete("/schemas/{id}", response_model=Dict[str, str])
+async def delete_schema(id: str) -> Dict[str, str]:
     """
     Delete schema by `id`.
     """
     if not isinstance(id, str) or not id.strip():
-        raise HTTPException(
-            status_code=400, detail="Invalid schema id (must be a non-empty string)"
-        )
+        raise HTTPException(status_code=400, detail="Invalid schema id (must be a non-empty string)")
 
     result = schemas_collection.delete_one({"id": id})
     if result.deleted_count == 0:
         raise HTTPException(status_code=404, detail="Schema not found")
-
     return {"message": "Schema deleted"}