Adjustment when calculating hash | Adjustment of the hash calculation… (#1837)

joaorura · web-flow · commit f2d1ce137238 · 2025-01-14T19:16:12.000+05:30
… method

When trying to load the saved models after adaptation, alerts like these
were always triggered:

Loaded prompt hash does not match the saved hash.
Loaded prompt hash does not match the saved hash.

Furthermore, in Python, the default hash() function may yield different
results for the same string across different sessions. To achieve
consistent hash values, I am now using the hashlib module, which provides
stable hashing algorithms, to calculate the hash of the prompt.
diff --git a/src/ragas/prompt/pydantic_prompt.py b/src/ragas/prompt/pydantic_prompt.py
@@ -4,6 +4,8 @@
 import json
 import logging
 import os
+import hashlib
+
 import typing as t
 
 from langchain_core.exceptions import OutputParserException
@@ -226,12 +228,7 @@ async def adapt(
         """
         Adapt the prompt to a new language.
         """
-
-        # set the original hash, this is used to
-        # identify the original prompt object when loading from file
-        if self.original_hash is None:
-            self.original_hash = hash(self)
-
+        
         strings = get_all_strings(self.examples)
         translated_strings = await translate_statements_prompt.generate(
             llm=llm,
@@ -257,6 +254,8 @@ async def adapt(
             )
             new_prompt.instruction = translated_instruction.statements[0]
 
+        new_prompt.original_hash = hash(new_prompt)
+
         return new_prompt
 
     def __repr__(self):
@@ -276,7 +275,7 @@ def __str__(self):
             ensure_ascii=False,
         )[1:-1]
         return f"{self.__class__.__name__}({json_str})"
-
+        
     def __hash__(self):
         # convert examples to json string for hashing
         examples = []
@@ -285,19 +284,23 @@ def __hash__(self):
             examples.append(
                 (input_model.model_dump_json(), output_model.model_dump_json())
             )
-
-        # not sure if input_model and output_model should be included
-        return hash(
-            (
-                self.name,
-                self.input_model,
-                self.output_model,
-                self.instruction,
-                *examples,
-                self.language,
-            )
-        )
-
+    
+        # create a SHA-256 hash object
+        hasher = hashlib.sha256()
+    
+        # update the hash object with the bytes of each attribute
+        hasher.update(self.name.encode('utf-8'))
+        hasher.update(self.input_model.__name__.encode('utf-8'))
+        hasher.update(self.output_model.__name__.encode('utf-8'))
+        hasher.update(self.instruction.encode('utf-8'))
+        for example in examples:
+            hasher.update(example[0].encode('utf-8'))
+            hasher.update(example[1].encode('utf-8'))
+        hasher.update(self.language.encode('utf-8'))
+    
+        # return the integer value of the hash
+        return int(hasher.hexdigest(), 16)
+    
     def __eq__(self, other):
         if not isinstance(other, PydanticPrompt):
             return False