Chore/gleanings any encoding (#1569)

AlonsoGuevara · web-flow · commit 5f9ad0d00303 · 2025-01-02T11:44:21.000-06:00
* Make claims and entities independent of encoding

* Semver

* Change semver release type
diff --git a/.semversioner/next-release/patch-20241230224307150194.json b/.semversioner/next-release/patch-20241230224307150194.json
@@ -0,0 +1,4 @@
+{
+  "type": "minor",
+  "description": "Make gleanings independent of encoding"
+}
diff --git a/graphrag/index/operations/extract_covariates/claim_extractor.py b/graphrag/index/operations/extract_covariates/claim_extractor.py
@@ -88,8 +88,8 @@ def __init__(
 
         # Construct the looping arguments
         encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL)
-        yes = f"{encoding.encode('YES')[0]}"
-        no = f"{encoding.encode('NO')[0]}"
+        yes = f"{encoding.encode('Y')[0]}"
+        no = f"{encoding.encode('N')[0]}"
         self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1}
 
     async def __call__(
@@ -195,7 +195,7 @@ async def _process_document(
                 history=response.history,
                 model_parameters=self._loop_args,
             )
-            if response.output.content != "YES":
+            if response.output.content != "Y":
                 break
 
         return self._parse_claim_tuples(results, prompt_args)
diff --git a/graphrag/index/operations/extract_entities/graph_extractor.py b/graphrag/index/operations/extract_entities/graph_extractor.py
@@ -92,8 +92,8 @@ def __init__(
 
         # Construct the looping arguments
         encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL)
-        yes = f"{encoding.encode('YES')[0]}"
-        no = f"{encoding.encode('NO')[0]}"
+        yes = f"{encoding.encode('Y')[0]}"
+        no = f"{encoding.encode('N')[0]}"
         self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1}
 
     async def __call__(
@@ -180,7 +180,7 @@ async def _process_document(
                 model_parameters=self._loop_args,
             )
 
-            if response.output.content != "YES":
+            if response.output.content != "Y":
                 break
 
         return results
diff --git a/graphrag/prompts/index/claim_extraction.py b/graphrag/prompts/index/claim_extraction.py
@@ -58,4 +58,4 @@
 
 
 CONTINUE_PROMPT = "MANY entities were missed in the last extraction.  Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities may have still been missed.  Answer YES {tuple_delimiter} NO if there are still entities that need to be added.\n"
+LOOP_PROMPT = "It appears some entities may have still been missed.  Answer Y or N if there are still entities that need to be added.\n"
diff --git a/graphrag/prompts/index/entity_extraction.py b/graphrag/prompts/index/entity_extraction.py
@@ -126,4 +126,4 @@
 Output:"""
 
 CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities and relationships may have still been missed.  Answer YES | NO if there are still entities or relationships that need to be added.\n"
+LOOP_PROMPT = "It appears some entities and relationships may have still been missed.  Answer Y or N if there are still entities or relationships that need to be added.\n"

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,4 @@
 +{
 +  "type": "minor",
 +  "description": "Make gleanings independent of encoding"
 +}
Original file line number	Diff line number	Diff line change
`@@ -58,4 +58,4 @@`
`58`	`58`
`59`	`59`
`60`	`60`	`CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below using the same format:\n"`
`61`		`-LOOP_PROMPT = "It appears some entities may have still been missed. Answer YES {tuple_delimiter} NO if there are still entities that need to be added.\n"`
	`61`	`+LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y or N if there are still entities that need to be added.\n"`