@@ -72,6 +72,9 @@ class Program:
7272 artifacts_json : Optional [str ] = None # JSON-serialized small artifacts
7373 artifact_dir : Optional [str ] = None # Path to large artifact files
7474
75+ # Embedding vector for novelty rejection sampling
76+ embedding : Optional [List [float ]] = None
77+
def to_dict(self) -> Dict[str, Any]:
    """Return a dictionary representation of this object, built with ``asdict``."""
    as_mapping = asdict(self)
    return as_mapping
@@ -183,6 +186,13 @@ def __init__(self, config: DatabaseConfig):
183186 }
184187
185188 logger .info (f"Initialized program database with { len (self .programs )} programs" )
189+
190+ # Novelty judge setup
191+ from openevolve .embedding import EmbeddingClient
192+ self .novelty_llm = config .novelty_llm
193+ self .embedding_client = EmbeddingClient (config .embedding_model ) if config .embedding_model else None
194+ self .similarity_threshold = config .similarity_threshold
195+
186196
187197 def add (
188198 self , program : Program , iteration : int = None , target_island : Optional [int ] = None
@@ -240,6 +250,11 @@ def add(
240250
241251 island_idx = island_idx % len (self .islands ) # Ensure valid island
242252
253+ # Novelty check before adding
254+ if not self ._is_novel (program .id , island_idx ):
255+ logger .debug (f"Program { program .id } failed in novelty check and won't be added in the island { island_idx } " )
256+ return program .id # Do not add non-novel program
257+
243258 # Add to island-specific feature map (replacing existing if better)
244259 feature_key = self ._feature_coords_to_key (feature_coords )
245260 island_feature_map = self .island_feature_maps [island_idx ]
@@ -931,6 +946,120 @@ def _feature_coords_to_key(self, coords: List[int]) -> str:
931946 """
932947 return "-" .join (str (c ) for c in coords )
933948
949+ def _cosine_similarity (self , vec1 : List [float ], vec2 : List [float ]) -> float :
950+ """
951+ Adapted from SakanaAI/ShinkaEvolve (Apache-2.0 License)
952+ Original source: https://github.com/SakanaAI/ShinkaEvolve/blob/main/shinka/database/dbase.py#L1452
953+
954+ Compute cosine similarity between two vectors.
955+ """
956+ if not vec1 or not vec2 or len (vec1 ) != len (vec2 ):
957+ return 0.0
958+
959+ arr1 = np .array (vec1 , dtype = np .float32 )
960+ arr2 = np .array (vec2 , dtype = np .float32 )
961+
962+ norm_a = np .linalg .norm (arr1 )
963+ norm_b = np .linalg .norm (arr2 )
964+
965+ if norm_a == 0 or norm_b == 0 :
966+ return 0.0
967+
968+ similarity = np .dot (arr1 , arr2 ) / (norm_a * norm_b )
969+
970+ return float (similarity )
971+
def _llm_judge_novelty(self, program: Program, similar_program: Program) -> bool:
    """
    Use LLM to judge if a program is novel compared to a similar existing program

    The verdict is the first standalone "NOVEL" / "NOT NOVEL" token found in
    the reply (case-insensitive). Words that merely contain it, such as
    "novelty", are ignored so that a reply like "Novelty assessment: NOT NOVEL"
    is read as not novel.

    Args:
        program: Proposed program
        similar_program: Most similar existing program to compare against

    Returns:
        False only when the LLM clearly answers "NOT NOVEL". True when it
        answers "NOVEL", and also (best effort) when no LLM is configured,
        the request fails, or the reply is empty or cannot be parsed.
    """
    import asyncio
    import concurrent.futures
    import re

    from openevolve.novelty_judge import NOVELTY_SYSTEM_MSG, NOVELTY_USER_MSG

    if self.novelty_llm is None:
        logger.warning("No novelty LLM configured, assuming program is novel")
        return True

    user_msg = NOVELTY_USER_MSG.format(
        language=program.language,
        existing_code=similar_program.code,
        proposed_code=program.code,
    )

    try:
        request = self.novelty_llm.generate_with_context(
            system_msg=NOVELTY_SYSTEM_MSG,
            messages=[{"role": "user", "content": user_msg}],
        )
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop in this thread: drive the coroutine directly.
            content = asyncio.run(request)
        else:
            # asyncio.run() refuses to start inside a running loop, so run the
            # request on a fresh loop in a short-lived worker thread instead.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                content = pool.submit(asyncio.run, request).result()
    except Exception as e:
        # Best effort: a failing judge must not block evolution.
        logger.error(f"Error in novelty LLM check: {e}")
        return True

    if not isinstance(content, str) or not content.strip():
        logger.warning("Novelty LLM returned empty response")
        return True

    content = content.strip()

    # Leftmost verdict wins; group 1 captures an optional "NOT" prefix
    # (also accepts "NOT_NOVEL" / "NOT-NOVEL").
    verdict = re.search(r"\b(NOT[\s_-]*)?NOVEL\b", content, re.IGNORECASE)
    if verdict is None:
        logger.warning(f"Unexpected novelty LLM response: {content}")
        return True  # Assume novel if we can't parse

    return verdict.group(1) is None
1021+
1022+ def _is_novel (self , program_id : int , island_idx : int ) -> bool :
1023+ """
1024+ Determine if a program is novel based on diversity to existing programs
1025+
1026+ Args:
1027+ program: Program to check
1028+ island_idx: Island index
1029+
1030+ Returns:
1031+ True if novel, False otherwise
1032+ """
1033+ if self .embedding_client is None or self .similarity_threshold <= 0.0 :
1034+ # Novelty checking disabled
1035+ return True
1036+
1037+ program = self .programs [program_id ]
1038+ embd = self .embedding_client .get_embedding (program .code )
1039+ self .programs [program_id ].embedding = embd
1040+
1041+ max_smlty = float ('-inf' )
1042+ max_smlty_pid = None
1043+
1044+ for pid in self .islands [island_idx ]:
1045+ other = self .programs [pid ]
1046+
1047+ if other .embedding is None :
1048+ logger .log ("Warning: Program %s has no embedding, skipping similarity check" , other .id )
1049+ continue
1050+
1051+ similarity = self ._cosine_similarity (embd , other .embedding )
1052+
1053+ if similarity >= max (max_smlty , self .similarity_threshold ):
1054+ max_smlty = similarity
1055+ max_smlty_pid = pid
1056+
1057+ if max_smlty_pid is None :
1058+ # No similar programs found, consider it novel
1059+ return True
1060+
1061+ return self ._llm_judge_novelty (program , self .programs [max_smlty_pid ])
1062+
9341063 def _is_better (self , program1 : Program , program2 : Program ) -> bool :
9351064 """
9361065 Determine if program1 has better FITNESS than program2
0 commit comments