@@ -1950,7 +1950,82 @@ def get_label(self, i):
19501950 folder = f"cpu0{ i } "
19511951 return folder
19521952
1953- def get_db_unique (self , db_name = None , prec = 3 , update_topology = True , key = 'ff_energy' ):
def get_db_unique(self, db_name=None, prec=3, key='ff_energy', max_N_atoms=64):
    """
    Write a new database holding only the unique structures of this one.

    Args:
        db_name (str, optional): Filename for the new database.
            If None, the current name is used with its extension
            replaced by '_unique.db'. An existing file of that name
            is deleted first.
        prec (int, optional): Number of decimals kept when rounding the
            density and energy values before comparison. Default is 3.
        key (str, optional): Energy attribute name to use for filtering.
            Rows that lack it (or store None) are skipped.
            Default is 'ff_energy'.
        max_N_atoms (int, optional): Largest structure size (as given by
            ``len`` of the pymatgen structure) for which the pymatgen
            matcher is run. Default is 64.

    Returns:
        int: Number of structures in the new database.

    Raises:
        ValueError: If ``db_name`` resolves to the database being read,
            since the output file is removed before it is rewritten.

    Note:
        Two structures are considered identical if they have:
        - Same density value (after rounding to ``prec``)
        - Same energy value (after rounding to ``prec``)
        - A pymatgen ``StructureMatcher`` match. When either structure
          has more than ``max_N_atoms`` sites the matcher is skipped
          and equal space group number and ``wps`` are required instead.

        When a pymatgen match is found, the structure with lower DOF is
        kept. In the large-structure branch the first structure seen is
        always kept (DOF is not compared there).
    """
    from pymatgen.analysis.structure_matcher import StructureMatcher

    matcher = StructureMatcher(stol=0.3, ltol=0.2, angle_tol=5)

    print(f"The {self.db_name:s} has {self.db.count():d} strucs")
    if db_name is None:
        # Strip the real extension instead of blindly cutting 3 characters.
        db_name = os.path.splitext(self.db_name)[0] + "_unique.db"
    if os.path.abspath(db_name) == os.path.abspath(self.db_name):
        # The target is deleted below; never let that be the source file.
        raise ValueError(
            f"db_name ({db_name}) must differ from the source database"
        )
    if os.path.exists(db_name):
        os.remove(db_name)

    # Each kept entry: (row id, dof, density, energy, spg, wps, pmg structure)
    kept = []
    for row in self.db.select():
        value = getattr(row, key, None)
        if value is None:
            continue

        dof = row.dof
        den = round(row.density, prec)
        energy = round(value, prec)
        spg, wps = row.space_group_number, row.wps
        pmg = ase2pymatgen(row.toatoms())
        candidate = (row.id, dof, den, energy, spg, wps, pmg)

        is_unique = True
        for idx, (_, _dof, _den, _energy, _spg, _wps, _pmg) in enumerate(kept):
            # Cheap scalar filter before any structural comparison.
            if den != _den or energy != _energy:
                continue

            if len(_pmg) > max_N_atoms or len(pmg) > max_N_atoms:
                # For large structures, skip pymatgen match to save time
                # and compare the symmetry labels instead.
                if spg == _spg and wps == _wps:
                    is_unique = False
                    print("Duplicate", row.id, den, energy)
                    break
            elif matcher.fit(pmg, _pmg):
                is_unique = False
                if dof < _dof:
                    print("Updating", row.id, den, energy)
                    # Drop the stored entry by position (no equality
                    # search) and queue the lower-DOF one at the end.
                    # The loop is left right after, so mutating
                    # ``kept`` here is safe.
                    del kept[idx]
                    kept.append(candidate)
                else:
                    print("Duplicate", row.id, den, energy)
                break

        if is_unique:
            print("Adding", row.id, den, energy)
            kept.append(candidate)

    with connect(db_name, serial=True) as db:
        for row_id in [entry[0] for entry in kept]:
            row = self.db.get(row_id)
            # Copy over only the tracked key-value pairs present on the row.
            kvp = {name: getattr(row, name) for name in self.keys if hasattr(row, name)}
            db.write(row.toatoms(), key_value_pairs=kvp)

    # As in the original, the count is read after the ``with`` block closes.
    n_unique = db.count()
    print(f"Created {db_name:s} with {n_unique:d} strucs")
    return n_unique
2027+
2028+ def get_db_unique_topology (self , db_name = None , prec = 3 , update_topology = True , key = 'ff_energy' ):
19542029 """
19552030 Get a database file containing only unique structures based on topology and energy.
19562031
0 commit comments