Skip to content

Commit b54645f

Browse files
committed
update db_unique to get rid of topology checking
1 parent 9859036 commit b54645f

File tree

1 file changed

+76
-1
lines changed

1 file changed

+76
-1
lines changed

pyxtal/db.py

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1950,7 +1950,82 @@ def get_label(self, i):
19501950
folder = f"cpu0{i}"
19511951
return folder
19521952

1953-
def get_db_unique(self, db_name=None, prec=3, update_topology=True, key='ff_energy'):
1953+
def get_db_unique(self, db_name=None, prec=3, key='ff_energy', max_N_atoms=64):
    """
    Get a database file containing only unique structures, judged by
    density, energy and a structure comparison (no topology check).

    Args:
        db_name (str, optional): Filename for the new database.
            If None, the original name with its extension replaced by
            '_unique.db' is used.
        prec (int, optional): Precision for rounding density and energy
            values. Default is 3.
        key (str, optional): Energy attribute name to use for filtering.
            Rows where this attribute is missing or None are skipped.
            Default is 'ff_energy'.
        max_N_atoms (int, optional): Maximum structure size for the
            pymatgen match. If either structure is larger, the pair is
            compared by space group number and wps instead. Default is 64.

    Returns:
        int: Number of unique structures in the new database.

    Raises:
        ValueError: If ``db_name`` refers to the source database itself,
            since the output file is removed before being rewritten.

    Note:
        Two structures are considered identical if they have:
        - Same density value (within precision)
        - Same energy value (within precision)
        - A pymatgen StructureMatcher fit, or, when either structure
          exceeds ``max_N_atoms``, the same space group number and wps

        When a pymatgen-matched duplicate is found, the structure with
        lower DOF is kept. For the space group / wps comparison the first
        structure encountered is kept (presumably identical space group
        and wps imply identical DOF -- TODO confirm).
    """
    from pymatgen.analysis.structure_matcher import StructureMatcher
    matcher = StructureMatcher(stol=0.3, ltol=0.2, angle_tol=5)

    print(f"The {self.db_name:s} has {self.db.count():d} strucs")
    if db_name is None:
        # Strip the real extension instead of blindly chopping 3 characters
        db_name = os.path.splitext(self.db_name)[0] + "_unique.db"
    if os.path.abspath(db_name) == os.path.abspath(self.db_name):
        # The target is deleted below; never delete the database being read
        raise ValueError("db_name must differ from the source database")
    if os.path.exists(db_name):
        os.remove(db_name)

    unique_entries = []
    for row in self.db.select():
        # Only rows carrying a usable energy value take part in the filtering
        if getattr(row, key, None) is None:
            continue

        dof = row.dof
        den = round(row.density, prec)
        energy = round(getattr(row, key), prec)
        spg, wps = row.space_group_number, row.wps
        pmg = ase2pymatgen(row.toatoms())
        entry = (row.id, dof, den, energy, spg, wps, pmg)

        is_unique = True
        for idx, (_, old_dof, old_den, old_energy, old_spg, old_wps, old_pmg) in enumerate(unique_entries):
            # Cheap pre-filter: density and energy must agree after rounding
            if den != old_den or energy != old_energy:
                continue

            if len(old_pmg) > max_N_atoms or len(pmg) > max_N_atoms:
                # For large structures, skip pymatgen match to save time
                if spg == old_spg and wps == old_wps:
                    is_unique = False
                    print("Duplicate", row.id, den, energy)
                    break
            elif matcher.fit(pmg, old_pmg):
                is_unique = False
                if dof < old_dof:
                    # The new row has fewer DOF: it replaces the stored one
                    print("Updating", row.id, den, energy)
                    del unique_entries[idx]
                    unique_entries.append(entry)
                else:
                    print("Duplicate", row.id, den, energy)
                # The list may have been modified; stop scanning it
                break

        if is_unique:
            print("Adding", row.id, den, energy)
            unique_entries.append(entry)

    with connect(db_name, serial=True) as db:
        for row_id, *_ in unique_entries:
            row = self.db.get(row_id)
            # Copy over only the key-value pairs this row actually carries
            kvp = {name: getattr(row, name) for name in self.keys if hasattr(row, name)}
            db.write(row.toatoms(), key_value_pairs=kvp)

    n_unique = db.count()
    print(f"Created {db_name:s} with {n_unique:d} strucs")
    return n_unique
2027+
2028+
def get_db_unique_topology(self, db_name=None, prec=3, update_topology=True, key='ff_energy'):
19542029
"""
19552030
Get a database file containing only unique structures based on topology and energy.
19562031

0 commit comments

Comments
 (0)