-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPubchemAutoURLOriginal.py
More file actions
40 lines (30 loc) · 1.65 KB
/
PubchemAutoURLOriginal.py
File metadata and controls
40 lines (30 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import requests, pandas as pd, pubchempy as pcp, time, json
from rdkit import Chem
from tqdm import tqdm
# Script: run a PubChem substructure search for a SMILES query, fetch the
# InChI of every hit, filter the hits by an InChI text pattern, convert the
# survivors to SMILES with RDKit and write them to 'Library_filtered.csv'.

# Input search structure (SMILES string interpolated into the search URL).
search_query = 'CCO'

# Seconds between polls of the asynchronous search result.
POLL_INTERVAL_SECONDS = 10
# Pause before every per-compound lookup so requests are spaced out.
REQUEST_DELAY_SECONDS = 0.2
# Upper bound for any single HTTP call so the script cannot hang forever.
REQUEST_TIMEOUT_SECONDS = 60
# InChI strings matching this pattern are discarded: '/i', a literal '*',
# a '.' followed by digits, or a comma.
# NOTE(review): most InChI hydrogen layers contain commas, so the ',' branch
# looks very aggressive -- confirm it is intended.
UNWANTED_INCHI_PATTERN = r'/i|\*|\.\d+|,'


def _fetch_json(target_url):
    """Return the decoded JSON body of a GET request to *target_url*."""
    reply = requests.get(target_url, timeout=REQUEST_TIMEOUT_SECONDS)
    return json.loads(reply.content)


def _inchi_to_smiles(inchi):
    """Return the SMILES for *inchi*, or None when RDKit cannot parse it."""
    mol = Chem.MolFromInchi(inchi)
    # MolFromInchi signals failure by returning None rather than raising.
    return Chem.MolToSmiles(mol) if mol is not None else None


# Build the URL for the structure search query
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/{search_query}/JSON?MaxRecords=50000"
print("Structure search query URL:", url)

# Make the API request
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Get the Waiting.ListKey element from the JSON response
list_key = json.loads(response.content).get('Waiting', {}).get('ListKey')
print("ListKey:", list_key or "No compounds found matching the structure query.")

if list_key:
    # Build the URL to retrieve the search results
    url2 = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{list_key}/cids/JSON"

    # Wait for the search to complete. The last polled payload is kept, so
    # the finished result is not requested a second time.
    search_result = _fetch_json(url2)
    while 'Waiting' in search_result:
        print(f"Waiting for {POLL_INTERVAL_SECONDS} seconds...")
        time.sleep(POLL_INTERVAL_SECONDS)
        search_result = _fetch_json(url2)
    cid_elements = search_result.get('IdentifierList', {}).get('CID', [])

    # Retrieve the InChI of each compound, skipping compounds whose lookup
    # fails instead of aborting the whole run.
    properties = []
    for cid in tqdm(cid_elements, desc="Retrieving properties"):
        time.sleep(REQUEST_DELAY_SECONDS)
        try:
            inchi = pcp.Compound.from_cid(cid).inchi
        except (ValueError, pcp.PubChemPyError) as exc:
            # tqdm.write keeps the message from corrupting the progress bar.
            tqdm.write(f"Skipping CID {cid}: {exc}")
            continue
        properties.append({'CID': cid, 'InChI': inchi})

    # Convert to pandas dataframe and filter out unwanted compounds.
    # Explicit columns keep the frame usable even when nothing was retrieved.
    df = pd.DataFrame(properties, columns=['CID', 'InChI'])
    df = df.dropna(subset=['InChI'])
    unwanted = df['InChI'].str.contains(UNWANTED_INCHI_PATTERN, na=False)
    # .copy() so that adding the SMILES column does not write into a view.
    df = df[~unwanted].copy()

    # Convert InChI to SMILES and save the filtered dataframe to a new CSV
    # file; rows RDKit could not convert are dropped.
    df['SMILES'] = df['InChI'].apply(_inchi_to_smiles)
    df = df.dropna(subset=['SMILES'])
    df.to_csv('Library_filtered.csv', index=False)