Skip to content

Commit 0166d27

Browse files
committed
Merge commit '19ffa7a2f137f2781ffca64bf39b31f5f99f199d'
2 parents f018955 + 19ffa7a commit 0166d27

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+3073
-1564
lines changed

lib/foldseek/lib/foldcomp/CMakeLists.txt

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Author: Milot Mirdita (milot@mirdita.de), Hyunbin Kim (khb7840@gmail.com)
2-
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
2+
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
33
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
44
set(CMAKE_CXX_STANDARD 17)
55
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -14,6 +14,11 @@ option(GCS_SUPPORT "Enable Google Cloud Storage support" OFF)
1414
include_directories(src)
1515
add_subdirectory(src)
1616

17+
# For windows, include lib/windows
18+
if(WIN32)
19+
include_directories(lib/windows)
20+
endif(WIN32)
21+
1722
if(HAVE_SANITIZER)
1823
include(FindUBSan)
1924
include(FindASan)
@@ -26,6 +31,11 @@ if(BUILD_LIBRARY)
2631
${foldcomp_header_files}
2732
${foldcomp_source_files})
2833
elseif(BUILD_PYTHON)
34+
if(MSVC)
35+
install(FILES lib/windows/dirent.h DESTINATION include)
36+
endif(MSVC)
37+
find_package(PythonInterp)
38+
find_package(PythonLibs) # Trying to fix cibuildwheel ubuntu error
2939
find_package(PythonExtensions REQUIRED)
3040
add_library(foldcomp MODULE
3141
${foldcomp_header_files}
@@ -45,26 +55,34 @@ else()
4555
target_compile_definitions(foldcomp PUBLIC FOLDCOMP_EXECUTABLE)
4656
# For debugging
4757
# target_compile_definitions(foldcomp PUBLIC _GLIBCXX_DEBUG=1 _LIBCPP_DEBUG=1)
48-
49-
find_package(OpenMP REQUIRED)
50-
if(OPENMP_CXX_FOUND)
51-
if((CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))
52-
target_link_libraries(foldcomp PUBLIC OpenMP::OpenMP_CXX)
53-
else()
54-
target_link_libraries(foldcomp PRIVATE "${OpenMP_CXX_FLAGS}")
55-
target_compile_options(foldcomp PRIVATE "${OpenMP_CXX_FLAGS}")
58+
59+
if(NOT EMSCRIPTEN)
60+
# For local compilation, openmp and zlib are required
61+
find_package(OpenMP REQUIRED)
62+
if(OPENMP_CXX_FOUND)
63+
if((CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))
64+
target_link_libraries(foldcomp PUBLIC OpenMP::OpenMP_CXX)
65+
else()
66+
target_link_libraries(foldcomp PRIVATE "${OpenMP_CXX_FLAGS}")
67+
target_compile_options(foldcomp PRIVATE "${OpenMP_CXX_FLAGS}")
68+
endif()
5669
target_compile_definitions(foldcomp PUBLIC OPENMP)
5770
endif()
71+
find_package(ZLIB REQUIRED)
72+
target_link_libraries(foldcomp PUBLIC ZLIB::ZLIB)
73+
else()
74+
# For webassembly, not using openmp. zlib is added to compile flags
75+
set_target_properties(
76+
foldcomp
77+
PROPERTIES
78+
COMPILE_FLAGS -sUSE_ZLIB=1
79+
LINK_FLAGS "-sUSE_ZLIB=1 -sEXPORTED_RUNTIME_METHODS=callMain,FS -sINVOKE_RUN=0 -sFILESYSTEM=1 -sALLOW_MEMORY_GROWTH=1 -sTOTAL_MEMORY=256MB -sENVIRONMENT=web -sMODULARIZE=1 -s EXPORT_ES6=1 -sEXPORT_NAME=createFoldcomp -sSINGLE_FILE=0 -sASSERTIONS=0")
5880
endif()
5981

60-
find_package(ZLIB REQUIRED)
61-
target_link_libraries(foldcomp PUBLIC ZLIB::ZLIB)
62-
6382
include_directories(lib/gemmi)
6483

6584
include_directories(lib/microtar)
6685
add_subdirectory(lib/microtar)
67-
6886
target_link_libraries(foldcomp PUBLIC microtar)
6987

7088
if(GCS_SUPPORT)

lib/foldseek/lib/foldcomp/README.md

Lines changed: 104 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# Foldcomp
2+
23
<p align="center">
34
<img src="https://raw.githubusercontent.com/steineggerlab/foldcomp/master/.github/img/foldcomp_strong_marv.png" max-height="300px" height="300" display="block" margin-left="auto" margin-right="auto" display="block"/>
45
</p>
5-
Foldcomp compresses protein structures with torsion angles effectively. It compresses the backbone atoms to 8 bytes and the side chain to additionally 4-5 byes per residue, thus an averaged-sized protein of 350 residues requires ~6kb.
6+
Foldcomp compresses protein structures effectively using torsion angles. It compresses the backbone atoms to 8 bytes and the side chain to an additional 4-5 bytes per residue, so an average-sized protein of 350 residues requires ~6 kB.
67

78
Foldcomp's efficient compressed format stores protein structures requiring only 13 bytes per residue, which reduces the required storage space by an order of magnitude compared to saving 3D coordinates directly. We achieve this reduction by encoding the torsion angles of the backbone as well as the side-chain angles in a compact binary file format (FCZ).
89

@@ -16,6 +17,18 @@ Foldcomp efficient compressed format stores protein structures requiring only 13
1617
</picture>
1718
</p>
1819

20+
## Publications
21+
22+
[Hyunbin Kim, Milot Mirdita, Martin Steinegger, Foldcomp: a library and format for compressing and indexing large protein structure sets, Bioinformatics, 2023, btad153](https://doi.org/10.1093/bioinformatics/btad153)
23+
24+
## Presentation Video
25+
26+
We presented Foldcomp at ISMB/ECCB2023. Check it out:
27+
28+
<a href="https://www.youtube.com/watch?v=aFtqH0VqE7w" target="_blank">
29+
<img src="https://raw.githubusercontent.com/steineggerlab/foldcomp/master/.github/img/ismb_thumbnail.png" alt="Foldcomp presented at ISMB/ECCB2023" max-width="720px" max-height="400px" width="auto" height="auto">
30+
</a>
31+
1932
## Usage
2033

2134
### Installing Foldcomp
@@ -32,40 +45,89 @@ wget https://mmseqs.com/foldcomp/foldcomp-linux-arm64.tar.gz
3245
3346
# Download binary for macOS
3447
wget https://mmseqs.com/foldcomp/foldcomp-macos-universal.tar.gz
48+
49+
# Download binary for Windows (x64)
50+
wget https://mmseqs.com/foldcomp/foldcomp-windows-x64.zip
3551
```
3652

3753
### Executable
3854
```
3955
# Compression
40-
foldcomp compress <pdb_file|cif_file> [<fcz_file>]
41-
foldcomp compress [-t number] <pdb_dir|cif_dir> [<fcz_dir>]
56+
foldcomp compress <pdb|cif> [<fcz>]
57+
foldcomp compress [-t number] <dir|tar(.gz)> [<dir|tar|db>]
4258
4359
# Decompression
44-
foldcomp decompress <fcz_file> [<pdb_file>]
45-
foldcomp decompress [-t number] <fcz_dir> [<pdb_dir>]
60+
foldcomp decompress <fcz|tar> [<pdb>]
61+
foldcomp decompress [-t number] <dir|tar(.gz)|db> [<dir|tar>]
62+
63+
# Decompressing a subset of Foldcomp database
64+
foldcomp decompress [-t number] --id-list <idlist.txt> <db> [<dir|tar>]
4665
4766
# Extraction of sequence or pLDDT
48-
foldcomp extract [--plddt|--fasta] <fcz_file> [<txt_file|fasta_file>]
49-
foldcomp extract [--plddt|--fasta] [-t number] <fcz_dir|tar> [<output_dir>]
67+
foldcomp extract [--plddt|--amino-acid] <fcz> [<fasta>]
68+
foldcomp extract [--plddt|--amino-acid] [-t number] <dir|tar(.gz)|db> [<fasta_out>]
5069
5170
# Check
52-
foldcomp check <fcz_file>
53-
foldcomp check [-t number] <fcz_dir|tar>
71+
foldcomp check <fcz>
72+
foldcomp check [-t number] <dir|tar(.gz)|db>
5473
5574
# RMSD
56-
foldcomp rmsd <pdb1|cif1> <pdb2|cif2>
75+
foldcomp rmsd <pdb|cif> <pdb|cif>
5776
5877
# Options
59-
-h, --help print this help message
60-
-t, --threads threads for (de)compression of folders/tar files [default=1]
61-
-a, --alt use alternative atom order [default=false]
62-
-b, --break interval size to save absolute atom coordinates [default=25]
63-
-z, --tar save as tar file [default=false]
64-
--plddt extract pLDDT score (only for extraction mode)
65-
--fasta extract amino acid sequence (only for extraction mode)
66-
--no-merge do not merge output files (only for extraction mode)
78+
-h, --help print this help message
79+
-v, --version print version
80+
-t, --threads threads for (de)compression of folders/tar files [default=1]
81+
-r, --recursive recursively look for files in directory [default=0]
82+
-f, --file input is a list of files [default=0]
83+
-a, --alt use alternative atom order [default=false]
84+
-b, --break interval size to save absolute atom coordinates [default=25]
85+
-z, --tar save as tar file [default=false]
86+
-d, --db save as database [default=false]
87+
-y, --overwrite overwrite existing files [default=false]
88+
-l, --id-list a file of id list to be processed (only for database input)
89+
--skip-discontinuous skip PDB with discontinuous residues (only batch compression)
90+
--check check FCZ before and skip entries with error (only for batch decompression)
91+
--plddt extract pLDDT score (only for extraction mode)
92+
-p, --plddt-digits extract pLDDT score with specified number of digits (only for extraction mode)
93+
- 1: single digit (fasta-like format), 2: 2-digit(00-99; tsv), 3: 3-digit, 4: 4-digit (max)
94+
--fasta, --amino-acid extract amino acid sequence (only for extraction mode)
95+
--no-merge do not merge output files (only for extraction mode)
96+
--use-title use TITLE as the output file name (only for extraction mode)
97+
--time measure time for compression/decompression
98+
```
99+
100+
### Downloading Databases
101+
We offer prebuilt databases for multiple large sets of predicted protein structures and a Python helper to download the database files.
102+
103+
You can download the AlphaFoldDB Swiss-Prot with the following command:
104+
```
105+
python -c "import foldcomp; foldcomp.setup('afdb_swissprot_v4')"
67106
```
68107

108+
Currently we offer the following databases:
109+
* [ESMAtlas](https://esmatlas.com/) full (v0 + v2023_02): `foldcomp.setup('esmatlas')`
110+
* ESMAtlas v2023_02: `foldcomp.setup('esmatlas_v2023_02')`
111+
* ESMAtlas high-quality: `foldcomp.setup('highquality_clust30')`
112+
113+
**Note:** We skipped all structures with discontinuous residues or other issues.
114+
Here is a list of the affected predictions:
115+
[full](https://foldcomp.steineggerlab.workers.dev/esmatlas.err.log) (~21M),
116+
[high-quality](https://foldcomp.steineggerlab.workers.dev/highquality_clust30_issues.txt) (~100k),
117+
[v2023_02](https://foldcomp.steineggerlab.workers.dev/esmatlas_v2023_02.err.log) (~10k)
118+
119+
* [AlphaFoldDB Uniprot](https://alphafold.ebi.ac.uk/): `foldcomp.setup('afdb_uniprot_v4')`
120+
* AlphaFoldDB Swiss-Prot: `foldcomp.setup('afdb_swissprot_v4')`
121+
* AlphaFoldDB Model Organisms: `foldcomp.setup('h_sapiens')`
122+
* `a_thaliana`, `c_albicans`, `c_elegans`, `d_discoideum`, `d_melanogaster`, `d_rerio`, `e_coli`, `g_max`,
123+
`h_sapiens`, `m_jannaschii`, `m_musculus`, `o_sativa`, `r_norvegicus`, `s_cerevisiae`, `s_pombe`, `z_mays`
124+
* [AlphaFoldDB Cluster Representatives](https://afdb-cluster.steineggerlab.workers.dev/): `foldcomp.setup('afdb_rep_v4')`
125+
* AlphaFoldDB Cluster Representatives (Dark Clusters): `foldcomp.setup('afdb_rep_dark_v4')`
126+
127+
If you want other prebuilt datasets, please get in touch with us through our [GitHub issues](https://github.com/steineggerlab/foldcomp/issues).
128+
129+
If you have issues downloading the databases you can navigate directly to our [download server](https://foldcomp.steineggerlab.workers.dev/) and download the required files. E.g. `afdb_uniprot_v4`, `afdb_uniprot_v4.index`, `afdb_uniprot_v4.dbtype`, `afdb_uniprot_v4.lookup`, and optionally `afdb_uniprot_v4.source`.
130+
69131
### Python API
70132

71133
You can find more in-depth examples of using Foldcomp's Python interface in the example notebook:
@@ -85,6 +147,17 @@ with open("test/compressed.fcz", "rb") as fcz:
85147
with open(name, "w") as pdb_file:
86148
pdb_file.write(pdb)
87149

150+
# Get data as dictionary
151+
data_dict = foldcomp.get_data(fcz_binary) # foldcomp.get_data(pdb) also works
152+
# Keys: phi, psi, omega, torsion_angles, residues, bond_angles, coordinates
153+
data_dict["phi"] # phi angles (C-N-CA-C)
154+
data_dict["psi"] # psi angles (N-CA-C-N)
155+
data_dict["omega"] # omega angles (CA-C-N-CA)
156+
data_dict["torsion_angles"] # torsion angles of the backbone as list (phi + psi + omega)
157+
data_dict["bond_angles"] # bond angles of the backbone as list
158+
data_dict["residues"] # amino acid residues as string
159+
data_dict["coordinates"] # coordinates of the backbone as list
160+
88161
# 02. Iterate over a database of FCZ files
89162
# Open a foldcomp database
90163
ids = ["d1asha_", "d1it2a_"]
@@ -96,6 +169,19 @@ with foldcomp.open("test/example_db", ids=ids) as db:
96169
pdb_file.write(pdb)
97170
```
98171

172+
## Subsetting Databases
173+
If you are dealing with millions of entries, we recommend using the `createsubdb` command
174+
of [mmseqs2](https://mmseqs.com) to subset databases.
175+
The following commands can be used to subset the AlphaFold Uniprot DB with given IDs.
176+
```sh
177+
# mmseqs createsubdb --subdb-mode 0 --id-mode 1 id_list.txt input_foldcomp_db output_foldcomp_db
178+
mmseqs createsubdb --subdb-mode 0 --id-mode 1 id_list.txt afdb_uniprot_v4 afdb_subset
179+
```
180+
Please note that the IDs in afdb_uniprot_v4 are in the format `AF-A0A5S3Y9Q7-F1-model_v4`.
181+
182+
## Community Contributions
183+
* [PyMOL Plugin for reading Foldcomp files](https://github.com/yakomaxa/load_fcz_PyMOL) by @yakomaxa
184+
99185
## Contributor
100186
<a href="https://github.com/steineggerlab/foldcomp/graphs/contributors">
101187
<img src="https://contributors-img.firebaseapp.com/image?repo=steineggerlab/foldcomp" />

lib/foldseek/lib/foldcomp/src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ set(foldcomp_header_files
44
src/bond_info.h
55
src/execution_timer.h
66
src/database_reader.h
7+
src/database_writer.h
78
src/discretizer.h
89
src/float3d.h
910
src/foldcomp.h
1011
src/nerf.h
1112
src/sidechain.h
13+
src/tcbspan.h
1214
src/torsion_angle.h
1315
src/utility.h
1416
PARENT_SCOPE
@@ -18,6 +20,7 @@ set(foldcomp_source_files
1820
src/amino_acid.cpp
1921
src/atom_coordinate.cpp
2022
src/database_reader.cpp
23+
src/database_writer.cpp
2124
src/discretizer.cpp
2225
src/foldcomp.cpp
2326
src/nerf.cpp

0 commit comments

Comments
 (0)