Skip to content

Commit b71317c

Browse files
committed
Update CLI clustering
1 parent bca9f1f commit b71317c

File tree

4 files changed

+31
-49
lines changed

4 files changed

+31
-49
lines changed

README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,27 +47,29 @@ gleams cluster --help
4747
GLEAMS provides the `gleams embed` command to convert MS/MS spectra in peak files to 32-dimensional embeddings. Example:
4848

4949
```
50-
gleams embed *.mzML --embed_name GLEAMS.embed
50+
gleams embed *.mzML --embed_name GLEAMS_embed
5151
```
5252

53-
This will read the MS/MS spectra from all matched mzML files and export the results to a two-dimensional NumPy array of dimension _n_ x 32 in file `GLEAMS.embed.npy`, with _n_ the number of MS/MS spectra read from the mzML files.
54-
Additionally, a tabular file `GLEAMS.embed.parquet` will be created containing corresponding metadata for the embedded spectra.
53+
This will read the MS/MS spectra from all matched mzML files and write their embeddings as a two-dimensional NumPy array of shape _n_ x 32 to the file `GLEAMS_embed.npy`, where _n_ is the number of MS/MS spectra read from the mzML files.
54+
Additionally, a tabular file `GLEAMS_embed.parquet` will be created containing corresponding metadata for the embedded spectra.
5555

5656
### Embedding clustering
5757

5858
After converting the MS/MS spectra to 32-dimensional embeddings, they can be clustered to group spectra with similar embeddings using the `gleams cluster` command. Example:
5959

6060
```
61-
gleams cluster --embed_name GLEAMS.embed --cluster_name GLEAMS.cluster --eps 0.05
61+
gleams cluster --embed_name GLEAMS_embed --cluster_name GLEAMS_cluster --distance_threshold 0.3
6262
```
6363

64-
This will perform DBSCAN clustering on the embeddings.
65-
The output will be written to the `GLEAMS.cluster.npy` NumPy file with cluster labels per embedding (`-1` indicates noise, minimum cluster size 2).
66-
Additionally, a tabular file `GLEAMS.cluster.parquet` will be created containing corresponding metadata for the clustered spectra.
67-
Note that although this `GLEAMS.cluster.parquet` metadata file contains information for the same spectra as the `GLEAMS.embed.parquet` metadata file, the order of the spectra (matching the clustering results) is different.
64+
This will perform hierarchical clustering on the embeddings with the given distance threshold.
65+
The output will be written to the `GLEAMS_cluster.npy` NumPy file with cluster labels per embedding (`-1` indicates noise, minimum cluster size 2).
66+
Additionally, a file `GLEAMS_cluster_medoids.npy` will be created containing the indices of the cluster representative spectra (medoids).
67+
68+
### Advanced usage
69+
70+
The full GLEAMS configuration, including the settings used to train the neural network, can be modified in the `gleams/config.py` file.
6871

6972
Contact
7073
-------
7174

7275
For more information you can visit the [official code website](https://github.com/bittremieux/GLEAMS) or send an email to <wbittremieux@health.ucsd.edu>.
73-

gleams/cluster/cluster.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def cluster(embeddings_filename: str, metadata_filename: str,
5252
'not recomputed')
5353
return
5454
clusters_dir = os.path.dirname(clusters_filename)
55-
if not os.path.exists(clusters_dir):
55+
if clusters_dir and not os.path.exists(clusters_dir):
5656
os.mkdir(clusters_dir)
5757
# Sort the metadata by increasing precursor m/z for easy subsetting.
5858
metadata = (pd.read_parquet(metadata_filename, columns=['charge', 'mz'])

gleams/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,5 @@
7979
num_probe = 1024
8080

8181
# Clustering.
82+
linkage = 'average'
8283
distance_threshold = 0.35

gleams/gleams.py

Lines changed: 18 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ def gleams():
3737
@gleams.command('embed')
3838
@click.argument('peak_in', nargs=-1, required=True)
3939
@click.option(
40-
'--embed_name', default='GLEAMS.embed',
40+
'--embed_name', default='GLEAMS_embed',
4141
help='The output will be written to the current working directory with the'
42-
' specified name (default: "GLEAMS.embed"). The output consists of a '
42+
' specified name (default: "GLEAMS_embed"). The output consists of a '
4343
'NumPy file containing the GLEAMS embeddings (extension ".npy") and '
4444
'a Parquet file containing the corresponding MS/MS spectra metadata '
4545
'(extension ".parquet").')
@@ -59,7 +59,7 @@ def cli_embed(peak_in: List[str], embed_name: str) -> None:
5959

6060
# Create temporary working directory.
6161
temp_dir = tempfile.mkdtemp()
62-
metadata_filename = os.path.join(temp_dir, 'metadata.parquet')
62+
metadata_filename = os.path.join(temp_dir, f'{embed_name}.parquet')
6363
embed_dir = os.path.join(temp_dir, 'embed')
6464
os.mkdir(embed_dir)
6565
# Create a metadata file with the file names.
@@ -104,26 +104,24 @@ def cli_embed(peak_in: List[str], embed_name: str) -> None:
104104

105105
@gleams.command('cluster')
106106
@click.option(
107-
'--embed_name', default='GLEAMS.embed',
108-
help='Name of the GLEAMS embeddings (default: "GLEAMS.embed"). Both a '
107+
'--embed_name', default='GLEAMS_embed',
108+
help='Name of the GLEAMS embeddings (default: "GLEAMS_embed"). Both a '
109109
'NumPy file and a Parquet file should be present in the current '
110110
'working directory.')
111111
@click.option(
112-
'--cluster_name', default='GLEAMS.cluster',
112+
'--cluster_name', default='GLEAMS_cluster',
113113
help='The output will be written to the current working directory with the'
114-
' specified name (default: "GLEAMS.cluster"). The output consists of '
114+
' specified name (default: "GLEAMS_cluster"). The output consists of '
115115
'a NumPy file containing the cluster labels (extension ".npy") and '
116-
'a Parquet file containing the corresponding MS/MS spectra metadata '
117-
'(extension ".parquet"). Attention: the spectrum order in this '
118-
'metadata file differs from the order in the embedding metadata '
119-
'file.')
116+
'a NumPy file containing indexes of the cluster medoid spectra '
117+
'(extension "_medoids.npy").')
120118
@click.option(
121-
'--eps', default=0.05,
122-
help='The maximum Euclidean distance between embeddings to be considered '
123-
'in each other\'s neighborhood during DBSCAN clustering '
124-
'(default: 0.05).'
119+
'--distance_threshold', default=0.3,
120+
help='The Euclidean distance threshold between embeddings to be merged '
121+
'during hierarchical clustering (average linkage) (default: 0.3).'
125122
)
126-
def cli_cluster(embed_name: str, cluster_name: str, eps: float) -> None:
123+
def cli_cluster(embed_name: str, cluster_name: str,
124+
distance_threshold: float) -> None:
127125
"""
128126
Cluster embeddings.
129127
@@ -132,26 +130,7 @@ def cli_cluster(embed_name: str, cluster_name: str, eps: float) -> None:
132130
"""
133131
logger.info('GLEAMS version %s', str(__version__))
134132

135-
# Create temporary working directory.
136-
temp_dir = tempfile.mkdtemp()
137-
dist_filename = os.path.join(temp_dir, f'{embed_name}.npz')
138-
# Compute the pairwise distance to a temporary file.
139-
cluster.compute_pairwise_distances(
140-
f'{embed_name}.npy', f'{embed_name}.parquet', dist_filename,
141-
config.precursor_tol_mass, config.precursor_tol_mode,
142-
config.mz_interval, config.num_neighbors, config.num_neighbors_ann,
143-
config.num_probe, config.batch_size_add, config.batch_size_dist,
144-
config.charges)
145-
# Move the metadata file to the working directory.
146-
shutil.move(os.path.join(temp_dir, f'{embed_name}.parquet'),
147-
f'{cluster_name}.parquet')
148-
# Remove previous result (if applicable).
149-
if os.path.isfile(f'{cluster_name}.npy'):
150-
os.remove(f'{cluster_name}.npy')
151-
# DBSCAN clustering.
152-
cluster.cluster(
153-
dist_filename, f'{cluster_name}.parquet', f'{cluster_name}.npy',
154-
eps, config.min_samples, config.precursor_tol_mass,
155-
config.precursor_tol_mode)
156-
# Clean up intermediate files.
157-
shutil.rmtree(temp_dir)
133+
cluster.cluster(f'{embed_name}.npy', f'{embed_name}.parquet',
134+
f'{cluster_name}.npy', config.precursor_tol_mass,
135+
config.precursor_tol_mode, config.linkage,
136+
distance_threshold, config.charges)

0 commit comments

Comments
 (0)