-
Notifications
You must be signed in to change notification settings.
Fork 2
Reformat OLGA, make GLIPH2 optional #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
b976f9e
ca95489
0eeca80
72f9c9d
1763739
980c6c1
78ac24f
40cf5ad
9ade10a
b83a497
abefc74
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| process ANNOTATE_CONCATENATE { | ||
| label 'process_low' | ||
|
|
||
| input: | ||
| path samplesheet_utf8 | ||
| path all_sample_files | ||
|
|
||
| output: | ||
| path "concatenated_cdr3.tsv", emit: concat_cdr3 | ||
|
|
||
| script: | ||
| """ | ||
| # Concatenate input Adaptive files and process metadata | ||
| compare_concatenate.py $samplesheet_utf8 | ||
dltamayo marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| """ | ||
| } | ||
|
|
||
| process ANNOTATE_SORT_CDR3 { | ||
| label 'process_medium' | ||
|
|
||
| input: | ||
| path concat_cdr3 | ||
|
|
||
| output: | ||
| path 'concatenated_cdr3_sorted.tsv', emit: concat_cdr3_sorted | ||
|
|
||
| script: | ||
| """ | ||
| head -n 1 ${concat_cdr3} > concatenated_cdr3_sorted.tsv | ||
|
|
||
| tail -n +2 ${concat_cdr3} \ | ||
| | LC_ALL=C sort \ | ||
| -t \$'\t' \ | ||
| -k1,1 -k2,5 \ | ||
| --parallel=${task.cpus} \ | ||
| -S 50% \ | ||
| >> concatenated_cdr3_sorted.tsv | ||
| """ | ||
| } | ||
|
|
||
| process ANNOTATE_DEDUPLICATE_CDR3_TRBV { | ||
| label 'process_low' | ||
|
|
||
| input: | ||
| path concat_cdr3 | ||
|
|
||
| output: | ||
| path 'unique_cdr3_trbv.tsv', emit: unique_cdr3_trbv | ||
| path 'unique_cdr3_trbv_with_vcall.tsv', emit: unique_cdr3_trbv_with_vcall | ||
|
|
||
| script: | ||
| """ | ||
| tail -n +2 ${concat_cdr3} \ | ||
| | awk -F'\t' '{print toupper(\$1) "\t" toupper(\$2)}' \ | ||
| | LC_ALL=C sort -u \ | ||
| > unique_cdr3_trbv.tsv | ||
|
|
||
| # additional file with blank TRBV calls removed for GIANA | ||
| awk -F'\t' 'NF>=2 && \$2 ~ /^TRBV/' unique_cdr3_trbv.tsv > unique_cdr3_trbv_with_vcall.tsv | ||
| """ | ||
| } | ||
|
|
||
| process ANNOTATE_DEDUPLICATE_CDR3 { | ||
| label 'process_single' | ||
|
|
||
| input: | ||
| path unique_cdr3_trbv | ||
|
|
||
| output: | ||
| path 'unique_cdr3.txt', emit: unique_cdr3 | ||
|
|
||
| script: | ||
| """ | ||
| cut -f1 ${unique_cdr3_trbv} \ | ||
| | LC_ALL=C sort -u \ | ||
| > unique_cdr3.txt | ||
| """ | ||
| } | ||
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,7 @@ process TCRSHARING_CALC { | |
| path concat_cdr3 | ||
|
|
||
| output: | ||
| path "cdr3_sharing_pgen.tsv", emit: "shared_cdr3" | ||
| path "cdr3_sharing.tsv", emit: "shared_cdr3" | ||
| path "sample_mapping.tsv", emit: "sample_mapping" | ||
|
|
||
| script: | ||
|
|
@@ -18,6 +18,9 @@ process TCRSHARING_CALC { | |
| # Load data | ||
| df = pd.read_csv("${concat_cdr3}", sep="\t") | ||
|
|
||
| # Remove rows where pgen = 0 | ||
| df = df[df['pgen'] != 0] | ||
|
|
||
| # Map sample to integer codes | ||
| df['sample'] = df['sample'].astype('category') | ||
| df['sample_id'] = df['sample'].cat.codes + 1 | ||
|
|
@@ -31,9 +34,12 @@ process TCRSHARING_CALC { | |
|
|
||
| # Get unique sample_ids per CDR3b — vectorized | ||
| grouped = ( | ||
| df.groupby('CDR3b')['sample_id'] | ||
| .unique() # UNIQUE — fast & vectorized | ||
| .apply(np.sort) # SORT — vectorized | ||
| df.groupby('CDR3b') | ||
| .agg( | ||
| sample_id=('sample_id', 'unique'), | ||
| pgen=('pgen', 'first'), | ||
| log10_pgen=('log10_pgen', 'first') | ||
| ) | ||
| .reset_index() | ||
| ) | ||
|
|
||
|
|
@@ -44,29 +50,12 @@ process TCRSHARING_CALC { | |
| ) | ||
|
|
||
| # Drop raw list | ||
| final_df = grouped[['CDR3b', 'total_samples', 'samples_present']] | ||
| final_df = grouped[['CDR3b', 'pgen', 'log10_pgen', 'total_samples', 'samples_present']] | ||
| final_df = final_df.sort_values(by="total_samples", ascending=False) | ||
|
|
||
| # Export final list | ||
| final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False) | ||
| EOF | ||
|
Comment on lines 52 to 58
|
||
|
|
||
|
|
||
| olga-compute_pgen --humanTRB -i cdr3_sharing.tsv -o pgen_sharing.tsv | ||
|
|
||
|
|
||
| python - <<EOF | ||
| import pandas as pd | ||
|
|
||
| # Load TSVs for shared cdr3s and corresponding pgen values | ||
| left_df = pd.read_csv('pgen_sharing.tsv', sep='\t', header=None, usecols=[0, 1], names=['CDR3b', 'pgen']) | ||
| right_df = pd.read_csv('cdr3_sharing.tsv', sep='\t') | ||
|
|
||
| # Drop rows where pgen == 0 and merge | ||
| left_df = left_df[left_df['pgen'] != 0] | ||
| merged_df = pd.merge(left_df, right_df, on='CDR3b', how='left') | ||
| merged_df.to_csv('cdr3_sharing_pgen.tsv', sep='\t', index=False) | ||
| EOF | ||
| """ | ||
| } | ||
|
|
||
|
|
@@ -86,7 +75,7 @@ process TCRSHARING_HISTOGRAM { | |
| import pandas as pd | ||
| import matplotlib.pyplot as plt | ||
|
|
||
| merged_df = pd.read_csv('$shared_cdr3', sep='\t') | ||
| merged_df = pd.read_csv('${shared_cdr3}', sep='\t') | ||
|
|
||
| # Plot histogram | ||
| sharing = merged_df['total_samples'].values | ||
|
|
@@ -127,10 +116,9 @@ process TCRSHARING_SCATTERPLOT { | |
| import matplotlib.pyplot as plt | ||
| from matplotlib.ticker import MaxNLocator | ||
|
|
||
| merged_df = pd.read_csv('$shared_cdr3', sep='\t') | ||
| merged_df = pd.read_csv('${shared_cdr3}', sep='\t') | ||
|
|
||
| # Create scatter plot with log-transform pgen | ||
| merged_df["log10_pgen"] = np.log10(merged_df["pgen"]) | ||
| plt.figure(figsize=(8, 6)) | ||
| plt.grid(True) | ||
| plt.scatter(merged_df["log10_pgen"], merged_df["total_samples"], c='blue', alpha=0.7) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.