scTools-py/scTools_dew.py at master · jzussman/scTools-py · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000

import pickle
import os
import sys
import scipy.sparse
import scipy.stats
import sklearn
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
import igraph as ig


# LOADING DATA

def load_alevin(library_ids, input_path):
    '''
    Imports data files generated by Salmon-Alevin, when run with the --dumpMtx option.
    Mirrors the functionality of load_inDrops.

    This function will expect files at the following locations:
    /input_path/library_id/alevin/quants_mat.mtx.gz
    /input_path/library_id/alevin/quants_mat_rows.txt
    /input_path/library_id/alevin/quants_mat_cols.txt
    where 'library_ids' contains one or more Alevin output folders located at 'input_path'.

    Returns a dictionary D with one ScanPy AnnData object per library:
    D[library_id] = AnnData object
    Gene names are stored in adata.var_names
    Library-prefixed cell barcodes are stored in adata.obs['unique_cell_id']
    The library name is stored in adata.obs['library_id'] and adata.uns['library_id']
    '''

    # Create a dictionary to hold data
    D = {}

    # Load counts data, metadata, & convert to AnnData objects
    for s in library_ids:
        alevin_dir = os.path.join(input_path, s, 'alevin')

        # Load counts, gene names into AnnData structure
        adata = sc.read_mtx(os.path.join(alevin_dir, 'quants_mat.mtx.gz'), dtype='float32')
        # ndmin=1 keeps a one-line file from collapsing into a 0-d array
        adata.var_names = np.loadtxt(
            os.path.join(alevin_dir, 'quants_mat_cols.txt'), dtype='str', ndmin=1)
        adata.obs['library_id'] = s   # scalar is broadcast to every cell
        adata.uns['library_id'] = s

        # Load cell barcodes
        cell_bcds = np.loadtxt(
            os.path.join(alevin_dir, 'quants_mat_rows.txt'), dtype='str', ndmin=1)

        # Prepend library name to each cell barcode to create unique cell IDs
        adata.obs['unique_cell_id'] = [s + '_' + bcd for bcd in cell_bcds]

        D[s] = adata

    return D


def load_alevinfry(frydir, output_format="scRNA", nonzero=False, quiet=False):
    """
    Load an alevin-fry quantification result into an AnnData object.

    This function is forked from: https://github.com/COMBINE-lab/pyroe

    Required Parameters
    ----------
    frydir : `str`
        The path to an output directory returned by the alevin-fry quant command,
        i.e. the directory containing the quant.json (or older meta_info.json)
        file and the alevin subdirectory.

    Optional Parameters
    ----------
    output_format : `str` or `dict`
        A string naming one of the pre-defined output formats ("scRNA", "snRNA",
        "raw" or "velocity"; case-insensitive), or a dictionary describing a
        customized format. See the Notes section for details.
    nonzero : `bool` (default: `False`)
        True if genes with zero total counts in `X` and in every extra layer
        should be removed from the returned object.
        False if unexpressed genes should be kept.
    quiet : `bool` (default: `False`)
        True if the function should not print progress messages.
        False if messages should be printed.

    Notes
    ----------
    The `output_format` argument takes either a dictionary that defines the customized format or
    a string that represents one of the pre-defined formats of the returned `AnnData` object.
    Each of the pre-defined formats contains a `X` field and some optional extra `AnnData.layers`
    obtained from the submatrices representing unspliced (U), spliced (S) and ambiguous (A) counts
    returned by alevin-fry.

    The following formats are defined:
    * "scRNA":
        This format is recommended for single cell RNA-sequencing experiments.
        It returns a `X` field that contains the S+A count of each gene in each cell without any extra layers.
    * "snRNA":
        This format is recommended for single nucleus RNA-sequencing experiments.
        It returns a `X` field that contains the U+S+A count of each gene in each cell without any extra layers.
    * "raw":
        This format uses the S count matrix as the `X` field and puts the U, S, and A counts into three
        separate layers, which are "unspliced", "spliced" and "ambiguous".
    * "velocity":
        This format is the same as "scRNA", except it contains two extra layers: the "spliced" layer,
        which contains the S+A counts, and the "unspliced" layer, which contains the U counts.

    A custom output format can be defined using a dictionary specifying the desired format of the
    output `AnnData` object. If the input is not a USA mode quantification directory, this parameter
    is ignored and the count matrix is returned in the `X` field of the returned `AnnData` object.
    If the input quantification directory contains a USA mode quantification, then there are 3
    sub-matrices that can be referenced in the dictionary; 'U', 'S', 'A' containing, respectively,
    unspliced, spliced and ambiguous counts.  The dictionary should have entries of the form
    `key` (str) : `value` (list[str]). The following constraints apply: there should be one key-value
    pair with the key `X`; the resulting counts will be returned in the `X` field of the AnnData
    object. There can be an arbitrary number of other key-value pairs, and each will be returned as a
    layer of the resulting AnnData object. Within the key-value pairs, the key refers to the layer
    name that will be given to the combined count matrix upon output, and the value should be a
    subset of `['U', 'S', 'A']` that defines which sub-matrices should be summed.  For example:
    `{'X' : ['S', 'A'], 'unspliced' : ['U']}`
    will result in a returned AnnData object where the X field has a matrix in which each entry
    corresponds to the summed spliced and ambiguous counts for each gene in each cell, and there
    is an additional "unspliced" layer, whose counts are taken directly from the unspliced sub-matrix.

    Returns
    ----------
        An AnnData object with X and layers corresponding to the requested `output_format`.

    Raises
    ----------
    IOError
        If neither quant.json nor meta_info.json is found in `frydir`.
    ValueError
        If the input is in USA mode and `output_format` is empty, of the wrong type,
        an unknown format name, lacks an "X" entry, or references sub-matrices
        other than 'U', 'S' and 'A'.
    """
    import json
    import os

    import numpy as np
    import pandas as pd

    predefined_format = {
        "scrna": {"X": ["S", "A"]},
        "snrna": {"X": ["U", "S", "A"]},
        "velocity": {
            "X": ["S", "A"],
            "spliced": ["S", "A"],
            "unspliced": ["U"],
        },
        "raw": {
            "X": ["S"],
            "spliced": ["S"],
            "unspliced": ["U"],
            "ambiguous": ["A"],
        },
    }

    def process_output_format(output_format, quiet):
        # Validate `output_format`; return a dict {layer name: [sub-matrix keys]}.
        if not output_format:
            raise ValueError("output_format cannot be empty")

        if isinstance(output_format, str):
            output_format = output_format.lower()
            if output_format not in predefined_format:
                # invalid output_format string
                if not quiet:
                    print(
                        "Provided output_format string must be 'scRNA', 'snRNA', 'raw' or 'velocity'."
                    )
                    print("See function help message for details.")
                raise ValueError("Invalid output_format.")

            chosen = predefined_format[output_format]
            if not quiet:
                print("Using pre-defined output format:", output_format)
                print(
                    f"Will populate output field X with sum of counts from {chosen['X']}."
                )
                for (k, v) in chosen.items():
                    if k != "X":
                        print(f"Will combine {v} into output layer {k}.")
            return chosen

        if isinstance(output_format, dict):
            if not quiet:
                print("Processing user-defined output format.")
            # make sure the X is there
            if "X" not in output_format:
                raise ValueError(
                    'In USA mode some sub-matrices must be assigned to the "X" (default) output.'
                )
            if not quiet:
                print(
                    f"Will populate output field X with sum of counts from {output_format['X']}."
                )

            for (k, v) in output_format.items():
                if not v:
                    # empty list
                    raise ValueError(
                        f"The element list of key '{k}' in output_format is empty. Please remove it."
                    )
                # v contains a non-USA element
                if set(v) - {"U", "S", "A"}:
                    raise ValueError(
                        f"Found non-USA element in output_format element list '{v}' for key '{k}'; cannot proceed."
                    )
                if not quiet and (k != "X"):
                    print(f"Will combine {v} into output layer {k}.")
            return output_format

        raise ValueError("Provided invalid output_format. See function help message for details")

    def read_lines(path):
        # Read a text file into a list of right-stripped lines, closing the handle.
        with open(path) as fh:
            return [line.rstrip() for line in fh]

    # since alevin-fry 0.4.1 the generic "meta_info.json"
    # has been replaced by a more informative name for each
    # sub-command. For quantification, it is "quant.json".
    # we check for both files here, in order.
    meta_info_files = ["quant.json", "meta_info.json"]

    fpath = os.path.join(frydir, meta_info_files[0])
    # first, check for the new file, if we don't find it, check
    # for the old one.
    if not os.path.exists(fpath):
        if not quiet:
            print(
                f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}."
            )
        fpath = os.path.join(frydir, meta_info_files[1])
        # if we don't find the old one either, we cannot continue
        if not os.path.exists(fpath):
            raise IOError(f"Found no {meta_info_files[1]} file either; cannot proceed.")

    # if we got here then we had a valid json file, so
    # use it to get the number of genes, and if we are
    # in USA mode or not.
    with open(fpath) as fh:
        meta_info = json.load(fh)
    ng = meta_info["num_genes"]
    usa_mode = meta_info["usa_mode"]
    if not quiet:
        print(f"USA mode: {usa_mode}")

    output_assays = None
    if usa_mode:
        # each gene has 3 splicing statuses, so the actual number of distinct
        # genes is ng/3.
        ng = ng // 3
        output_assays = process_output_format(output_format, quiet)
    elif not quiet:
        print(
            "Processing input in standard mode, the count matrix will be stored in field 'X'."
        )
        if output_format != "scRNA":
            print("Output_format will be ignored.")

    alevin_dir = os.path.join(frydir, "alevin")

    # read the actual input matrix
    af_raw = sc.read_mtx(os.path.join(alevin_dir, "quants_mat.mtx"))
    # read the gene ids (in USA mode only the first third are distinct genes)
    afg = read_lines(os.path.join(alevin_dir, "quants_mat_cols.txt"))[:ng]
    afg_df = pd.DataFrame(afg, columns=["gene_ids"])
    afg_df = afg_df.set_index("gene_ids")
    # and the barcodes
    abc = read_lines(os.path.join(alevin_dir, "quants_mat_rows.txt"))
    abc_df = pd.DataFrame(abc, columns=["barcodes"])
    abc_df.index = abc_df["barcodes"]

    x = af_raw.X
    extra_layers = []
    if not usa_mode:
        # just combine this info into an AnnData object
        af = sc.AnnData(x.T, var=abc_df, obs=afg_df)
        af = af.T
    else:
        # combine the sub-matrices into the output object as
        # specified by `output_assays`; columns are laid out as [S | U | A]
        rd = {"S": slice(0, ng), "U": slice(ng, 2 * ng), "A": slice(2 * ng, 3 * ng)}

        def sum_submatrices(keys):
            # Sum the column blocks named in `keys` without modifying `x`.
            total = x[:, rd[keys[0]]]
            for key in keys[1:]:
                total = total + x[:, rd[key]]
            return total

        af = sc.AnnData(sum_submatrices(output_assays["X"]).T, var=abc_df, obs=afg_df)
        af = af.T

        # now, if there are other layers requested, populate those
        extra_layers = [name for name in output_assays if name != "X"]
        for other_layer in extra_layers:
            af.layers[other_layer] = sum_submatrices(output_assays[other_layer])

    if nonzero:
        # keep a gene if it has counts in X or in any of the extra layers
        not_zero_genes = af.X.sum(axis=0).A1 > 0
        for other_layer in extra_layers:
            not_zero_genes = np.logical_or(
                not_zero_genes, af.layers[other_layer].sum(axis=0).A1 > 0
            )

        af = af[:, not_zero_genes]

        if not quiet:
            print(f"Filtered {np.sum(~not_zero_genes)} non-expressed genes.")

    return af


def load_inDrops(library_ids, input_path):
    '''
    Imports data files generated by inDrops.py (https://github.com/indrops).  This function will expect
    files at the following locations:
    /input_path/library_id/library_id.counts.tsv.gz
    /input_path/library_id/abundant_barcodes.pickle
    where 'library_ids' contains one or more inDrops.py output folders located at the indicated path.
    'input_path' may be given with or without a trailing path separator.

    The first time this function is executed, it will load counts matrices, gene names, cell names, and
    cell barcode sequences from original tsv and pickle files, respectively.  Fast-loading versions of
    these objects (e.g. *.npz) will be saved in place for future calls to this function.

    The returned dictionary object D holds a ScanPy AnnData object for each library loaded, as follows:
    D[library_id] = AnnData object
    Cell names and barcodes are stored in adata.obs (cell barcodes as adata.obs['unique_cell_id'])
    Gene names are stored in adata.var_names
    Raw counts data are stored in adata.X

    This workflow allows each original library to be examined and pre-processed independently (e.g. barcode
    filtering) prior to merging and further analysis.

    Raises KeyError if a cell in the counts table has no entry in abundant_barcodes.pickle.
    '''

    # Create a dictionary to hold data
    D = {}

    # Load counts data, metadata, & convert to AnnData objects
    for s in library_ids:
        print('_________________', s)

        lib_dir = os.path.join(input_path, s)
        npz_file = os.path.join(lib_dir, s + '.raw_counts.unfiltered.npz')
        gene_names_file = os.path.join(lib_dir, 'gene_names.txt')
        cell_names_file = os.path.join(lib_dir, 'cell_names.txt')
        cell_bc_seqs_file = os.path.join(lib_dir, 'cell_bc_seqs.txt')

        # First attempt to load matrix data from preprocessed files (fast)
        if os.path.isfile(npz_file):
            print('Loading from npz file')
            E = scipy.sparse.load_npz(npz_file)
            # ndmin=1 keeps single-entry files from collapsing into 0-d arrays
            gene_names = np.loadtxt(fname=gene_names_file, dtype='str', ndmin=1)
            cell_names = np.loadtxt(fname=cell_names_file, dtype='str', ndmin=1)
            cell_bc_seqs = np.loadtxt(fname=cell_bc_seqs_file, dtype='str', ndmin=1)

        # Otherwise, load and preprocess from the original text files (slow)
        else:
            print('Loading from text file')
            counts_mat = pd.read_csv(
                os.path.join(lib_dir, s + '.counts.tsv.gz'), sep='\t', index_col=0)
            E = scipy.sparse.csc_matrix(counts_mat.values)
            # Use string arrays so the first load matches later cached loads
            cell_names = counts_mat.index.astype(str).to_numpy()
            gene_names = counts_mat.columns.astype(str).to_numpy()

            # Load the barcode dictionary pickle file, format as keys=bcodes; values=sequences
            # NOTE: pickle.load executes arbitrary code; only load files from a trusted pipeline run
            with open(os.path.join(lib_dir, 'abundant_barcodes.pickle'), 'rb') as f:
                bc_dict = pickle.load(f)
            bcd_dict = {bc_dict[bc][0]: bc for bc in bc_dict}

            # Get barcode sequences corresponding to each cell index
            cell_bc_seqs = []
            for cname in counts_mat.index:
                if cname not in bcd_dict:
                    raise KeyError(
                        'Cell ' + repr(cname) + ' of library ' + repr(s) +
                        ' not found in abundant_barcodes.pickle')
                cell_bc_seqs.append(s + '_' + bcd_dict[cname])

            # Save fast files for next time
            scipy.sparse.save_npz(npz_file, E)
            np.savetxt(gene_names_file, counts_mat.columns, fmt='%s')
            np.savetxt(cell_names_file, counts_mat.index, fmt='%s')
            np.savetxt(cell_bc_seqs_file, cell_bc_seqs, fmt='%s')

        # Print matrix dimensions to screen
        print(E.shape, '\n')

        # Convert to ScanPy AnnData objects
        adata = sc.AnnData(E)
        adata.var_names = gene_names
        adata.obs['unique_cell_id'] = cell_bc_seqs
        adata.obs['cell_names'] = cell_names
        adata.obs['library_id'] = s   # scalar is broadcast to every cell
        adata.uns['library_id'] = s
        D[s] = adata

    return D

load_inDrops_V3 = load_inDrops # alias function name


def load_genedata(adata, csv_filename):
    '''
    Adds annotations to the 'var' dataframe of a ScanPy AnnData object (adata) from an imported CSV file.
    Uses a set of unique identifiers (e.g. Ensembl gene IDs) to match genes.  These identifiers must be present
    in AnnData (in adata.var_names) and in the first column of the CSV file.

    The structure of the CSV file is as follows:
    Column 1: unique gene identifiers (matched to elements of adata.var_names)
    Column 2: first gene annotation
    Column 3: second gene annotation
      ...          ....
    Column n: last gene annotation
    Column headers in the CSV file (required) will become headers of new columns in adata.var

    Identifiers are compared after removing any '-' characters from both the CSV identifiers and
    adata.var_names.  Unique gene ids in adata that do not appear in the CSV file will be populated
    with the original unique ID.

    Modifies adata in place and also returns it.
    '''
    # load the unique gene IDs from adata that will be matched to the csv file
    uID_query = adata.var_names

    # load CSV header, get the names and number of annotations
    header = pd.read_csv(csv_filename, nrows=0)
    annotation_names = list(header.columns.values)[1:]  # ignore the first column header
    nAnnotations = len(annotation_names)

    # make a dictionary of unique gene IDs and annotations from the CSV file
    # (ndmin=2 keeps a single-row or single-column file as a table of rows)
    table = np.loadtxt(csv_filename, dtype='str', delimiter=',', skiprows=1, ndmin=2)
    annotation_dict = {}
    for row in table:
        if row.size == 0:
            continue
        # column1 = uID, all remaining columns are annotations
        annotation_dict[row[0].replace('-', '')] = list(row[1:])

    # lookup each query in the dictionary, return matching annotations (or original uID)
    annotations = []
    for uID in uID_query:
        # normalise the query the same way as the CSV keys so dashed IDs can match
        match = annotation_dict.get(str(uID).replace('-', ''))
        if match is None:
            match = [uID] * nAnnotations
        annotations.append(match)

    # convert from list of lists to a (n_genes x nAnnotations) array, even when empty
    annotations = np.array(annotations, dtype=str).reshape(len(annotations), nAnnotations)

    # now copy the matched annotations to adata
    for j, annotation_name in enumerate(annotation_names):
        adata.var[annotation_name] = annotations[:, j]

    return adata


def load_celldata(adata, csv_filename, filter_nomatch=False):
    '''
    Adds annotations to the 'obs' dataframe of a ScanPy AnnData object (adata) from an imported CSV file.
    Uses a set of unique cell identifiers (e.g. inDrops cell barcode sequences) to match cells.  These
    identifiers must be present in AnnData (as adata.obs.unique_cell_id) and in the first column of the CSV file.

    The structure of the CSV file is as follows:
    Column 1: unique cell identifiers (matched to elements of adata.obs.unique_cell_id)
    Column 2: first cell annotation
    Column 3: second cell annotation
      ...          ....
    Column n: last cell annotation
    Column headers in the CSV file (required) will become headers of new columns in adata.obs

    Identifiers are compared after removing any '-' characters from both the CSV identifiers and
    adata.obs.unique_cell_id (the stored IDs themselves are not modified).

    Unique cell ids in adata that do not appear in the CSV file will be annotated as 'no match'.
    'filter_nomatch' gives an option to filter these cells from the outputted version of adata.

    Annotation columns are written to adata.obs in place; the returned object is adata itself,
    or a subset of it when filter_nomatch=True.
    '''

    # load the unique cell IDs from adata, normalised the same way as the CSV keys
    uID_query = adata.obs['unique_cell_id'].astype(str).str.replace('-', '', regex=False)

    # load CSV header, get the names and number of annotations
    header = pd.read_csv(csv_filename, nrows=0)
    annotation_names = list(header.columns.values)[1:]  # ignore the first column header
    nAnnotations = len(annotation_names)

    # make a dictionary of unique cell IDs and annotations from the CSV file
    # (ndmin=2 keeps a single-row or single-column file as a table of rows)
    table = np.loadtxt(csv_filename, dtype='str', delimiter=',', skiprows=1, ndmin=2)
    annotation_dict = {}
    for row in table:
        if row.size == 0:
            continue
        # column1 = uID, all remaining columns are annotations
        annotation_dict[row[0].replace('-', '')] = list(row[1:])

    # lookup each query in the dictionary, return matching annotations (or 'no match')
    annotations = []
    matched = np.zeros(len(uID_query), dtype=bool)
    for i, uID in enumerate(uID_query):
        match = annotation_dict.get(uID)
        if match is None:
            match = ['no match'] * nAnnotations
        else:
            matched[i] = True
        annotations.append(match)

    # convert from list of lists to a (n_cells x nAnnotations) array, even when empty
    annotations = np.array(annotations, dtype=str).reshape(len(annotations), nAnnotations)

    # now copy the matched annotations to adata
    for j, annotation_name in enumerate(annotation_names):
        adata.obs[annotation_name] = annotations[:, j]

    # if invoked, remove cells that were not present in the annotation CSV file
    if filter_nomatch:
        adata = adata[matched, :]

    return adata


# DATA PRE-PROCESSING

def filter_abundant_barcodes(adata, filter_cells=False, threshold=1000, library_id='', save_path='./figures/'):
    '''
    Plots a weighted histogram of transcripts per cell barcode for guiding the
    placement of a filtering threshold, and saves it to
    save_path/barcode_hist_<library_id>.png.

    Per-barcode totals are written to adata.obs['total_counts'] and
    adata.obs['n_genes_by_counts'].  If 'library_id' is empty, adata.uns['library_id']
    is used when present.

    If filter_cells=True, barcodes with fewer than 'threshold' total counts are removed
    from adata in place.  adata is always returned, so the result can be re-assigned
    regardless of 'filter_cells'.
    '''

    # if necessary, create the output directory
    os.makedirs(save_path, exist_ok=True)

    # use adata.uns['library_id'] if it exists
    if not library_id and 'library_id' in adata.uns:
        library_id = adata.uns['library_id']

    # Sum total UMI counts and genes for each cell-barcode, save to obs
    # (ravel: sparse matrices return an (n, 1) matrix from sum(axis=1))
    counts = np.asarray(adata.X.sum(axis=1)).ravel()
    genes = np.asarray(adata.X.astype(bool).sum(axis=1)).ravel()
    adata.obs['total_counts'] = counts
    adata.obs['n_genes_by_counts'] = genes
    ix = counts >= threshold
    n_retained = str(np.sum(ix)) + '/' + str(counts.shape[0])

    # Weight each barcode by its share of all transcripts (skip if there are none)
    total = counts.sum()
    weights = counts / total if total > 0 else None

    # Plot and format a weighted cell-barcode counts histogram
    sc.set_figure_params(dpi=100, figsize=[4,4], fontsize=12)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(counts, bins=np.logspace(0, 6, 100), weights=weights)
    ax.set_xscale('log')
    ax.set_xlabel('Transcripts per cell barcode')
    ax.set_ylabel('Fraction of total transcripts')
    ax.set_title(library_id)
    ax.text(0.99,0.95, n_retained + ' cells retained', ha='right', va='center', transform=ax.transAxes)

    # Overlay the counts threshold as a vertical line
    ax.plot([threshold, threshold], ax.get_ylim())

    # Save figure to file
    fig.tight_layout()
    fig.savefig(os.path.join(save_path, 'barcode_hist_' + library_id + '.png'))
    plt.show()
    plt.close(fig)

    # Print the number of cell barcodes that will be retained
    print('Barcode Filtering ' + library_id + ' (' + n_retained + ' cells retained)')
    print()

    # If requested, filter adata in place
    if filter_cells:
        sc.pp.filter_cells(adata, min_counts=threshold, inplace=True)

    return adata


def filter_mito(adata, filter_cells=False, upper_threshold=100, lower_threshold=0, library_id='', save_path='./figures/'):
    '''
    Plots a histogram of % mitochondrial transcripts per cell barcode for guiding the
    placement of filtering thresholds, and saves it to save_path/mito_hist_<library_id>.png.

    Genes whose names start with 'mt-' or 'MT-' are flagged in adata.var['mito'] (ribosomal
    protein genes are flagged in adata.var['ribo']) and scanpy QC metrics are added to adata
    in place.  If 'library_id' is empty, adata.uns['library_id'] is used when present.

    Cells are retained when lower_threshold <= pct_counts_mito <= upper_threshold (bounds
    inclusive, so the default thresholds of 0 and 100 keep every cell with a defined percentage).
    Returns a filtered version of adata if filter_cells=True, otherwise adata itself.
    '''

    # If necessary, create the output directory
    os.makedirs(save_path, exist_ok=True)

    # Use adata.uns['library_id'] if it exists
    if not library_id and 'library_id' in adata.uns:
        library_id = adata.uns['library_id']

    # Calculate QC metric for % mitochondrial counts per cell
    adata.var['mito'] = adata.var_names.str.startswith(('mt-','MT-'))
    # same ribosomal prefixes as filter_ribo, so both functions flag the same genes
    adata.var['ribo'] = adata.var_names.str.startswith(('RPS','rps','RPL','rpl','Rps','Rpl'))
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mito','ribo'], inplace=True)
    counts = adata.obs['pct_counts_mito']
    ix = ((counts >= lower_threshold) & (counts <= upper_threshold)).to_numpy()
    n_retained = str(np.sum(ix)) + '/' + str(counts.shape[0])

    # Plot and format a mito counts histogram
    sc.set_figure_params(dpi=100, figsize=[4,4], fontsize=12)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(counts, bins=100)
    ax.set_yscale('log')
    ax.set_xlabel('% Mitochondrial RNA counts per cell')
    ax.set_ylabel('# Cells per bin')
    ax.set_title(library_id)
    ax.text(0.99,0.95, n_retained + ' cells retained', ha='right', va='center', transform=ax.transAxes)

    # Overlay the counts thresholds as vertical lines
    ax.plot([upper_threshold, upper_threshold], [0, ax.get_ylim()[1]])
    ax.plot([lower_threshold, lower_threshold], [0, ax.get_ylim()[1]])

    # Save figure to file
    fig.tight_layout()
    fig.savefig(os.path.join(save_path, 'mito_hist_' + library_id + '.png'))
    plt.show()
    plt.close(fig)

    # Print the number of cell barcodes that will be retained
    print('Mito-Filtering ' + library_id + ' (' + n_retained + ' cells retained)')
    print()

    # If requested, return a filtered version of adata
    if filter_cells:
        adata = adata[ix, :]

    return adata


def filter_ribo(adata, filter_cells=False, upper_threshold=100, lower_threshold=0, library_id='', save_path='./figures/'):
    '''
    Plots a histogram of % ribosomal protein transcripts per cell barcode for guiding the
    placement of filtering thresholds, and saves it to save_path/ribo_hist_<library_id>.png.

    Genes whose names start with 'RPS', 'RPL' (upper, lower or capitalised case) are flagged
    in adata.var['ribo'] and scanpy QC metrics are added to adata in place.  If 'library_id'
    is empty, adata.uns['library_id'] is used when present.

    Cells are retained when lower_threshold <= pct_counts_ribo <= upper_threshold (bounds
    inclusive, so the default thresholds of 0 and 100 keep every cell with a defined percentage).
    Returns a filtered version of adata if filter_cells=True, otherwise adata itself.
    '''

    # If necessary, create the output directory
    os.makedirs(save_path, exist_ok=True)

    # Use adata.uns['library_id'] if it exists
    if not library_id and 'library_id' in adata.uns:
        library_id = adata.uns['library_id']

    # Calculate QC metric for % ribosomal protein counts per cell
    adata.var['ribo'] = adata.var_names.str.startswith(('RPS','rps','RPL','rpl','Rps','Rpl'))
    sc.pp.calculate_qc_metrics(adata, qc_vars=['ribo'], inplace=True)
    counts = adata.obs['pct_counts_ribo']
    ix = ((counts >= lower_threshold) & (counts <= upper_threshold)).to_numpy()
    n_retained = str(np.sum(ix)) + '/' + str(counts.shape[0])

    # Plot and format a ribo counts histogram
    sc.set_figure_params(dpi=100, figsize=[4,4], fontsize=12)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(counts, bins=100)
    ax.set_yscale('log')
    ax.set_xlabel('% Ribosomal Protein mRNA counts per cell')
    ax.set_ylabel('# Cells per bin')
    ax.set_title(library_id)
    ax.text(0.99,0.95, n_retained + ' cells retained', ha='right', va='center', transform=ax.transAxes)

    # Overlay the counts thresholds as vertical lines
    ax.plot([upper_threshold, upper_threshold], [0, ax.get_ylim()[1]])
    ax.plot([lower_threshold, lower_threshold], [0, ax.get_ylim()[1]])

    # Save figure to file
    fig.tight_layout()
    fig.savefig(os.path.join(save_path, 'ribo_hist_' + library_id + '.png'))
    plt.show()
    plt.close(fig)

    # Print the number of cell barcodes that will be retained
    print('Ribo-Filtering ' + library_id + ' (' + n_retained + ' cells retained)')
    print()

    # If requested, return a filtered version of adata
    if filter_cells:
        adata = adata[ix, :]

    return adata


def filter_scrublet(adata, filter_cells=False, threshold=5):
    '''
    Runs Scrublet doublet scoring on adata, plots the score distribution and prints how many
    cell barcodes are not flagged as doublets. Returns a filtered version of adata if
    filter_cells=True; otherwise returns adata itself.
    '''

    # silence warnings (added for the copy-data warning); this installs a process-wide filter
    warnings.filterwarnings('ignore')

    # label the summary with adata.uns['library_id'] when present, otherwise leave it blank
    library_id = adata.uns['library_id'] if 'library_id' in adata.uns else ''

    # score doublets, then plot the score distribution
    sc.external.pp.scrublet(adata, threshold=threshold, verbose=False)
    sc.external.pl.scrublet_score_distribution(adata, scale_hist_sim='log')

    # summarize: barcodes not flagged in adata.obs['predicted_doublet'] / all barcodes
    n_retained = len(adata) - sum(adata.obs['predicted_doublet'])
    n_total = adata.shape[0]
    print('Doublet Filtering ' + library_id + ' (' + str(n_retained) + '/' + str(n_total) + ' cells retained)')
    print()

    # without filtering, hand back the (now annotated) input object
    if not filter_cells:
        return adata

    # keep only the barcodes that were not predicted to be doublets
    return adata[~adata.obs['predicted_doublet'],:]


def get_sampling_stats(adata, groupby=None):
    '''
    Summarize sequencing depth per library.

    For every library name in groupby, averages adata.obs['total_counts'] and
    adata.obs['n_genes_by_counts'] over the cells whose adata.obs['library_id'] equals that name.

    Parameters
    ----------
    adata : object whose .obs table has 'total_counts', 'n_genes_by_counts' and 'library_id' columns
    groupby : sequence of library names to report, in output row order (default: no libraries)

    Returns
    -------
    pandas DataFrame indexed by groupby with columns 'UMI per Cell' and 'Genes per Cell'.
    A name that matches no cells gets NaN in both columns.
    '''
    # None replaces the former mutable [] default; an omitted argument still yields an empty table
    if groupby is None:
        groupby = []

    obs = adata.obs
    lib_umi_per_cell = []
    lib_genes_per_cell = []
    for name in groupby:
        # build the per-library mask once and reuse it for both metrics
        in_library = obs['library_id'] == name
        lib_umi_per_cell.append(np.mean(obs['total_counts'][in_library]))
        lib_genes_per_cell.append(np.mean(obs['n_genes_by_counts'][in_library]))

    return pd.DataFrame(data={'UMI per Cell': lib_umi_per_cell, 'Genes per Cell': lib_genes_per_cell}, index=groupby)


# VARIABLE GENES

def get_vscores(E, min_mean=0, nBins=50, fit_percentile=0.1, error_wt=1):
    '''
    Calculate v-score (above-Poisson noise statistic) for genes in the input counts matrix
    Return v-scores and other stats

    Parameters
    ----------
    E : scipy sparse counts container (needs .mean(axis=0), column indexing and a .data buffer);
        statistics are taken down axis 0, i.e. one value per column (gene)
    min_mean : only columns whose mean exceeds this value are scored
    nBins : number of bins used by runningquantile for the baseline fit
    fit_percentile : percentile passed through to runningquantile
        NOTE(review): runningquantile hands this to np.percentile (0-100 scale), so the default
        0.1 selects the 0.1th percentile - confirm that is intended
    error_wt : exponent applied to the absolute residuals of the baseline fit

    Returns
    -------
    v_scores, CV_eff, CV_input, gene_ix, mu_gene, FF_gene, a, b
        gene_ix indexes the scored columns of E; mu_gene / FF_gene / v_scores are aligned with it.
        b is the length-1 array returned by scipy.optimize.fmin, so a, CV_eff and CV_input are
        length-1 arrays as well.
    '''

    # np.asarray (instead of the matrix-only `.A`) accepts both the np.matrix produced by scipy
    # sparse matrices and the ndarray produced by scipy sparse arrays
    mu_gene = np.asarray(E.mean(axis=0)).squeeze()
    gene_ix = np.nonzero(mu_gene > min_mean)[0]
    mu_gene = mu_gene[gene_ix]

    # variance as E[x^2] - E[x]^2; fancy column indexing returns a copy, so squaring its stored
    # values in place leaves E untouched
    squared = E[:, gene_ix]
    squared.data **= 2
    var_gene = np.asarray(squared.mean(axis=0)).squeeze() - mu_gene ** 2
    del squared
    FF_gene = var_gene / mu_gene

    # running low quantile of log(FF/mean) against log(mean) defines the baseline to fit
    data_x = np.log(mu_gene)
    data_y = np.log(FF_gene / mu_gene)

    x, y = runningquantile(data_x, data_y, fit_percentile, nBins)
    finite = ~np.isnan(y)
    x = x[finite]
    y = y[finite]

    def gLog(params):
        return np.log(params[1] * np.exp(-params[0]) + params[2])

    # c: Fano factor at the mode of the log-FF histogram, floored at 1
    hist_counts, bin_edges = np.histogram(np.log(FF_gene[mu_gene > 0]), bins=200)
    bin_centers = bin_edges[:-1] + np.diff(bin_edges) / 2
    max_ix = np.argmax(hist_counts)
    c = np.max((np.exp(bin_centers[max_ix]), 1))

    def errFun(b2):
        return np.sum(abs(gLog([x, c, b2]) - y) ** error_wt)

    # fit b by minimizing the residuals, then derive a from c = (1 + a) * (1 + b)
    b0 = 0.1
    b = scipy.optimize.fmin(func=errFun, x0=[b0], disp=False)
    a = c / (1 + b) - 1

    v_scores = FF_gene / ((1 + a) * (1 + b) + b * mu_gene)
    CV_eff = np.sqrt((1 + a) * (1 + b) - 1)
    CV_input = np.sqrt(b)

    return v_scores, CV_eff, CV_input, gene_ix, mu_gene, FF_gene, a, b


def runningquantile(x, y, p, nBins):
    """ Return bin centres along x and the p-th percentile (np.percentile scale) of y per bin """

    # order both arrays by increasing x
    order = np.argsort(x)
    xs = x[order]
    ys = y[order]

    # nBins equal-width bins spanning [min(x), max(x)], described by their centres
    width = (xs[-1] - xs[0]) / nBins
    centres = np.linspace(xs[0]+width/2, xs[-1]-width/2, nBins)

    quantiles = np.zeros(centres.shape)

    for k, centre in enumerate(centres):
        # half-open bin: lower edge included, upper edge excluded
        in_bin = (xs >= centre-width/2) & (xs < centre+width/2)
        members = ys[in_bin]
        if len(members) > 0:
            quantiles[k] = np.percentile(members, p)
        elif k > 0:
            # empty bin: carry the previous bin's value forward
            quantiles[k] = quantiles[k-1]
        else:
            # empty first bin has nothing to carry forward
            quantiles[k] = np.nan

    return centres, quantiles


def get_variable_genes(E, base_ix=None, min_vscore_pctl=85, min_counts=3, min_cells=3, show_FF_plot=False, show_vscore_plot=False, return_stats=False, plot_title=''):

    '''
    Filter genes by expression level and variability
    Return list of filtered gene indices

    Parameters
    ----------
    E : scipy sparse counts container, indexed as E[rows, columns] with genes on the columns
    base_ix : row indices used to compute the v-score statistics; None or empty means all rows
    min_vscore_pctl : percentile (0-100) of the positive v-scores a gene must reach
    min_counts, min_cells : a gene must have >= min_counts counts in >= min_cells rows of E
    show_FF_plot, show_vscore_plot : draw the Fano-factor / v-score diagnostic scatter plots
    return_stats : return a dict of per-gene statistics and fit parameters instead of indices
    plot_title : title used for both diagnostic plots

    Returns
    -------
    Array of selected column indices of E, or (return_stats=True) a dict with keys 'gene_ix',
    'vscores', 'mu_gene', 'FF_gene', 'CV_eff', 'CV_input', 'a', 'b', 'min_vscore'.

    Note: the expression-level filter counts over all rows of E, whereas the v-score statistics
    only use the rows in base_ix.
    '''

    # None replaces the former mutable [] default; an explicit empty sequence is still accepted
    if base_ix is None or len(base_ix) == 0:
        base_ix = np.arange(E.shape[0])

    # get variability statistics
    Vscores, CV_eff, CV_input, gene_ix, mu_gene, FF_gene, a, b = get_vscores(E[base_ix, :])

    # keep genes with positive vscores; subset every per-gene array once
    ix2 = Vscores > 0
    pos_gene_ix = gene_ix[ix2]
    pos_vscores = Vscores[ix2]
    pos_mu = mu_gene[ix2]
    pos_FF = FF_gene[ix2]

    # index genes based on expression breadth and vscore percentile
    min_vscore = np.percentile(pos_vscores, min_vscore_pctl)
    # np.asarray (instead of the matrix-only `.A`) also handles scipy sparse arrays
    n_expressing = np.asarray((E[:, pos_gene_ix] >= min_counts).sum(0)).squeeze()
    ix = (n_expressing >= min_cells) & (pos_vscores >= min_vscore)

    if show_FF_plot:
        # fitted baseline curve, evaluated on a log-spaced grid spanning the observed means
        x_min = 0.5 * np.min(pos_mu)
        x_max = 2 * np.max(pos_mu)
        xTh = x_min * np.exp(np.log(x_max / x_min) * np.linspace(0, 1, 100))
        yTh = (1 + a) * (1 + b) + b * xTh
        plt.figure(figsize=(6, 6))
        plt.scatter(np.log10(pos_mu), np.log10(pos_FF), c=np.array(['grey']), alpha=0.3, edgecolors=None, s=4)
        plt.scatter(np.log10(pos_mu)[ix], np.log10(pos_FF)[ix], c=np.log10(pos_vscores)[ix], cmap='jet', alpha=0.3, edgecolors=None, s=4)
        plt.plot(np.log10(xTh), np.log10(yTh))
        plt.title(plot_title)
        plt.xlabel('Mean Transcripts Per Cell (log10)')
        plt.ylabel('Gene Fano Factor (log10)')
        plt.show()

    if show_vscore_plot:
        plt.figure(figsize=(6, 6))
        plt.scatter(np.log10(pos_mu), np.log10(pos_vscores), c=np.array(['grey']), alpha=0.3, edgecolors=None, s=4)
        plt.scatter(np.log10(pos_mu)[ix], np.log10(pos_vscores)[ix], c=np.log10(pos_FF)[ix], cmap='jet', alpha=0.3, edgecolors=None, s=4)
        plt.title(plot_title)
        plt.xlabel('Mean Transcripts Per Cell (log10)')
        plt.ylabel('Vscores (log10)')
        plt.show()

    if return_stats:
        return {'gene_ix': pos_gene_ix[ix],
                'vscores': pos_vscores[ix],
                'mu_gene': pos_mu[ix],
                'FF_gene': pos_FF[ix],
                'CV_eff': CV_eff,
                'CV_input': CV_input,
                'a': a,
                'b': b,
                'min_vscore': min_vscore}
    else:
        return pos_gene_ix[ix]


def get_covarying_genes(E, gene_ix, minimum_correlation=0.2, show_hist=False, sample_name=''):
    '''
    Keep the genes in gene_ix whose best correlation to any other gene in gene_ix exceeds
    minimum_correlation. Optionally plots the distribution of those best correlations.
    Returns the retained entries of gene_ix.
    '''

    # restrict the input matrix to the candidate genes
    E_sub = E[:,gene_ix]

    # gene-gene correlation distance (1-correlation), computed with the sparse_corr helper
    corr_distance = 1-sparse_corr(E_sub)

    # mask each gene's distance to itself so it cannot be its own nearest neighbor
    np.fill_diagonal(corr_distance, np.inf)

    # smallest remaining distance per gene, converted back to a correlation
    max_neighbor_corr = 1-corr_distance.min(axis=1)

    # plot distribution of top gene-gene correlations
    if show_hist:
        plt.figure(figsize=(6, 6))
        plt.hist(max_neighbor_corr,bins=100)
        plt.title(sample_name)
        plt.xlabel('Nearest Gene Correlation')
        plt.ylabel('Counts')
        plt.show()

    # flat boolean mask of genes whose nearest-neighbor correlation is above threshold
    ix_keep = np.array(max_neighbor_corr > minimum_correlation, dtype=bool).squeeze()

    return gene_ix[ix_keep]


# GEPHI IMPORT & EXPORT

def export_to_graphml(adata, filename='test.graphml', directed=None):
    '''
    Write the neighbor graph stored in adata to a GraphML file (e.g. for import into Gephi).

    Parameters
    ----------
    adata : annotated data object holding a neighbor graph, either under
        adata.uns['neighbors']['connectivities'] (older layout) or in adata.obsp
    filename : path of the GraphML file to write
    directed : passed through to ig.Graph(directed=...)

    Every nonzero entry of the connectivities matrix becomes an edge; its value is stored as
    the edge attribute 'weight' when igraph accepts it.
    '''

    # Older AnnData kept the graph inside .uns['neighbors']; newer versions keep it in .obsp
    # and only record the key there, so fall back to that location when the old one is absent
    try:
        adjacency = adata.uns['neighbors']['connectivities']
    except KeyError:
        conn_key = adata.uns.get('neighbors', {}).get('connectivities_key', 'connectivities')
        adjacency = adata.obsp[conn_key]

    sources, targets = adjacency.nonzero()
    # flatten to 1-D whether indexing returned an np.matrix or an ndarray
    weights = np.asarray(adjacency[sources, targets]).ravel()

    g = ig.Graph(directed=directed)
    g.add_vertices(adjacency.shape[0])  # this adds adjacency.shape[0] vertices
    g.add_edges(list(zip(sources, targets)))
    try:
        g.es['weight'] = weights
    except Exception as err:
        # exporting an unweighted graph is still useful, but do not hide the failure
        warnings.warn('Could not attach edge weights to the exported graph: {}'.format(err))
    if g.vcount() != adjacency.shape[0]:
        logg.warn('The constructed graph has only {} nodes. '
                  'Your adjacency matrix contained redundant nodes.'
                  .format(g.vcount()))
    g.write_graphml(filename)


def import_pajek_xy(adata, filename='test.net'):
    '''
    Load node xy coordinates from a Pajek *.net file into adata.obsm['X_draw_graph_fa'].

    The file is read as: one header line, then one line per node, then a line starting with
    "*Edges". On each node line, space-separated fields 2 and 3 are taken as x and y.
    Returns adata.
    '''

    # read the whole *.net file once
    with open(filename,'r') as handle:
        all_lines = handle.readlines()

    # the node count follows from the position of the "*Edges" line (last one wins)
    n_nodes = 0
    for line_no, text in enumerate(all_lines):
        if text.startswith("*Edges"):
            n_nodes = line_no-1

    # extract xy coordinates from the node lines (the header line is skipped)
    node_lines = all_lines[1:n_nodes+1]
    coords = np.empty((n_nodes,2))
    for row, text in enumerate(node_lines):
        fields = text.split(' ')
        coords[row,0] = float(fields[2])
        coords[row,1] = float(fields[3])

    # generate ForceAtlas2 data structures, then overwrite the layout with the imported one
    sc.tl.draw_graph(adata, layout='fa', iterations=1)
    adata.obsm['X_draw_graph_fa'] = coords

    return adata


# CLASSIFICATION

def train_classifiers(X, labels, PCs, gene_ind):
    '''
    Trains a series of machine learning classifiers to associate individual cells with class labels.
    Does so in a low-dimensional PCA representation of the data (PCs) over pre-defined genes (gene_ind).

    Each classifier is first fit on a random half of the cells and scored on the other half
    (the score is printed), then refit on all cells. Returns a dictionary with the class labels,
    the refit classifiers, their held-out scores, the PC loadings and the gene indices.
    '''

    # The classifiers are reached as attributes of the sklearn package (sklearn.neighbors,
    # sklearn.svm, sklearn.tree, sklearn.ensemble, sklearn.neural_network, sklearn.naive_bayes,
    # sklearn.discriminant_analysis, sklearn.model_selection), so those submodules must be loaded

    # Restrict to the selected genes and project the cells onto the matching PC loadings
    X_PCA = np.matmul(X[:,gene_ind], PCs[gene_ind,:])

    # Classifier names paired with their configured estimators
    named_classifiers = [
        ('NearestNeighbors', sklearn.neighbors.KNeighborsClassifier(20, weights='distance', metric='correlation')),
        ('SVM-Linear', sklearn.svm.SVC(kernel='linear', gamma='scale', C=1, random_state=802)),
        ('SVM-RBF', sklearn.svm.SVC(kernel='rbf', gamma='scale', C=1, random_state=802)),
        ('DecisionTree', sklearn.tree.DecisionTreeClassifier(random_state=802)),
        ('RandomForest', sklearn.ensemble.RandomForestClassifier(n_estimators=200, random_state=802)),
        ('NeuralNet', sklearn.neural_network.MLPClassifier(random_state=802)),
        ('NaiveBayes', sklearn.naive_bayes.GaussianNB()),
        ('LDA', sklearn.discriminant_analysis.LinearDiscriminantAnalysis()),
    ]

    # Hold out half of the cells for scoring
    X_train, X_test, labels_train, labels_test = sklearn.model_selection.train_test_split(X_PCA, labels, test_size=0.5, random_state=802)

    # Score each classifier on the held-out half, then refit the same estimator on all cells
    score_by_name = {}
    ClassifierDict = {}
    for name, clf in named_classifiers:
        score = clf.fit(X_train, labels_train).score(X_test, labels_test)
        score_by_name[name] = score
        print(name,round(score,3))
        ClassifierDict[name] = clf.fit(X_PCA, labels)

    # Export classifier dictionary and subspace projection objects
    return {'Classes' : np.unique(labels),
            'Classifiers' : ClassifierDict,
            'Classifier_Scores' : score_by_name,
            'PC_Loadings' : PCs,
            'Gene_Ind' : gene_ind}


def predict_classes(adata, Classifier):
    '''
    '''
    X = adata.X
    X[np.isnan(X)]=0
    PCs = Classifier['PC_Loadings']
    gene_ind = Classifier['Gene_Ind']

    # First check to see if genes match between adata and Classifier
    adata_genes = np.array(adata.var.index)
    classifier_genes = np.array(gene_ind.index)
    if len(classifier_genes)==len(adata_genes):
        if (classifier_genes==adata_genes).all():
            # Subset by gene indices; project X into PCA subspace
            X_ind = X[:,gene_ind]
            PCs_ind = PCs[gene_ind,:]
            X_PCA = np.matmul(X_ind,PCs_ind)

    else:
        # Match highly variable classifier genes to adata genes, correcting for case
        adata_genes = np.array([x.upper() for x in adata_genes])
        classifier_genes = np.array([x.upper() for x in np.array(classifier_genes[gene_ind])])
        # Get overlap
        gene_overlap, dataset_ind, classifier_ind = np.intersect1d(adata_genes,classifier_genes,return_indices=True)
        # Subset by gene indices; project X into PCA subspace
        PCs_ind = PCs[gene_ind,:]
        PCs_ind = PCs_ind[classifier_ind,:]
        X_ind = X[:,dataset_ind]
        X_PCA = np.matmul(X_ind,PCs_ind)