diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 869ce5c..28eb579 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: - name: Setup python uses: actions/setup-python@v1 with: - python-version: '3.8' + python-version: '3.9.22' architecture: x64 - name: Install dependencies run: pip install -r dev-requirements.txt @@ -22,7 +22,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: [3.8, 3.9 ] + python: [3.9.22, 3.10.17 ] os: [ubuntu-20.04] name: Test on Python ${{ matrix.python }} steps: diff --git a/kb_python/bins/darwin/bustools/bustools b/kb_python/bins/darwin/bustools/bustools index b7a3152..78d8e24 100755 Binary files a/kb_python/bins/darwin/bustools/bustools and b/kb_python/bins/darwin/bustools/bustools differ diff --git a/kb_python/bins/darwin/m1/bustools/bustools b/kb_python/bins/darwin/m1/bustools/bustools index b7a3152..78d8e24 100755 Binary files a/kb_python/bins/darwin/m1/bustools/bustools and b/kb_python/bins/darwin/m1/bustools/bustools differ diff --git a/kb_python/bins/linux/bustools/bustools b/kb_python/bins/linux/bustools/bustools index 038d41a..069081a 100755 Binary files a/kb_python/bins/linux/bustools/bustools and b/kb_python/bins/linux/bustools/bustools differ diff --git a/kb_python/bins/windows/bustools/bustools.exe b/kb_python/bins/windows/bustools/bustools.exe index df5ce36..22d731c 100755 Binary files a/kb_python/bins/windows/bustools/bustools.exe and b/kb_python/bins/windows/bustools/bustools.exe differ diff --git a/kb_python/config.py b/kb_python/config.py index f4b1919..df843ef 100755 --- a/kb_python/config.py +++ b/kb_python/config.py @@ -127,6 +127,7 @@ class Technology(NamedTuple): '10XFB', '10x Feature Barcode', ngs.chemistry.get_chemistry('10xFBonly'), False ), + Technology('10XV4', '10x version 4', ngs.chemistry.get_chemistry('10xv4')), Technology('CELSEQ', 'CEL-Seq', ngs.chemistry.get_chemistry('celseq')), Technology( 'CELSEQ2', 'CEL-SEQ version 2', 
ngs.chemistry.get_chemistry('celseq2') @@ -168,7 +169,8 @@ class Technology(NamedTuple): ), Technology('Visium', '10x Visium', ngs.chemistry.get_chemistry('visium')), Technology( - 'SPLIT-SEQ', 'SPLiT-seq', ngs.chemistry.get_chemistry('split-seq') + 'SPLIT-SEQ', 'SPLiT-seq (version 2)', + ngs.chemistry.get_chemistry('split-seq') ), ], key=lambda t: t.name) diff --git a/kb_python/count.py b/kb_python/count.py index 9c105b3..0c6166a 100755 --- a/kb_python/count.py +++ b/kb_python/count.py @@ -87,6 +87,23 @@ INSPECT_PARSER = re.compile(r'^.*?(?P[0-9]+)') +def make_transcript_t2g(txnames_path: str, out_path: str) -> str: + """Make a two-column t2g file from a transcripts file + + Args: + txnames_path: Path to transcripts.txt + out_path: Path to output t2g file + + Returns: + Path to output t2g file + """ + with open_as_text(txnames_path, 'r') as f, open_as_text(out_path, + 'w') as out: + for line in f: + out.write(f'{line.strip()}\t{line.strip()}\n') + return out_path + + def kallisto_bus( fastqs: Union[List[str], str], index_path: str, @@ -164,7 +181,11 @@ def kallisto_bus( command += ['-i', index_path] command += ['-o', out_dir] if not demultiplexed: - command += ['-x', technology] + if technology.upper() == "10XV4": + # TODO: REMOVE THIS WHEN KALLISTO IS UPDATED + command += ['-x', "10XV3"] + else: + command += ['-x', technology] elif technology[0] == '-': # User supplied a custom demuxed (no-barcode) technology command += ['-x', technology] @@ -412,7 +433,8 @@ def bustools_correct( bus_path: str, out_path: str, whitelist_path: str, - replace: bool = False + replace: bool = False, + exact_barcodes: bool = False ) -> Dict[str, str]: """Runs `bustools correct`. 
@@ -421,6 +443,7 @@ def bustools_correct( out_path: Path to output corrected BUS file whitelist_path: Path to whitelist replace: If whitelist is a replacement file, defaults to `False` + exact_barcodes: Use exact matching for 'correction', defaults to `False` Returns: Dictionary containing path to generated index @@ -436,6 +459,8 @@ def bustools_correct( command += [bus_path] if replace: command += ['--replace'] + if exact_barcodes: + command += ['--nocorrect'] run_executable(command) return {'bus': out_path} @@ -1214,6 +1239,7 @@ def count( no_jump: bool = False, quant_umis: bool = False, keep_flags: bool = False, + exact_barcodes: bool = False, ) -> Dict[str, Union[str, Dict[str, str]]]: """Generates count matrices for single-cell RNA seq. @@ -1286,6 +1312,7 @@ def count( no_jump: Disable pseudoalignment "jumping", defaults to `False` quant_umis: Whether to run quant-tcc when there are UMIs, defaults to `False` keep_flags: Preserve flag column when sorting BUS file, defaults to `False` + exact_barcodes: Use exact match for 'correcting' barcodes to on-list, defaults to `False` Returns: Dictionary containing paths to generated files @@ -1349,6 +1376,10 @@ def count( ) unfiltered_results.update(bus_result) + if t2g_path.upper() == "NONE": + tmp_t2g = os.path.join(temp_dir, "t2g.txt") + t2g_path = make_transcript_t2g(bus_result['txnames'], tmp_t2g) + sort_result = bustools_sort( bus_result['bus'], os.path.join( @@ -1388,7 +1419,7 @@ def count( update_filename( os.path.basename(prev_result['bus']), CORRECT_CODE ) - ), whitelist_path + ), whitelist_path, False, exact_barcodes ) prev_result = bustools_sort( prev_result['bus'], @@ -1757,6 +1788,7 @@ def count_nac( no_jump: bool = False, quant_umis: bool = False, keep_flags: bool = False, + exact_barcodes: bool = False, ) -> Dict[str, Union[Dict[str, str], str]]: """Generates RNA velocity matrices for single-cell RNA seq. 
@@ -1826,6 +1858,7 @@ def count_nac( no_jump: Disable pseudoalignment "jumping", defaults to `False` quant_umis: Whether to run quant-tcc when there are UMIs, defaults to `False` keep_flags: Preserve flag column when sorting BUS file, defaults to `False` + exact_barcodes: Use exact match for 'correcting' barcodes to on-list, defaults to `False` Returns: Dictionary containing path to generated index @@ -1886,6 +1919,10 @@ def count_nac( ) unfiltered_results.update(bus_result) + if t2g_path.upper() == "NONE": + tmp_t2g = os.path.join(temp_dir, "t2g.txt") + t2g_path = make_transcript_t2g(bus_result['txnames'], tmp_t2g) + sort_result = bustools_sort( bus_result['bus'], os.path.join( @@ -1926,7 +1963,7 @@ def count_nac( update_filename( os.path.basename(sort_result['bus']), CORRECT_CODE ) - ), whitelist_path + ), whitelist_path, False, exact_barcodes ) prev_result = bustools_sort( prev_result['bus'], diff --git a/kb_python/main.py b/kb_python/main.py index 026fc5f..25610c1 100755 --- a/kb_python/main.py +++ b/kb_python/main.py @@ -639,7 +639,8 @@ def parse_count( union=args.union, no_jump=args.no_jump, quant_umis=args.quant_umis, - keep_flags=args.keep_flags + keep_flags=args.keep_flags, + exact_barcodes=args.exact_barcodes ) elif args.workflow in {'nucleus', 'lamanno'}: # Smartseq can not be used with lamanno or nucleus. 
@@ -762,7 +763,8 @@ def parse_count( union=args.union, no_jump=args.no_jump, quant_umis=args.quant_umis, - keep_flags=args.keep_flags + keep_flags=args.keep_flags, + exact_barcodes=args.exact_barcodes ) @@ -1241,6 +1243,11 @@ def setup_count_args( ), type=str ) + parser_count.add_argument( + '--exact-barcodes', + help=('Only exact matches are used for matching barcodes to on-list.'), + action='store_true' + ) parser_count.add_argument( '-r', metavar='REPLACEMENT', diff --git a/setup.py b/setup.py index 7652659..898d48f 100755 --- a/setup.py +++ b/setup.py @@ -37,10 +37,9 @@ def read(path): 'Operating System :: POSIX :: Linux', 'Operating System :: MacOS', 'Operating System :: Microsoft :: Windows', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', 'Topic :: Scientific/Engineering :: Bio-Informatics', 'Topic :: Utilities', ], diff --git a/tests/test_count.py b/tests/test_count.py index c491f2d..d37215e 100755 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -1027,6 +1027,30 @@ def test_convert_transcripts_to_genes(self): line.strip() for line in f if not line.isspace() ]) + def test_make_transcript_t2g(self): + # Create a sample transcripts.txt + txnames_path = os.path.join(self.temp_dir, 'transcripts.txt') + with open(txnames_path, 'w') as f: + f.write('ENST00000335137.4\n') + f.write('ENST00000448914.6\n') + + # Define output path + out_path = os.path.join(self.temp_dir, 't2g.txt') + + # Call function + result_path = count.make_transcript_t2g(txnames_path, out_path) + + # Check return value + self.assertEqual(result_path, out_path) + + # Check file contents + with open(out_path, 'r') as f: + lines = [line.strip() for line in f if line.strip()] + self.assertEqual(lines, [ + 'ENST00000335137.4\tENST00000335137.4', + 'ENST00000448914.6\tENST00000448914.6' + ]) + def test_matrix_to_cellranger(self): 
out_dir = self.temp_dir result = count.matrix_to_cellranger( @@ -1156,7 +1180,7 @@ def test_count_with_whitelist(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -1295,7 +1319,7 @@ def test_count_report(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -1435,7 +1459,7 @@ def test_count_convert(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -1591,7 +1615,7 @@ def test_count_cellranger(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -1757,7 +1781,7 @@ def test_count_filter(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) self.assertEqual(1, bustools_count.call_count) bustools_count.assert_called_once_with( @@ -1904,7 +1928,7 @@ def test_count_without_whitelist(self): self.technology, bus_s_path, out_dir ) bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -2030,7 +2054,7 @@ def test_count_kite_convert(self): ) 
copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -2201,7 +2225,7 @@ def test_count_kite_filter(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) self.assertEqual(1, bustools_count.call_count) bustools_count.assert_called_once_with( @@ -2367,7 +2391,7 @@ def test_count_kite_FB(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scsps_path, @@ -2503,7 +2527,7 @@ def test_count_bulk_multi_paired(self): 'SMARTSEQ2', bus_s_path, out_dir ) bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -2658,7 +2682,7 @@ def test_count_bulk_multi_single(self): 'SMARTSEQ2', bus_s_path, out_dir ) bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, @@ -4077,7 +4101,7 @@ def test_count_strand(self): ) copy_or_create_whitelist.assert_not_called() bustools_correct.assert_called_once_with( - bus_s_path, bus_sc_path, self.whitelist_path + bus_s_path, bus_sc_path, self.whitelist_path, False, False ) bustools_count.assert_called_once_with( bus_scs_path, diff --git a/tests/test_utils.py b/tests/test_utils.py index 0e8a412..d09f4d2 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -55,7 +55,7 @@ def 
test_run_executable_no_wait(self): with mock.patch('kb_python.utils.sp') as sp_mock: sp_mock.Popen().returncode = 0 utils.run_executable(['echo', 'TEST'], wait=False) - sp_mock.Popen().poll.assert_not_called() + sp_mock.Popen().poll.assert_not_called() def test_run_executable_with_stream(self): with mock.patch('kb_python.utils.logger.debug') as debug_mock: @@ -275,9 +275,9 @@ def test_collapse_anndata_by_index(self): pd.testing.assert_index_equal( pd.Index(['c', 'd'], name='gene_id'), collapsed.var.index ) - np.testing.assert_array_equal(np.array([[1, 2], [7, 5]]), collapsed.X.A) + np.testing.assert_array_equal(np.array([[1, 2], [7, 5]]), collapsed.X.toarray()) np.testing.assert_array_equal( - np.array([[13, 8], [19, 11]]), collapsed.layers['layer'].A + np.array([[13, 8], [19, 11]]), collapsed.layers['layer'].toarray() ) def test_collapse_anndata_by_column(self): @@ -303,9 +303,9 @@ def test_collapse_anndata_by_column(self): pd.testing.assert_index_equal( pd.Index(['e', 'f'], name='gene_name'), collapsed.var.index ) - np.testing.assert_array_equal(np.array([[0, 3], [3, 9]]), collapsed.X.A) + np.testing.assert_array_equal(np.array([[0, 3], [3, 9]]), collapsed.X.toarray()) np.testing.assert_array_equal( - np.array([[6, 15], [9, 21]]), collapsed.layers['layer'].A + np.array([[6, 15], [9, 21]]), collapsed.layers['layer'].toarray() ) # def test_collapse_anndata_with_missing(self): @@ -328,9 +328,9 @@ def test_collapse_anndata_by_column(self): # pd.testing.assert_index_equal( # pd.Index(['c'], name='gene_id'), collapsed.var.index # ) -# np.testing.assert_array_equal(np.array([[2], [8]]), collapsed.X.A) +# np.testing.assert_array_equal(np.array([[2], [8]]), collapsed.X.toarray()) # np.testing.assert_array_equal( -# np.array([[14], [20]]), collapsed.layers['layer'].A +# np.array([[14], [20]]), collapsed.layers['layer'].toarray() # ) def test_create_10x_feature_barcode_map(self):