pachterlab · Yenaled · Apr 25, 2025 · Apr 25, 2025 · Apr 25, 2025 · Apr 25, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Setup python
         uses: actions/setup-python@v1
         with:
-          python-version: '3.8'
+          python-version: '3.9.22'
           architecture: x64
       - name: Install dependencies
         run: pip install -r dev-requirements.txt
@@ -22,7 +22,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [3.8, 3.9 ]
+        python: [3.9.22, 3.10.17 ]
         os: [ubuntu-20.04]
     name: Test on Python ${{ matrix.python }}
     steps:

diff --git a/kb_python/bins/darwin/bustools/bustools b/kb_python/bins/darwin/bustools/bustools
diff --git a/kb_python/bins/darwin/m1/bustools/bustools b/kb_python/bins/darwin/m1/bustools/bustools
diff --git a/kb_python/bins/linux/bustools/bustools b/kb_python/bins/linux/bustools/bustools
diff --git a/kb_python/bins/windows/bustools/bustools.exe b/kb_python/bins/windows/bustools/bustools.exe
diff --git a/kb_python/config.py b/kb_python/config.py
@@ -127,6 +127,7 @@ class Technology(NamedTuple):
         '10XFB', '10x Feature Barcode',
         ngs.chemistry.get_chemistry('10xFBonly'), False
     ),
+    Technology('10XV4', '10x version 4', ngs.chemistry.get_chemistry('10xv4')),
     Technology('CELSEQ', 'CEL-Seq', ngs.chemistry.get_chemistry('celseq')),
     Technology(
         'CELSEQ2', 'CEL-SEQ version 2', ngs.chemistry.get_chemistry('celseq2')
@@ -168,7 +169,8 @@ class Technology(NamedTuple):
     ),
     Technology('Visium', '10x Visium', ngs.chemistry.get_chemistry('visium')),
     Technology(
-        'SPLIT-SEQ', 'SPLiT-seq', ngs.chemistry.get_chemistry('split-seq')
+        'SPLIT-SEQ', 'SPLiT-seq (version 2)',
+        ngs.chemistry.get_chemistry('split-seq')
     ),
 ],
                       key=lambda t: t.name)

diff --git a/kb_python/count.py b/kb_python/count.py
@@ -87,6 +87,23 @@
 INSPECT_PARSER = re.compile(r'^.*?(?P<count>[0-9]+)')
 
 
+def make_transcript_t2g(txnames_path: str, out_path: str) -> str:
+    """Make a two-column t2g file that maps each transcript name to itself.
+
+    Args:
+        txnames_path: Path to transcripts.txt; blank lines are skipped
+        out_path: Path to output t2g file
+
+    Returns:
+        Path to output t2g file
+    """
+    with open_as_text(txnames_path, 'r') as f, open_as_text(out_path,
+                                                            'w') as out:
+        for name in filter(None, map(str.strip, f)):  # drop blank lines
+            out.write(f'{name}\t{name}\n')
+    return out_path
+
+
 def kallisto_bus(
     fastqs: Union[List[str], str],
     index_path: str,
@@ -164,7 +181,11 @@
     command += ['-i', index_path]
     command += ['-o', out_dir]
     if not demultiplexed:
-        command += ['-x', technology]
+        if technology.upper() == "10XV4":
+            # TODO: Drop this 10XV4 -> 10XV3 fallback once kallisto is updated.
+            command += ['-x', "10XV3"]
+        else:
+            command += ['-x', technology]
     elif technology[0] == '-':
         # User supplied a custom demuxed (no-barcode) technology
         command += ['-x', technology]
@@ -412,7 +433,8 @@
     bus_path: str,
     out_path: str,
     whitelist_path: str,
-    replace: bool = False
+    replace: bool = False,
+    exact_barcodes: bool = False
 ) -> Dict[str, str]:
     """Runs `bustools correct`.
 
@@ -421,6 +443,7 @@
         out_path: Path to output corrected BUS file
         whitelist_path: Path to whitelist
         replace: If whitelist is a replacement file, defaults to `False`
+        exact_barcodes: Use exact matching for 'correction', defaults to `False`
 
     Returns:
         Dictionary containing path to generated index
@@ -436,6 +459,8 @@
     command += [bus_path]
     if replace:
         command += ['--replace']
+    if exact_barcodes:
+        command += ['--nocorrect']
     run_executable(command)
     return {'bus': out_path}
 
@@ -1214,6 +1239,7 @@
     no_jump: bool = False,
     quant_umis: bool = False,
     keep_flags: bool = False,
+    exact_barcodes: bool = False,
 ) -> Dict[str, Union[str, Dict[str, str]]]:
     """Generates count matrices for single-cell RNA seq.
 
@@ -1286,6 +1312,7 @@
         no_jump: Disable pseudoalignment "jumping", defaults to `False`
         quant_umis: Whether to run quant-tcc when there are UMIs, defaults to `False`
         keep_flags: Preserve flag column when sorting BUS file, defaults to `False`
+        exact_barcodes: Use exact match for 'correcting' barcodes to on-list, defaults to `False`
 
     Returns:
         Dictionary containing paths to generated files
@@ -1349,6 +1376,10 @@
         )
     unfiltered_results.update(bus_result)
 
+    if t2g_path.upper() == "NONE":
+        tmp_t2g = os.path.join(temp_dir, "t2g.txt")
+        t2g_path = make_transcript_t2g(bus_result['txnames'], tmp_t2g)
+
     sort_result = bustools_sort(
         bus_result['bus'],
         os.path.join(
@@ -1388,7 +1419,7 @@
                 update_filename(
                     os.path.basename(prev_result['bus']), CORRECT_CODE
                 )
-            ), whitelist_path
+            ), whitelist_path, False, exact_barcodes
         )
         prev_result = bustools_sort(
             prev_result['bus'],
@@ -1757,6 +1788,7 @@
     no_jump: bool = False,
     quant_umis: bool = False,
     keep_flags: bool = False,
+    exact_barcodes: bool = False,
 ) -> Dict[str, Union[Dict[str, str], str]]:
     """Generates RNA velocity matrices for single-cell RNA seq.
 
@@ -1826,6 +1858,7 @@
         no_jump: Disable pseudoalignment "jumping", defaults to `False`
         quant_umis: Whether to run quant-tcc when there are UMIs, defaults to `False`
         keep_flags: Preserve flag column when sorting BUS file, defaults to `False`
+        exact_barcodes: Use exact match for 'correcting' barcodes to on-list, defaults to `False`
 
     Returns:
         Dictionary containing path to generated index
@@ -1886,6 +1919,10 @@
         )
     unfiltered_results.update(bus_result)
 
+    if t2g_path.upper() == "NONE":
+        tmp_t2g = os.path.join(temp_dir, "t2g.txt")
+        t2g_path = make_transcript_t2g(bus_result['txnames'], tmp_t2g)
+
     sort_result = bustools_sort(
         bus_result['bus'],
         os.path.join(
@@ -1926,7 +1963,7 @@
                 update_filename(
                     os.path.basename(sort_result['bus']), CORRECT_CODE
                 )
-            ), whitelist_path
+            ), whitelist_path, False, exact_barcodes
         )
         prev_result = bustools_sort(
             prev_result['bus'],

diff --git a/kb_python/main.py b/kb_python/main.py
@@ -639,7 +639,8 @@ def parse_count(
             union=args.union,
             no_jump=args.no_jump,
             quant_umis=args.quant_umis,
-            keep_flags=args.keep_flags
+            keep_flags=args.keep_flags,
+            exact_barcodes=args.exact_barcodes
         )
     elif args.workflow in {'nucleus', 'lamanno'}:
         # Smartseq can not be used with lamanno or nucleus.
@@ -762,7 +763,8 @@ def parse_count(
             union=args.union,
             no_jump=args.no_jump,
             quant_umis=args.quant_umis,
-            keep_flags=args.keep_flags
+            keep_flags=args.keep_flags,
+            exact_barcodes=args.exact_barcodes
         )
 
 
@@ -1241,6 +1243,11 @@ def setup_count_args(
         ),
         type=str
     )
+    parser_count.add_argument(
+        '--exact-barcodes',
+        help=('Only exact matches are used for matching barcodes to on-list.'),
+        action='store_true'
+    )
     parser_count.add_argument(
         '-r',
         metavar='REPLACEMENT',

diff --git a/setup.py b/setup.py
@@ -37,10 +37,9 @@ def read(path):
         'Operating System :: POSIX :: Linux',
         'Operating System :: MacOS',
         'Operating System :: Microsoft :: Windows',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
         'Topic :: Scientific/Engineering :: Bio-Informatics',
         'Topic :: Utilities',
     ],

diff --git a/tests/test_count.py b/tests/test_count.py
@@ -1027,6 +1027,30 @@ def test_convert_transcripts_to_genes(self):
                     line.strip() for line in f if not line.isspace()
                 ])
 
+    def test_make_transcript_t2g(self):
+        # Every transcript name should be mapped onto itself.
+        names = ['ENST00000335137.4', 'ENST00000448914.6']
+        expected = [f'{name}\t{name}' for name in names]
+
+        # Build a small transcripts.txt with one name per line.
+        txnames_path = os.path.join(self.temp_dir, 'transcripts.txt')
+        with open(txnames_path, 'w') as f:
+            for name in names:
+                f.write(f'{name}\n')
+
+        out_path = os.path.join(self.temp_dir, 't2g.txt')
+
+        # The function returns the output path it was handed.
+        returned = count.make_transcript_t2g(txnames_path, out_path)
+        self.assertEqual(returned, out_path)
+
+        # Compare the non-blank, whitespace-stripped rows of the output.
+        with open(out_path, 'r') as f:
+            stripped = [row.strip() for row in f]
+
+        rows = [row for row in stripped if row]
+        self.assertEqual(rows, expected)
+
     def test_matrix_to_cellranger(self):
         out_dir = self.temp_dir
         result = count.matrix_to_cellranger(
@@ -1156,7 +1180,7 @@ def test_count_with_whitelist(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -1295,7 +1319,7 @@ def test_count_report(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -1435,7 +1459,7 @@ def test_count_convert(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -1591,7 +1615,7 @@ def test_count_cellranger(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -1757,7 +1781,7 @@ def test_count_filter(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             self.assertEqual(1, bustools_count.call_count)
             bustools_count.assert_called_once_with(
@@ -1904,7 +1928,7 @@ def test_count_without_whitelist(self):
                 self.technology, bus_s_path, out_dir
             )
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -2030,7 +2054,7 @@ def test_count_kite_convert(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -2201,7 +2225,7 @@ def test_count_kite_filter(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             self.assertEqual(1, bustools_count.call_count)
             bustools_count.assert_called_once_with(
@@ -2367,7 +2391,7 @@ def test_count_kite_FB(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scsps_path,
@@ -2503,7 +2527,7 @@ def test_count_bulk_multi_paired(self):
                 'SMARTSEQ2', bus_s_path, out_dir
             )
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -2658,7 +2682,7 @@ def test_count_bulk_multi_single(self):
                 'SMARTSEQ2', bus_s_path, out_dir
             )
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,
@@ -4077,7 +4101,7 @@ def test_count_strand(self):
             )
             copy_or_create_whitelist.assert_not_called()
             bustools_correct.assert_called_once_with(
-                bus_s_path, bus_sc_path, self.whitelist_path
+                bus_s_path, bus_sc_path, self.whitelist_path, False, False
             )
             bustools_count.assert_called_once_with(
                 bus_scs_path,

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -55,7 +55,7 @@ def test_run_executable_no_wait(self):
         with mock.patch('kb_python.utils.sp') as sp_mock:
             sp_mock.Popen().returncode = 0
             utils.run_executable(['echo', 'TEST'], wait=False)
-            sp_mock.Popen().poll.assert_not_called()
+            sp_mock.Popen().poll.assert_not_called()
 
     def test_run_executable_with_stream(self):
         with mock.patch('kb_python.utils.logger.debug') as debug_mock:
@@ -275,9 +275,9 @@ def test_collapse_anndata_by_index(self):
         pd.testing.assert_index_equal(
             pd.Index(['c', 'd'], name='gene_id'), collapsed.var.index
         )
-        np.testing.assert_array_equal(np.array([[1, 2], [7, 5]]), collapsed.X.A)
+        np.testing.assert_array_equal(np.array([[1, 2], [7, 5]]), collapsed.X.toarray())
         np.testing.assert_array_equal(
-            np.array([[13, 8], [19, 11]]), collapsed.layers['layer'].A
+            np.array([[13, 8], [19, 11]]), collapsed.layers['layer'].toarray()
         )
 
     def test_collapse_anndata_by_column(self):
@@ -303,9 +303,9 @@ def test_collapse_anndata_by_column(self):
         pd.testing.assert_index_equal(
             pd.Index(['e', 'f'], name='gene_name'), collapsed.var.index
         )
-        np.testing.assert_array_equal(np.array([[0, 3], [3, 9]]), collapsed.X.A)
+        np.testing.assert_array_equal(np.array([[0, 3], [3, 9]]), collapsed.X.toarray())
         np.testing.assert_array_equal(
-            np.array([[6, 15], [9, 21]]), collapsed.layers['layer'].A
+            np.array([[6, 15], [9, 21]]), collapsed.layers['layer'].toarray()
         )
 
 #    def test_collapse_anndata_with_missing(self):
@@ -328,9 +328,9 @@ def test_collapse_anndata_by_column(self):
 #        pd.testing.assert_index_equal(
 #            pd.Index(['c'], name='gene_id'), collapsed.var.index
 #        )
-#        np.testing.assert_array_equal(np.array([[2], [8]]), collapsed.X.A)
+#        np.testing.assert_array_equal(np.array([[2], [8]]), collapsed.X.toarray())
 #        np.testing.assert_array_equal(
-#            np.array([[14], [20]]), collapsed.layers['layer'].A
+#            np.array([[14], [20]]), collapsed.layers['layer'].toarray()
 #        )
 
     def test_create_10x_feature_barcode_map(self):