diff --git a/benchmarks/benchmarks/tools.py b/benchmarks/benchmarks/tools.py
index 4cd56180d3..7bcdc56e38 100644
--- a/benchmarks/benchmarks/tools.py
+++ b/benchmarks/benchmarks/tools.py
@@ -55,3 +55,11 @@ def time_rank_genes_groups() -> None:
 
 def peakmem_rank_genes_groups() -> None:
     sc.tl.rank_genes_groups(adata, "bulk_labels", method="wilcoxon")
+
+
+def time_score_genes() -> None:
+    sc.tl.score_genes(adata, adata.var_names[:500])
+
+
+def peakmem_score_genes() -> None:
+    sc.tl.score_genes(adata, adata.var_names[:500])
diff --git a/docs/release-notes/3570.performance.md b/docs/release-notes/3570.performance.md
new file mode 100644
index 0000000000..199e42a93f
--- /dev/null
+++ b/docs/release-notes/3570.performance.md
@@ -0,0 +1,3 @@
+Performance Enhancement: Optimized Mean Calculation in Gene Expression Matrix
+
+Refactored the mean calculation logic in the gene expression matrix to eliminate unnecessary data copying. This optimization significantly improves execution speed, particularly beneficial for large-scale datasets.​
diff --git a/src/scanpy/tools/_score_genes.py b/src/scanpy/tools/_score_genes.py
index f9ac20761a..c352db7310 100644
--- a/src/scanpy/tools/_score_genes.py
+++ b/src/scanpy/tools/_score_genes.py
@@ -6,9 +6,10 @@
 
 import numpy as np
 import pandas as pd
+from numba import prange
 
 from .. import logging as logg
-from .._compat import CSBase, old_positionals
+from .._compat import CSBase, CSCBase, njit, old_positionals
 from .._utils import _check_use_raw, is_backed_type
 from ..get import _get_obs_rep
 
@@ -28,28 +29,74 @@
     _GetSubset = Callable[[_StrIdx], np.ndarray | CSBase]
 
 
+@njit
+def _get_sparce_nanmean_indptr(
+    data: NDArray[np.float64], indptr: NDArray[np.int32], shape: tuple[int, int]
+) -> NDArray[np.float64]:
+    n_rows = len(indptr) - 1
+    result = np.empty(n_rows, dtype=np.float64)
+
+    for i in prange(n_rows):
+        start = indptr[i]
+        end = indptr[i + 1]
+        count = np.float64(shape[1])
+        total = 0.0
+        for j in prange(start, end):
+            val = data[j]
+            if not np.isnan(val):
+                total += val
+            else:
+                count -= 1
+        if count == 0:
+            result[i] = np.nan
+        else:
+            result[i] = total / count
+    return result
+
+
+@njit
+def _get_sparce_nanmean_indices(
+    data: NDArray[np.float64], indices: NDArray[np.int32], shape: tuple
+) -> NDArray[np.float64]:
+    num_bins = shape[1]
+    num_elements = np.float64(shape[0])
+    sum_arr = np.zeros(num_bins, dtype=np.float64)
+    count_arr = np.repeat(num_elements, num_bins)
+    result = np.zeros(num_bins, dtype=np.float64)
+
+    for i in range(data.size):
+        idx = indices[i]
+        val = data[i]
+        if not np.isnan(val):
+            sum_arr[idx] += val
+        else:
+            count_arr[idx] -= 1.0
+
+    for i in range(num_bins):
+        if count_arr[i] == 0:
+            result[i] = np.nan
+        else:
+            result[i] = sum_arr[i] / count_arr[i]
+    return result
+
+
 def _sparse_nanmean(X: CSBase, axis: Literal[0, 1]) -> NDArray[np.float64]:
     """np.nanmean equivalent for sparse matrices."""
     if not isinstance(X, CSBase):
         msg = "X must be a compressed sparse matrix"
         raise TypeError(msg)
-
-    # count the number of nan elements per row/column (dep. on axis)
-    Z = X.copy()
-    Z.data = np.isnan(Z.data)
-    Z.eliminate_zeros()
-    n_elements = Z.shape[axis] - Z.sum(axis)
-
-    # set the nans to 0, so that a normal .sum() works
-    Y = X.copy()
-    Y.data[np.isnan(Y.data)] = 0
-    Y.eliminate_zeros()
-
-    # the average
-    s = Y.sum(axis, dtype="float64")  # float64 for score_genes function compatibility)
-    m = s / n_elements
-
-    return m
+    algo_shape = X.shape
+    algo_axis = axis
+    # in CSC ans CSR we have "transposed" form of data storaging (indices is colums/rows, indptr is row/columns)
+    # as a result, algorythm for CSC is algorythm for CSR but with transposed shape (columns in CSC is equal rows in CSR)
+    # base algo for CSR, for csc we should "transpose" matrix size and use same logics
+    if isinstance(X, CSCBase):
+        algo_shape = X.shape[::-1]
+        algo_axis = int(not axis)
+    if algo_axis == 1:
+        return _get_sparce_nanmean_indptr(X.data, X.indptr, algo_shape)
+    else:
+        return _get_sparce_nanmean_indices(X.data, X.indices, algo_shape)
 
 
 @old_positionals(