SONARPY-1907 implement Rule S6983 : The num_workers parameter should be specified for torch.utils.data.DataLoader (#1955)

Seppli11 · web-flow · commit 5192d901cca2 · 2024-09-16T15:15:52.000+02:00
* SONARPY-1907 add metadata * SONARPY-1907 implement Rule S6983 : The num_workers parameter should be specified for torch.utils.data.DataLoader * SONARPY-1907 update expected raised issues for S6983 * SONARPY-1907 small change in accordance with PR
diff --git a/its/ruling/src/test/resources/expected/python-S6983.json b/its/ruling/src/test/resources/expected/python-S6983.json
@@ -0,0 +1,21 @@
+{
+  "project:pecos/examples/MACLR/evaluate.py": [
+    86,
+    112
+  ],
+  "project:pecos/examples/MACLR/main.py": [
+    227,
+    231,
+    307
+  ],
+  "project:pecos/examples/giant-xrt/OGB_baselines/ogbn-papers100M/mlp_sgc.py": [
+    118,
+    119,
+    120
+  ],
+  "project:pecos/examples/giant-xrt/OGB_baselines/ogbn-papers100M/mlp_xrt.py": [
+    127,
+    128,
+    129
+  ]
+}
diff --git a/python-checks/src/main/java/org/sonar/python/checks/CheckList.java b/python-checks/src/main/java/org/sonar/python/checks/CheckList.java
@@ -311,6 +311,7 @@ public static Iterable<Class> getChecks() {
       PublicApiIsSecuritySensitiveCheck.class,
       PubliclyWritableDirectoriesCheck.class,
       PublicNetworkAccessToCloudResourcesCheck.class,
+      PyTorchDataLoaderNumWorkersCheck.class,
       PytzTimeZoneInDatetimeConstructorCheck.class,
       RaiseOutsideExceptCheck.class,
       RandomSeedCheck.class,
diff --git a/python-checks/src/main/java/org/sonar/python/checks/PyTorchDataLoaderNumWorkersCheck.java b/python-checks/src/main/java/org/sonar/python/checks/PyTorchDataLoaderNumWorkersCheck.java
@@ -0,0 +1,61 @@
+/*
+ * SonarQube Python Plugin
+ * Copyright (C) 2011-2024 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.python.checks;
+
+import java.util.List;
+import org.sonar.check.Rule;
+import org.sonar.plugins.python.api.PythonSubscriptionCheck;
+import org.sonar.plugins.python.api.symbols.Symbol;
+import org.sonar.plugins.python.api.tree.Argument;
+import org.sonar.plugins.python.api.tree.CallExpression;
+import org.sonar.plugins.python.api.tree.Tree;
+import org.sonar.plugins.python.api.tree.UnpackingExpression;
+import org.sonar.python.tree.TreeUtils;
+
+/**
+ * Rule S6983: reports calls to {@code torch.utils.data.DataLoader} that do not specify the
+ * {@code num_workers} parameter, neither by keyword nor as the positional argument at index
+ * {@link #NUM_WORKERS_ARG_POSITION}.
+ *
+ * <p>Calls that contain an unpacking argument ({@code *args} or {@code **kwargs}) are not reported;
+ * the unpacked value may already provide the parameter.
+ */
+@Rule(key = "S6983")
+public class PyTorchDataLoaderNumWorkersCheck extends PythonSubscriptionCheck {
+  private static final String TORCH_UTILS_DATA_DATA_LOADER = "torch.utils.data.DataLoader";
+  public static final String MESSAGE = "Specify the `num_workers` parameter.";
+  public static final String NUM_WORKERS_ARG_NAME = "num_workers";
+  /** Zero-based positional index of {@code num_workers}, i.e. the 6th positional argument. */
+  public static final int NUM_WORKERS_ARG_POSITION = 5;
+
+  /**
+   * Subscribes to call expressions and raises an issue on the callee of every
+   * {@code DataLoader} call that omits {@code num_workers}.
+   *
+   * @param context the subscription context used to register the syntax node consumer
+   */
+  @Override
+  public void initialize(Context context) {
+    context.registerSyntaxNodeConsumer(Tree.Kind.CALL_EXPR, ctx -> {
+      CallExpression callExpression = (CallExpression) ctx.syntaxNode();
+      if (!isDataLoaderCall(callExpression)) {
+        return;
+      }
+      List<Argument> arguments = callExpression.arguments();
+      if (isNumWorkersArgMissing(arguments) && !isUnpackArgPresent(arguments)) {
+        ctx.addIssue(callExpression.callee(), MESSAGE);
+      }
+    });
+  }
+
+  /** Returns true when the callee resolves to a symbol whose fully qualified name is the DataLoader class. */
+  private static boolean isDataLoaderCall(CallExpression callExpression) {
+    Symbol calleeSymbol = callExpression.calleeSymbol();
+    // The callee has no symbol for e.g. an immediately invoked lambda.
+    return calleeSymbol != null && TORCH_UTILS_DATA_DATA_LOADER.equals(calleeSymbol.fullyQualifiedName());
+  }
+
+  /** Returns true when {@code num_workers} is given neither by keyword nor at its positional index. */
+  private static boolean isNumWorkersArgMissing(List<Argument> arguments) {
+    return TreeUtils.nthArgumentOrKeyword(NUM_WORKERS_ARG_POSITION, NUM_WORKERS_ARG_NAME, arguments) == null;
+  }
+
+  /** Returns true when at least one argument is an unpacking expression. */
+  private static boolean isUnpackArgPresent(List<Argument> arguments) {
+    return arguments.stream().anyMatch(UnpackingExpression.class::isInstance);
+  }
+}
diff --git a/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S6983.html b/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S6983.html
@@ -0,0 +1,39 @@
+<p>This rule raises an issue when a <code>torch.utils.data.DataLoader</code> is instantiated without specifying the <code>num_workers</code>
+parameter.</p>
+<h2>Why is this an issue?</h2>
+<p>In the PyTorch library, the data loaders are used to provide an interface where common operations such as batching can be implemented. It is also
+possible to parallelize the data loading process by using multiple worker processes. This can improve performance by increasing the number of batches
+being fetched in parallel, at the cost of higher memory usage. This performance increase can also be attributed to avoiding the Global Interpreter
+Lock (GIL) in the Python interpreter.</p>
+<h2>How to fix it</h2>
+<p>Specify the <code>num_workers</code> parameter when instantiating the <code>torch.utils.data.DataLoader</code> object.</p>
+<p>The default value of <code>0</code> will use the main process to load the data, and might be faster for small datasets that can fit completely in
+memory.</p>
+<p>For larger datasets, it is recommended to use a value of <code>1</code> or higher to parallelize the data loading process.</p>
+<h3>Code examples</h3>
+<h4>Noncompliant code example</h4>
+<pre data-diff-id="1" data-diff-type="noncompliant">
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+
+train_dataset = datasets.MNIST(root='data', train=True, transform=ToTensor())
+train_data_loader = DataLoader(train_dataset, batch_size=32)  # Noncompliant: the num_workers parameter is not specified
+</pre>
+<h4>Compliant solution</h4>
+<pre data-diff-id="1" data-diff-type="compliant">
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+
+train_dataset = datasets.MNIST(root='data', train=True, transform=ToTensor())
+train_data_loader = DataLoader(train_dataset, batch_size=32, num_workers=4)
+</pre>
+<h2>Resources</h2>
+<h3>Documentation</h3>
+<ul>
+  <li> PyTorch documentation - <a href="https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading">Single- and Multi-process
+  Data Loading</a> </li>
+  <li> PyTorch documentation - <a href="https://pytorch.org/tutorials/beginner/basics/data_tutorial.html">Datasets and DataLoaders</a> </li>
+</ul>
+
diff --git a/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S6983.json b/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S6983.json
@@ -0,0 +1,24 @@
+{
+  "title": "The \"num_workers\" parameter should be specified for \"torch.utils.data.DataLoader\"",
+  "type": "CODE_SMELL",
+  "status": "ready",
+  "remediation": {
+    "func": "Constant\/Issue",
+    "constantCost": "2min"
+  },
+  "tags": [
+    "pytorch",
+    "machine-learning"
+  ],
+  "defaultSeverity": "Minor",
+  "ruleSpecification": "RSPEC-6983",
+  "sqKey": "S6983",
+  "scope": "All",
+  "quickfix": "targeted",
+  "code": {
+    "impacts": {
+      "RELIABILITY": "LOW"
+    },
+    "attribute": "COMPLETE"
+  }
+}
diff --git a/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json b/python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json
@@ -246,6 +246,7 @@
     "S6972",
     "S6973",
     "S6974",
-    "S6979"
+    "S6979",
+    "S6983"
   ]
 }
diff --git a/python-checks/src/test/java/org/sonar/python/checks/PyTorchDataLoaderNumWorkersCheckTest.java b/python-checks/src/test/java/org/sonar/python/checks/PyTorchDataLoaderNumWorkersCheckTest.java
@@ -0,0 +1,31 @@
+/*
+ * SonarQube Python Plugin
+ * Copyright (C) 2011-2024 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.python.checks;
+
+
+import org.junit.jupiter.api.Test;
+import org.sonar.python.checks.utils.PythonCheckVerifier;
+
+class PyTorchDataLoaderNumWorkersCheckTest {
+  @Test
+  void test() {
+    PythonCheckVerifier.verify("src/test/resources/checks/pyTorchDataLoaderNumWorkersCheck.py", new PyTorchDataLoaderNumWorkersCheck());
+  }
+}
diff --git a/python-checks/src/test/resources/checks/pyTorchDataLoaderNumWorkersCheck.py b/python-checks/src/test/resources/checks/pyTorchDataLoaderNumWorkersCheck.py
@@ -0,0 +1,44 @@
+# Fixture for PyTorchDataLoaderNumWorkersCheck (rule S6983): DataLoader calls with and without num_workers.
+from torch.utils.data import DataLoader
+from torch.utils.data import DataLoader as AliasedDataLoader
+import torch.utils.data
+import os
+
+train_dataset = ...
+
+# Calls that omit num_workers: an issue is expected on the callee, whichever way DataLoader is referenced.
+noncomp = DataLoader(dataset=train_dataset, batch_size=32) # Noncompliant {{Specify the `num_workers` parameter.}}
+         #^^^^^^^^^^
+
+noncomp = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32) # Noncompliant
+         #^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+noncomp = AliasedDataLoader(dataset=train_dataset, batch_size=32) # Noncompliant
+         #^^^^^^^^^^^^^^^^^
+
+noncomp = DataLoader() # Noncompliant
+
+# num_workers given by keyword or as the 6th positional argument: no issue expected.
+comp1 = DataLoader(dataset=train_dataset, batch_size=32, num_workers=len(train_dataset) / os.cpu_count())
+comp2 = DataLoader(dataset=train_dataset, batch_size=32, num_workers=0)
+comp3 = DataLoader(dataset=train_dataset, batch_size=32, num_workers=1)
+comp4 = DataLoader(train_dataset, 32, False, False, False, 3) # the num_workers is the 6th arg, and in this case `3`
+comp5 = DataLoader(train_dataset, 32, False, False, False, 3, False)
+
+# Keyword unpacking (**) in the call: no issue expected.
+dict = {"someStuff":4}
+comp5 = DataLoader(**dict)
+comp6 = DataLoader(dataset=train_dataset, **dict)
+comp7 = DataLoader(**{"someStuff": 3})
+
+# Positional unpacking (*) in the call: no issue expected.
+list = [1, 2, 3, 4, 5, 6]
+comp8 = DataLoader(*list)
+comp8 = DataLoader(dataset=train_dataset, *list)
+comp9 = DataLoader(*[1, 2, 3])
+
+# An explicit num_workers=None still counts as specifying the parameter: no issue expected.
+comp10 = DataLoader(dataset=train_dataset, num_workers=None)
+
+class SubDataLoader(DataLoader):
+    pass
+
+# this should raise an issue but this is currently not supported
+comp10 = SubDataLoader()
+
+# checks coverage for if the symbol is null
+(lambda x: x)(2)

Original file line number	Diff line number	Diff line change
`@@ -246,6 +246,7 @@`
`246`	`246`	`"S6972",`
`247`	`247`	`"S6973",`
`248`	`248`	`"S6974",`
`249`		`- "S6979"`
	`249`	`+ "S6979",`
	`250`	`+ "S6983"`
`250`	`251`	`]`
`251`	`252`	`}`