Merge pull request #71 from cleophass/GCI98-python

dedece35 · web-flow · commit 16b22de4dce9 · 2025-07-18T23:12:10.000+02:00
GCI96 PandasRequireUsecolsArgument #Python #DLG #Build
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,7 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- [#71](https://github.com/green-code-initiative/creedengo-python/pull/71) Add rule GCI96 Require Usecols Argument in Pandas Read Functions
 - [#72](https://github.com/green-code-initiative/creedengo-python/pull/72) Add rule GCI97 Optimize square computation (scalar vs vectorized method)
 
 ### Changed
 
diff --git a/src/it/java/org/greencodeinitiative/creedengo/python/integration/tests/GCIRulesIT.java b/src/it/java/org/greencodeinitiative/creedengo/python/integration/tests/GCIRulesIT.java
@@ -274,6 +274,19 @@ void testGCI203_compliant() {
     }
 
     @Test
+    void testGCI96() {
+        String filePath = "src/pandasRequireUsecols.py";
+        String ruleId = "creedengo-python:GCI96";
+        String ruleMsg = "Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns";
+        int[] startLines = new int[]{
+            3, 4, 5, 6, 7, 16, 19
+        };
+        int[] endLines = new int[]{
+            3, 4, 5, 6, 7, 16, 19
+        };
+
+        checkIssuesForFile(filePath, ruleId, ruleMsg, startLines, endLines, SEVERITY, TYPE, EFFORT_10MIN);
+    }
     void testGCI97(){
         String filePath = "src/optimizeSquareComputation.py";
         String ruleId = "creedengo-python:GCI97";
diff --git a/src/it/test-projects/creedengo-python-plugin-test-project/src/pandasRequireUsecols.py b/src/it/test-projects/creedengo-python-plugin-test-project/src/pandasRequireUsecols.py
@@ -0,0 +1,23 @@
+import pandas as pd  # fixture for rule GCI96: reader calls with and without a column selection
+
+df1 = pd.read_csv('data.csv')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df2 = pd.read_parquet('data.parquet')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df3 = pd.read_excel('data.xlsx')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df4 = pd.read_json('data.json')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df5 = pd.read_feather('data.feather')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+
+df7 = pd.read_csv('data.csv', usecols=['col1', 'col2'])  # Compliant: 'usecols' keyword present
+df8 = pd.read_parquet('data.parquet', columns=['col1', 'col2'])  # Compliant: 'columns' keyword present
+df9 = pd.read_excel('data.xlsx', usecols=[0, 1, 2])  # Compliant: 'usecols' given as column indexes
+df10 = pd.read_json('data.json', columns=['col1', 'col2'])  # Compliant for the rule; NOTE(review): pandas.read_json documents no 'columns' parameter - confirm this call is valid
+df11 = pd.read_feather('data.feather', columns=['col1', 'col2'])  # Compliant: 'columns' keyword present
+
+import pandas as pandas_alias  # same module imported under a different alias
+df14 = pandas_alias.read_csv('data.csv')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df15 = pandas_alias.read_csv('data.csv', usecols=['col1'])  # Compliant: 'usecols' keyword present
+
+df16 = pd.read_csv('data.csv', sep=',', header=0)  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df17 = pd.read_csv('data.csv', sep=',', header=0, usecols=['col1', 'col2'])  # Compliant: 'usecols' present among other keywords
+
+cols_to_use = ['col1', 'col2', 'col3']  # column list passed through a variable below
+df18 = pd.read_parquet('data.parquet', columns=cols_to_use)  # Compliant: only the keyword name matters, not its value
diff --git a/src/main/java/org/greencodeinitiative/creedengo/python/PythonRuleRepository.java b/src/main/java/org/greencodeinitiative/creedengo/python/PythonRuleRepository.java
@@ -41,6 +41,7 @@ public class PythonRuleRepository implements RulesDefinition, PythonCustomRuleRe
             AvoidListComprehensionInIterations.class,
             DetectUnoptimizedImageFormat.class,
             AvoidMultipleIfElseStatementCheck.class,
+            PandasRequireUsecolsArgument.class,
             OptimizeSquareComputation.class
     );
 
diff --git a/src/main/java/org/greencodeinitiative/creedengo/python/checks/PandasRequireUsecolsArgument.java b/src/main/java/org/greencodeinitiative/creedengo/python/checks/PandasRequireUsecolsArgument.java
@@ -0,0 +1,77 @@
+/*
+ * creedengo - Python language - Provides rules to reduce the environmental footprint of your Python programs
+ * Copyright © 2024 Green Code Initiative (https://green-code-initiative.org)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.greencodeinitiative.creedengo.python.checks;
+
+import java.util.Arrays;
+import java.util.List;
+import org.sonar.check.Rule;
+import org.sonar.plugins.python.api.PythonSubscriptionCheck;
+import org.sonar.plugins.python.api.SubscriptionContext;
+import org.sonar.plugins.python.api.tree.Argument;
+import org.sonar.plugins.python.api.tree.CallExpression;
+import org.sonar.plugins.python.api.tree.Tree;
+import org.sonar.plugins.python.api.tree.Expression;
+import org.sonar.plugins.python.api.tree.QualifiedExpression;
+import org.sonar.plugins.python.api.tree.RegularArgument;
+import static org.sonar.plugins.python.api.tree.Tree.Kind.*;
+
+/**
+ * Rule GCI96: reports Pandas reader calls that do not restrict the columns being loaded.
+ *
+ * <p>A call is examined when its callee is a qualified expression (for example
+ * {@code pd.read_csv(...)}) whose member name is listed in {@code READ_METHODS}. Only the member
+ * name is matched; the qualifier (module or alias) is not inspected.
+ *
+ * <p>The call is accepted when it passes a {@code usecols} or {@code columns} keyword argument,
+ * or when it forwards arguments through {@code *} / {@code **} unpacking, because the forwarded
+ * content cannot be inspected from the call site.
+ *
+ * <p>NOTE(review): {@code pandas.read_json} does not seem to document a {@code usecols} or
+ * {@code columns} parameter - confirm whether {@code read_json} belongs in {@code READ_METHODS}.
+ */
+@Rule(key = "GCI96")
+public class PandasRequireUsecolsArgument extends PythonSubscriptionCheck {
+
+    /** Message attached to every issue raised by this rule. */
+    public static final String DESCRIPTION = "Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns";
+
+    /** Member names of the reader functions checked by this rule. */
+    private static final List<String> READ_METHODS = Arrays.asList(
+            "read_csv", "read_parquet", "read_excel", "read_feather", "read_json"
+    );
+
+    /** Keyword arguments that select the columns to load. */
+    private static final List<String> COLUMN_KEYWORDS = Arrays.asList("usecols", "columns");
+
+    /**
+     * Subscribes this check to every call expression of the analysed file.
+     *
+     * @param context registration context provided by the analyzer
+     */
+    @Override
+    public void initialize(Context context) {
+        context.registerSyntaxNodeConsumer(Tree.Kind.CALL_EXPR, this::visitCallExpression);
+    }
+
+    /**
+     * Raises an issue on the first token of a reader call that does not select columns.
+     *
+     * @param ctx subscription context whose syntax node is the visited call expression
+     */
+    public void visitCallExpression(SubscriptionContext ctx) {
+        CallExpression callExpression = (CallExpression) ctx.syntaxNode();
+        Expression callee = callExpression.callee();
+
+        // Plain names such as read_csv(...) are not handled, only qualified calls.
+        if (!callee.is(Tree.Kind.QUALIFIED_EXPR)) {
+            return;
+        }
+
+        String methodName = ((QualifiedExpression) callee).name().name();
+        if (!READ_METHODS.contains(methodName)) {
+            return;
+        }
+
+        if (!hasColumnsSpecified(callExpression)) {
+            ctx.addIssue(callExpression.firstToken(), DESCRIPTION);
+        }
+    }
+
+    /**
+     * Tells whether the call selects columns, or may do so through unpacked arguments.
+     *
+     * @param callExpression the reader call to inspect
+     * @return {@code true} when a {@code usecols} / {@code columns} keyword argument is present
+     *         or when an unpacking argument makes the passed arguments unknown; {@code false}
+     *         otherwise
+     */
+    private boolean hasColumnsSpecified(CallExpression callExpression) {
+        for (Argument arg : callExpression.arguments()) {
+            if (arg.is(UNPACKING_EXPR)) {
+                // *args / **kwargs may carry the column selection; stay silent rather than
+                // report a call we cannot evaluate.
+                return true;
+            }
+            if (arg.is(REGULAR_ARGUMENT)) {
+                RegularArgument regularArg = (RegularArgument) arg;
+                // Positional arguments have no keyword and are ignored here.
+                String paramName = regularArg.keywordArgument() != null ? regularArg.keywordArgument().name() : null;
+                if (paramName != null && COLUMN_KEYWORDS.contains(paramName)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+}
diff --git a/src/main/resources/org/greencodeinitiative/creedengo/python/creedengo_way_profile.json b/src/main/resources/org/greencodeinitiative/creedengo/python/creedengo_way_profile.json
@@ -10,8 +10,12 @@
 		"GCI72",
 		"GCI74",
 		"GCI89",
+		"GCI96",
 		"GCI97",
 		"GCI203",
 		"GCI404"
-  ]
+	]
 }
diff --git a/src/test/java/org/greencodeinitiative/creedengo/python/checks/PandasRequireUsecolsArgumentTest.java b/src/test/java/org/greencodeinitiative/creedengo/python/checks/PandasRequireUsecolsArgumentTest.java
@@ -0,0 +1,29 @@
+/*
+ * creedengo - Python language - Provides rules to reduce the environmental footprint of your Python programs
+ * Copyright © 2024 Green Code Initiative (https://green-code-initiative.org)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.greencodeinitiative.creedengo.python.checks;
+
+import org.junit.Test;
+import org.sonar.python.checks.utils.PythonCheckVerifier;
+
+/**
+ * Unit test of {@link PandasRequireUsecolsArgument}: the expected issues are declared inline in
+ * the Python fixture through {@code # Noncompliant} comments.
+ */
+public class PandasRequireUsecolsArgumentTest {
+
+    @Test
+    public void test() {
+        final String fixture = "src/test/resources/checks/pandasRequireUsecolsArgument.py";
+        final PandasRequireUsecolsArgument check = new PandasRequireUsecolsArgument();
+        PythonCheckVerifier.verify(fixture, check);
+    }
+}
diff --git a/src/test/resources/checks/pandasRequireUsecolsArgument.py b/src/test/resources/checks/pandasRequireUsecolsArgument.py
@@ -0,0 +1,23 @@
+import pandas as pd  # fixture for rule GCI96: reader calls with and without a column selection
+
+df1 = pd.read_csv('data.csv')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df2 = pd.read_parquet('data.parquet')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df3 = pd.read_excel('data.xlsx')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df4 = pd.read_json('data.json')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df5 = pd.read_feather('data.feather')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+
+df7 = pd.read_csv('data.csv', usecols=['col1', 'col2'])  # Compliant: 'usecols' keyword present
+df8 = pd.read_parquet('data.parquet', columns=['col1', 'col2'])  # Compliant: 'columns' keyword present
+df9 = pd.read_excel('data.xlsx', usecols=[0, 1, 2])  # Compliant: 'usecols' given as column indexes
+df10 = pd.read_json('data.json', columns=['col1', 'col2'])  # Compliant for the rule; NOTE(review): pandas.read_json documents no 'columns' parameter - confirm this call is valid
+df11 = pd.read_feather('data.feather', columns=['col1', 'col2'])  # Compliant: 'columns' keyword present
+
+import pandas as pandas_alias  # same module imported under a different alias
+df14 = pandas_alias.read_csv('data.csv')  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df15 = pandas_alias.read_csv('data.csv', usecols=['col1'])  # Compliant: 'usecols' keyword present
+
+df16 = pd.read_csv('data.csv', sep=',', header=0)  # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame using Pandas to load only necessary columns}}
+df17 = pd.read_csv('data.csv', sep=',', header=0, usecols=['col1', 'col2'])  # Compliant: 'usecols' present among other keywords
+
+cols_to_use = ['col1', 'col2', 'col3']  # column list passed through a variable below
+df18 = pd.read_parquet('data.parquet', columns=cols_to_use)  # Compliant: only the keyword name matters, not its value