Merge pull request #69 from cleophass/GCI95-python

dedece35 · web-flow · commit 52d81acacf44 · 2025-07-31T22:46:48.000+02:00
GCI99 avoidCSVFormat #Python #DLG #Build
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- [#69](https://github.com/green-code-initiative/creedengo-python/pull/69) Add rule GCI99 Avoid CSV Format
 - [#76](https://github.com/green-code-initiative/creedengo-python/pull/76) Add rule GCI103 Dictionary Items Unused. A rule specifying that dictionary iteration should consider the pertinence of the element used.
 - [#79](https://github.com/green-code-initiative/creedengo-python/pull/79) Add rule GCI106 Avoid SQRT in a loop
 - [#71](https://github.com/green-code-initiative/creedengo-python/pull/71) Add rule GCI96 Require Usecols Argument in Pandas Read Functions
diff --git a/src/it/java/org/greencodeinitiative/creedengo/python/integration/tests/GCIRulesIT.java b/src/it/java/org/greencodeinitiative/creedengo/python/integration/tests/GCIRulesIT.java
@@ -17,15 +17,15 @@
  */
 package org.greencodeinitiative.creedengo.python.integration.tests;
 
-import org.junit.jupiter.api.Test;
-import org.sonarqube.ws.Issues;
-import org.sonarqube.ws.Measures;
+import static java.util.Optional.ofNullable;
+import static org.assertj.core.api.Assertions.assertThat;
 
 import java.util.List;
 import java.util.Map;
 
-import static java.util.Optional.ofNullable;
-import static org.assertj.core.api.Assertions.assertThat;
+import org.junit.jupiter.api.Test;
+import org.sonarqube.ws.Issues;
+import org.sonarqube.ws.Measures;
 
 class GCIRulesIT extends GCIRulesBase {
 
@@ -320,6 +320,21 @@ void testGCI103(){
 
     }
 
+    /** Integration check for GCI99: expects the rule message on the listed lines of src/avoidCSVFormat.py. */
+    @Test
+    void testGCI99(){
+        // Every expected issue starts and ends on the same line, so both bounds hold the same values.
+        int[] startLines = {4, 6, 10, 12, 14, 15, 17, 18, 23, 39, 47, 48};
+        int[] endLines = startLines.clone();
+
+        checkIssuesForFile(
+            "src/avoidCSVFormat.py",
+            "creedengo-python:GCI99",
+            "Use Parquet or Feather format instead of CSV",
+            startLines, endLines, SEVERITY, TYPE, EFFORT_50MIN
+        );
+    }
+
     @Test
     void testGCI106() {
         String filePath = "src/avoidSqrtInLoop.py";
@@ -334,4 +349,6 @@ void testGCI106() {
         checkIssuesForFile(filePath, ruleId, ruleMsg, startLines, endLines, SEVERITY, TYPE, EFFORT_5MIN);
     }
 
+
+
 }
diff --git a/src/it/test-projects/creedengo-python-plugin-test-project/src/avoidCSVFormat.py b/src/it/test-projects/creedengo-python-plugin-test-project/src/avoidCSVFormat.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import pandas as pandas_alias
+# Sample for rule GCI99. testGCI99 pins issue line numbers in this file: keep the line count unchanged.
+df = pd.read_csv('data.csv') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+
+df.to_csv('output.csv') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# Reading a columnar format is the recommended alternative and is not expected to be reported.
+df = pd.read_parquet('data.parquet')
+# A bare '.csv' string literal is reported even outside any pandas call.
+path_to_file = 'MNIST.csv' # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# The module alias is irrelevant: the rule only looks at the called method name.
+df2 = pandas_alias.read_csv('another_data.csv') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# The '.csv' literal and the read_csv call sit on different lines here: one issue per line.
+with open('data.csv') as f: # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+    df3 = pd.read_csv(f) # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# Multi-line call: one issue on the line where the call starts, one on the line holding the literal.
+df4 = pd.read_csv( # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+    'complex_data.csv', # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+    sep=',',
+    header=0
+)
+
+df4.to_csv("output.csv") # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# No CSV call and no '.csv' literal from here to df10: none of these lines is expected to be reported.
+df5 = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+
+other_path = 'data.json'
+
+df6 = pd.read_json(other_path)
+
+df7 = pd.read_feather('features.feather')
+
+df8 = pd.read_parquet("file.parquet")
+
+df9 = pandas_alias.read_feather("something.feather")
+
+df10 = pandas_alias.read_parquet("nested/dir/file.parquet")
+# Call and '.csv' literal on the same line: a single issue is expected for that line.
+result = pd.read_csv("log.csv", encoding='utf-8') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+
+df11 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+df11.to_parquet("output.parquet")
+
+df12 = pd.DataFrame({'x': [5, 6]})
+df12.to_feather("output.feather")
+# Literal assigned first, then passed by name: both the assignment and the call are reported.
+filename = "report.csv" # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+data = pd.read_csv(filename) # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# read_table is not one of the matched method names and the literal holds no '.csv': nothing expected.
+log_file = "logfile.log"
+df13 = pd.read_table(log_file, delimiter='|')
diff --git a/src/main/java/org/greencodeinitiative/creedengo/python/PythonRuleRepository.java b/src/main/java/org/greencodeinitiative/creedengo/python/PythonRuleRepository.java
@@ -44,7 +44,8 @@ public class PythonRuleRepository implements RulesDefinition, PythonCustomRuleRe
             PandasRequireUsecolsArgument.class,
             OptimizeSquareComputation.class,
             AvoidSqrtInLoop.class,
-            DictionaryItemsUnused.class
+            DictionaryItemsUnused.class,
+            AvoidCSVFormat.class
     );
 
     public static final String LANGUAGE = "py";
diff --git a/src/main/java/org/greencodeinitiative/creedengo/python/checks/AvoidCSVFormat.java b/src/main/java/org/greencodeinitiative/creedengo/python/checks/AvoidCSVFormat.java
@@ -0,0 +1,82 @@
+/*
+ * creedengo - Python language - Provides rules to reduce the environmental footprint of your Python programs
+ * Copyright © 2024 Green Code Initiative (https://green-code-initiative.org)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.greencodeinitiative.creedengo.python.checks;
+
+import org.sonar.plugins.python.api.SubscriptionContext;
+import org.sonar.plugins.python.api.tree.CallExpression;
+import org.sonar.plugins.python.api.tree.QualifiedExpression;
+import org.sonar.plugins.python.api.tree.StringLiteral;
+import org.sonar.plugins.python.api.tree.Tree;
+import org.sonar.plugins.python.api.tree.Expression;
+import org.sonar.check.Rule;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.sonar.plugins.python.api.PythonSubscriptionCheck;
+
+/**
+ * GCI99 - flags {@code read_csv}/{@code to_csv} method calls and string literals containing
+ * {@code .csv}, raising at most one issue per line of the analysed file.
+ *
+ * <p>The per-line bookkeeping is reset at every file root (see {@link #initialize}).
+ */
+@Rule(key = "GCI99")
+public class AvoidCSVFormat extends PythonSubscriptionCheck {
+
+    public static final String DESCRIPTION = "Use Parquet or Feather format instead of CSV";
+    protected static final Pattern CSV_EXTENSION = Pattern.compile("\\.csv");
+    /** Lines of the file currently being visited that already carry an issue. */
+    private final Set<Integer> reportedLines = new HashSet<>();
+
+    @Override
+    public void initialize(Context context) {
+        // Line numbers are only meaningful within one file: forget them at each file root so a
+        // line reported in a previous file cannot hide an issue on the same line number here.
+        context.registerSyntaxNodeConsumer(Tree.Kind.FILE_INPUT, ctx -> reportedLines.clear());
+        context.registerSyntaxNodeConsumer(Tree.Kind.CALL_EXPR, this::visitCallExpression);
+        context.registerSyntaxNodeConsumer(Tree.Kind.STRING_LITERAL, this::visitNodeString);
+    }
+
+    /** Reports, on the call's first token, any {@code <expr>.read_csv(...)} or {@code <expr>.to_csv(...)}. */
+    public void visitCallExpression(SubscriptionContext ctx) {
+        CallExpression callExpression = (CallExpression) ctx.syntaxNode();
+        Expression callee = callExpression.callee();
+        if (!callee.is(Tree.Kind.QUALIFIED_EXPR)) {
+            return;
+        }
+        String methodName = ((QualifiedExpression) callee).name().name();
+        boolean csvCall = "read_csv".equals(methodName) || "to_csv".equals(methodName);
+        // Set#add returns false when the line is already present, i.e. already reported.
+        if (csvCall && reportedLines.add(callExpression.firstToken().line())) {
+            ctx.addIssue(callExpression.firstToken(), DESCRIPTION);
+        }
+    }
+
+    /** Reports a string literal whose content contains {@code .csv}, unless its line is already reported. */
+    public void visitNodeString(SubscriptionContext ctx) {
+        StringLiteral stringLiteral = (StringLiteral) ctx.syntaxNode();
+        Matcher matcher = CSV_EXTENSION.matcher(stringLiteral.trimmedQuotesValue());
+        // The line is taken from the literal's first token; it is recorded only when an issue is raised.
+        if (matcher.find() && reportedLines.add(stringLiteral.firstToken().line())) {
+            ctx.addIssue(stringLiteral, DESCRIPTION);
+        }
+    }
+}
diff --git a/src/main/resources/org/greencodeinitiative/creedengo/python/creedengo_way_profile.json b/src/main/resources/org/greencodeinitiative/creedengo/python/creedengo_way_profile.json
@@ -12,6 +12,7 @@
 		"GCI89",
 		"GCI96",
 		"GCI97",
+		"GCI99",
 		"GCI103",
 		"GCI106",
 		"GCI203",
diff --git a/src/test/java/org/greencodeinitiative/creedengo/python/checks/AvoidCSVFormatTest.java b/src/test/java/org/greencodeinitiative/creedengo/python/checks/AvoidCSVFormatTest.java
@@ -0,0 +1,29 @@
+/*
+ * creedengo - Python language - Provides rules to reduce the environmental footprint of your Python programs
+ * Copyright © 2024 Green Code Initiative (https://green-code-initiative.org)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.greencodeinitiative.creedengo.python.checks;
+
+import org.junit.Test;
+import org.sonar.python.checks.utils.PythonCheckVerifier;
+
+public class AvoidCSVFormatTest {
+    /** Runs a fresh {@link AvoidCSVFormat} over the Python fixture file through {@code PythonCheckVerifier}. */
+    @Test
+    public void test() {
+        PythonCheckVerifier.verify("src/test/resources/checks/avoidCSVFormat.py", new AvoidCSVFormat());
+    }
+}
diff --git a/src/test/resources/checks/avoidCSVFormat.py b/src/test/resources/checks/avoidCSVFormat.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import pandas as pandas_alias
+# Sample for rule GCI99, loaded by AvoidCSVFormatTest; the trailing markers presumably state the expected issues.
+df = pd.read_csv('data.csv') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+
+df.to_csv('output.csv') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# Reading a columnar format is the recommended alternative and is not expected to be reported.
+df = pd.read_parquet('data.parquet')
+# A bare '.csv' string literal is reported even outside any pandas call.
+path_to_file = 'MNIST.csv' # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# The module alias is irrelevant: the rule only looks at the called method name.
+df2 = pandas_alias.read_csv('another_data.csv') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# The '.csv' literal and the read_csv call sit on different lines here: one issue per line.
+with open('data.csv') as f: # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+    df3 = pd.read_csv(f) # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# Multi-line call: one issue on the line where the call starts, one on the line holding the literal.
+df4 = pd.read_csv( # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+    'complex_data.csv', # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+    sep=',',
+    header=0
+)
+
+df4.to_csv("output.csv") # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# No CSV call and no '.csv' literal from here to df10: none of these lines is expected to be reported.
+df5 = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+
+other_path = 'data.json'
+
+df6 = pd.read_json(other_path)
+
+df7 = pd.read_feather('features.feather')
+
+df8 = pd.read_parquet("file.parquet")
+
+df9 = pandas_alias.read_feather("something.feather")
+
+df10 = pandas_alias.read_parquet("nested/dir/file.parquet")
+# Call and '.csv' literal on the same line: a single issue is expected for that line.
+result = pd.read_csv("log.csv", encoding='utf-8') # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+
+df11 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+df11.to_parquet("output.parquet")
+
+df12 = pd.DataFrame({'x': [5, 6]})
+df12.to_feather("output.feather")
+# Literal assigned first, then passed by name: both the assignment and the call are reported.
+filename = "report.csv" # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+data = pd.read_csv(filename) # Noncompliant {{Use Parquet or Feather format instead of CSV}}
+# read_table is not one of the matched method names and the literal holds no '.csv': nothing expected.
+log_file = "logfile.log"
+df13 = pd.read_table(log_file, delimiter='|')