Skip to content

Commit bba8ff1

Browse files
committed
feat: added logic for ai/ml usage detection
1 parent 1e35b93 commit bba8ff1

File tree

10 files changed

+315
-2
lines changed

10 files changed

+315
-2
lines changed

package-lock.json

Lines changed: 61 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
"connect-mongo": "^5.1.0",
5252
"cors": "^2.8.5",
5353
"diff2html": "^3.4.33",
54+
"exiftool-vendored": "^29.0.0",
5455
"express": "^4.18.2",
5556
"express-http-proxy": "^2.0.0",
5657
"express-rate-limit": "^7.1.5",

proxy.config.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,11 @@
7777
"block": {
7878
"literals": [],
7979
"patterns": [],
80-
"providers": {}
80+
"providers": {},
81+
"aiMlUsage": {
82+
"enabled": true,
83+
"blockPatterns": ["modelWeights", "largeDatasets", "aiLibraries", "configKeys", "aiFunctions"]
84+
}
8185
}
8286
}
8387
},
src/proxy/processors/push-action/checkForAiMlUsage.js

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
const { Step } = require('../../actions');
2+
const config = require('../../../config');
3+
const commitConfig = config.getCommitConfig();
4+
const file = require('../../../config/file');
5+
const fs = require('fs');
6+
7+
/**
 * Regular expressions used to recognise AI/ML assets in a push.
 *
 * The first two are tested against a file name, the remaining three against
 * file content. Keys correspond to the names listed in the
 * `aiMlUsage.blockPatterns` configuration array.
 */
const FILE_PATTERNS = {
  // Model weight / checkpoint files, recognised purely by extension.
  modelWeights: /\.(h5|pb|pt|ckpt|pkl)$/,

  // Dataset-style files, recognised purely by extension (file size is not
  // inspected). NOTE(review): this also matches every ordinary .json/.csv
  // file such as package.json - confirm that is intended.
  largeDatasets: /\.(csv|json|xlsx)$/,

  // AI/ML libraries and tokenizers pulled in via Python `import x` /
  // `from x import ...` or Node `require('x')` / `require('x/sub')`.
  // The library names are grouped so that the alternation cannot leak out of
  // the `require(...)` branch and match a bare word such as "torch".
  aiLibraries:
    /(?:\b(?:import|from)\s+(?:tensorflow|torch|keras|sklearn|tokenizer)|require\(\s*['"](?:tensorflow|torch|keras|sklearn|tokenizer)(?:\/[^'"]*)?['"]\s*\))/,

  // Hyper-parameter style keys in JSON/YAML, plus the whole word `token`.
  configKeys: /\b(epochs|learning_rate|batch_size|token)\b/,

  // Whole-word AI/ML function or class names, including tokenizer helpers.
  aiFunctionNames: /\b(train_model|predict|evaluate|fit|transform|tokenize|tokenizer)\b/,
};
20+
21+
22+
/**
 * Decide from the file name alone whether a file looks like an AI/ML asset
 * (model weights or a dataset).
 *
 * Only the pattern groups listed in `commitConfig.diff.block.aiMlUsage.blockPatterns`
 * are consulted. A missing `aiMlUsage` section or a non-array `blockPatterns`
 * is treated as "nothing enabled" instead of throwing.
 *
 * @param {string} fileName Base name of the file (no directory part needed).
 * @return {boolean} `true` when an enabled extension pattern matches.
 */
const isAiMlFileByExtension = (fileName) => {
  const aiMlConfig = commitConfig.diff.block.aiMlUsage;
  const enabledPatterns =
    aiMlConfig && Array.isArray(aiMlConfig.blockPatterns) ? aiMlConfig.blockPatterns : [];

  // The extension regexes are lower-case only; normalise the name so that
  // `MODEL.H5` is treated the same as `model.h5`.
  const normalisedName = String(fileName).toLowerCase();

  const extensionGroups = ['modelWeights', 'largeDatasets'];
  return extensionGroups.some(
    (group) => enabledPatterns.includes(group) && FILE_PATTERNS[group].test(normalisedName),
  );
};
37+
38+
/**
 * Decide from a file's text whether it shows signs of AI/ML usage.
 *
 * Checks, in order, the enabled content pattern groups: library imports,
 * configuration keys, then function/class names. Only groups listed in
 * `commitConfig.diff.block.aiMlUsage.blockPatterns` are consulted.
 *
 * The function-name group is enabled by either `'aiFunctionNames'` (the key
 * used in FILE_PATTERNS) or `'aiFunctions'` (the spelling used in the
 * configuration file); previously only the former was recognised, so the
 * configured value never switched the check on.
 *
 * @param {string} fileContent Full text of the file.
 * @return {boolean} `true` when an enabled content pattern matches.
 */
const isAiMlFileByContent = (fileContent) => {
  const aiMlConfig = commitConfig.diff.block.aiMlUsage;
  // A missing section or non-array value means no group is enabled.
  const enabledPatterns =
    aiMlConfig && Array.isArray(aiMlConfig.blockPatterns) ? aiMlConfig.blockPatterns : [];
  const isEnabled = (...names) => names.some((name) => enabledPatterns.includes(name));

  if (isEnabled('aiLibraries') && FILE_PATTERNS.aiLibraries.test(fileContent)) {
    return true;
  }
  if (isEnabled('configKeys') && FILE_PATTERNS.configKeys.test(fileContent)) {
    return true;
  }
  return (
    isEnabled('aiFunctionNames', 'aiFunctions') && FILE_PATTERNS.aiFunctionNames.test(fileContent)
  );
};
58+
59+
60+
/**
 * Check each file path for AI/ML indicators.
 *
 * The returned array is aligned with `filePaths` and uses the polarity the
 * caller relies on: `true` means the file is clean, `false` means an AI/ML
 * indicator was found (by extension or by content).
 *
 * A file whose name matches an enabled extension pattern is flagged without
 * being read. Otherwise the file is read as UTF-8 and its content is checked.
 * If the file cannot be processed (for example it does not exist on disk),
 * the error is logged and the file is reported as clean; previously this
 * path pushed `false`, which made any unreadable file block the push even
 * though the accompanying comment said errors count as "no usage found".
 *
 * @param {string[]} filePaths Paths to inspect, using `/` as separator.
 * @return {Promise<boolean[]>} One flag per path; `false` marks a detection.
 */
const detectAiMlUsageFiles = async (filePaths) => {
  const results = [];

  // Files are processed one at a time so the result order matches the input.
  for (const filePath of filePaths) {
    try {
      const fileName = filePath.split('/').pop();

      // A matching file name is enough; skip the content read.
      if (isAiMlFileByExtension(fileName)) {
        results.push(false);
        continue;
      }

      const content = await fs.promises.readFile(filePath, 'utf8');
      results.push(!isAiMlFileByContent(content));
    } catch (err) {
      console.error(`Error reading file ${filePath}:`, err);
      results.push(true); // Treat errors as no AI/ML usage found
    }
  }

  return results;
};
89+
90+
/**
 * Collect the file paths named in the `diff --git a/<path> b/<path>` header
 * lines of a git diff.
 *
 * The path taken is the one after `a/`. Lines are split on both LF and CRLF:
 * with a plain `\n` split a trailing `\r` stays on each line and the header
 * pattern (whose `.` does not match `\r`) never matches, so CRLF diffs
 * used to yield no paths at all.
 *
 * @param {string} diffContent Raw diff text.
 * @return {string[]} Paths in the order their headers appear; empty when the
 *   input is not a non-empty string or contains no header lines.
 */
const extractFilePathsFromDiff = (diffContent) => {
  if (typeof diffContent !== 'string' || diffContent.length === 0) {
    return [];
  }

  const headerPattern = /^diff --git a\/(.+?) b\/(.+?)$/;
  const filePaths = [];

  for (const line of diffContent.split(/\r?\n/)) {
    const match = headerPattern.exec(line);
    if (match) {
      filePaths.push(match[1]); // Path from the "a/" side of the header
    }
  }

  return filePaths;
};
104+
105+
/**
 * Push-action step that blocks a push when AI/ML usage is detected in the
 * files named by the diff.
 *
 * A `checkForAiMlUsage` Step is always added to the action. When the check is
 * disabled (or the `aiMlUsage` configuration section is absent) the action is
 * returned straight away. Otherwise the content of the earlier `diff` step is
 * parsed for file paths, each path is checked, and if any file is flagged the
 * step is marked `blocked` and `error` with an explanatory `errorMessage`.
 *
 * @param {*} req Incoming request; not used by this step.
 * @param {object} action Action being processed; must expose `steps` and `addStep`.
 * @param {Function} [log=console.log] Sink for informational messages.
 * @return {Promise<object>} The same `action` instance.
 */
const exec = async (req, action, log = console.log) => {
  // Look the diff step up before adding our own step to the list.
  const diffStep = action.steps.find((s) => s.stepName === 'diff');
  const step = new Step('checkForAiMlUsage');
  action.addStep(step);

  // A missing config section is treated like `enabled: false` rather than
  // crashing the whole chain with a TypeError.
  const aiMlConfig = commitConfig.diff.block.aiMlUsage;
  if (!aiMlConfig || !aiMlConfig.enabled) {
    return action;
  }

  if (!diffStep || !diffStep.content) {
    log('No diff content available.');
    return action;
  }

  const filePaths = extractFilePathsFromDiff(diffStep.content);
  if (!filePaths.length) {
    log('No file paths found in the diff content.');
    return action;
  }

  // Each entry is `true` for a clean file and `false` for a flagged one.
  const cleanFlags = await detectAiMlUsageFiles(filePaths);
  const isBlocked = cleanFlags.some((isClean) => !isClean);

  if (isBlocked) {
    step.blocked = true;
    step.error = true;
    step.errorMessage = 'Your push has been blocked due to AI/ML usage detection';
    log(step.errorMessage);
  }

  return action;
};
141+
142+
// Label this processor after its own module; the Step it creates is named
// 'checkForAiMlUsage', so the previous 'logFileChanges.exec' label was wrong.
exec.displayName = 'checkForAiMlUsage.exec';
143+
module.exports = { exec };

test/checkAiMlUsage.test.js

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
const { exec } = require('../src/proxy/processors/push-action/checkForAiMlUsage.js');
2+
const sinon = require('sinon');
3+
const { Action } = require('../src/proxy/actions/Action.js');
4+
const { Step } = require('../src/proxy/actions/Step.js');
5+
6+
7+
// Exercises exec() from checkForAiMlUsage against diff content that names
// fixture files under test/test_data/ai_test_data. exec is called without a
// log argument, so its messages go to console.log, which is stubbed here.
describe('Detect AI/ML usage from git diff', () => {
8+
let logStub;
9+
10+
beforeEach(() => {
11+
// Stub console.log so each test can inspect the messages exec logs
12+
logStub = sinon.stub(console, 'log');
13+
});
14+
15+
afterEach(() => {
16+
// Restore stubs to avoid cross-test interference
17+
logStub.restore();
18+
// NOTE(review): the commit config is not stubbed in this file, so there is nothing else to restore
19+
});
20+
21+
const createDiffContent = (filePaths) => {
22+
// Creates one "diff --git a/<path> b/<path>" header line per file path, joined by newlines
23+
return filePaths.map((filePath) => `diff --git a/${filePath} b/${filePath}`).join('\n');
24+
};
25+
26+
it('Block push if AI/ML file extensions detected', async () => {
27+
// Create action and step instances with test data that should trigger blocking
28+
const action = new Action('action_id', 'push', 'create', Date.now(), 'owner/repo');
29+
const step = new Step('diff');
30+
31+
const filePaths = [
32+
'test/test_data/ai_test_data/model.h5',
33+
'test/test_data/ai_test_data/dataset.csv',
34+
];
35+
step.setContent(createDiffContent(filePaths));
36+
action.addStep(step);
37+
38+
await exec(null, action);
39+
40+
// Check that console.log was called with the blocking message
41+
sinon.assert.calledWith(
42+
logStub,
43+
sinon.match(
44+
/Your push has been blocked due to AI\/ML usage detection/,
45+
),
46+
);
47+
});
48+
49+
it('Block push if AI/ML file content detected', async () => {
50+
// Create action and step instances with test data that should trigger blocking
51+
const action = new Action('action_id', 'push', 'create', Date.now(), 'owner/repo');
52+
const step = new Step('diff');
53+
54+
const filePaths = [
55+
'test/test_data/ai_test_data/ai_script.py',
56+
'test/test_data/ai_test_data/ai_config.json',
57+
];
58+
step.setContent(createDiffContent(filePaths));
59+
action.addStep(step);
60+
61+
await exec(null, action);
62+
63+
// Check that console.log was called with the blocking message
64+
sinon.assert.calledWith(
65+
logStub,
66+
sinon.match(
67+
/Your push has been blocked due to AI\/ML usage detection/,
68+
),
69+
);
70+
});
71+
72+
it('Allow push if no AI/ML usage is detected', async () => {
73+
// Use a script fixture that is expected to contain no AI/ML indicators
74+
75+
const action = new Action('action_id', 'push', 'create', Date.now(), 'owner/repo');
76+
const step = new Step('diff');
77+
78+
const filePaths = ['test/test_data/ai_test_data/non_ai_script.py'];
79+
step.setContent(createDiffContent(filePaths));
80+
action.addStep(step);
81+
82+
await exec(null, action);
83+
84+
// Ensure no blocking message was logged
85+
sinon.assert.neverCalledWith(
86+
logStub,
87+
sinon.match(
88+
/Your push has been blocked due to AI\/ML usage detection/,
89+
),
90+
);
91+
});
92+
});
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"epochs": 100,
3+
"learning_rate": 0.01
4+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import tensorflow as tf
2+
model = tf.keras.models.Sequential()
3+
model.add(tf.keras.layers.Dense(10, activation='relu'))
4+
model.compile(optimizer='adam', loss='mse')
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
id,feature1,feature2,label
2+
1,0.5,0.3,1
3+
2,0.6,0.2,0
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a dummy model file
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
print("Hello World") # No AI/ML content

0 commit comments

Comments
 (0)