Skip to content

Commit aea06a4

Browse files
yaronskaya, anatolystansler
authored and committed
fix: do not process js comments (#165) (#196)
* fix: do not process js comments (#165)
* wip: fix pr
* fix: style
1 parent affed37 commit aea06a4

14 files changed

+105
-70
lines changed

src/main/kotlin/app/extractors/CExtractor.kt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ class CExtractor : ExtractorInterface {
1414
val evaluator by lazy {
1515
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
1616
}
17+
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
18+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
19+
val extractImportRegex =
20+
Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
1721
}
1822

1923
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,9 +28,8 @@ class CExtractor : ExtractorInterface {
2428
override fun extractImports(fileContent: List<String>): List<String> {
2529
val imports = mutableSetOf<String>()
2630

27-
val regex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
2831
fileContent.forEach {
29-
val res = regex.find(it)
32+
val res = extractImportRegex.find(it)
3033
if (res != null) {
3134
val lineLib = res.groupValues.last()
3235
imports.add(lineLib)
@@ -37,8 +40,6 @@ class CExtractor : ExtractorInterface {
3740
}
3841

3942
override fun tokenize(line: String): List<String> {
40-
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
41-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
4243
var newLine = importRegex.replace(line, "")
4344
newLine = commentRegex.replace(newLine, "")
4445
return super.tokenize(newLine)

src/main/kotlin/app/extractors/CSharpExtractor.kt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ class CSharpExtractor : ExtractorInterface {
1515
val evaluator by lazy {
1616
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
1717
}
18+
val importRegex = Regex("""^.*using\s+(\w+[.\w+]*)""")
19+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
20+
val extractImportRegex = Regex("""using\s+(\w+[.\w+]*)""")
1821
}
1922

2023
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -25,9 +28,8 @@ class CSharpExtractor : ExtractorInterface {
2528
override fun extractImports(fileContent: List<String>): List<String> {
2629
val imports = mutableSetOf<String>()
2730

28-
val regex = Regex("""using\s+(\w+[.\w+]*)""")
2931
fileContent.forEach {
30-
val res = regex.find(it)
32+
val res = extractImportRegex.find(it)
3133
if (res != null) {
3234
val importedName = res.groupValues[1]
3335
LIBRARIES.forEach { library ->
@@ -52,6 +54,7 @@ class CSharpExtractor : ExtractorInterface {
5254
override fun getLineLibraries(line: String,
5355
fileLibraries: List<String>): List<String> {
5456

55-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
57+
return super.getLineLibraries(line, fileLibraries, evaluator,
58+
LANGUAGE_NAME)
5659
}
5760
}

src/main/kotlin/app/extractors/CppExtractor.kt

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ class CppExtractor : ExtractorInterface {
1616
}
1717
val MULTI_IMPORT_TO_LIB =
1818
ExtractorInterface.getMultipleImportsToLibraryMap(LANGUAGE_NAME)
19+
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
20+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
21+
val extractImportRegex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
1922
}
2023

2124
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -26,9 +29,8 @@ class CppExtractor : ExtractorInterface {
2629
override fun extractImports(fileContent: List<String>): List<String> {
2730
val imports = mutableSetOf<String>()
2831

29-
val regex = Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
3032
fileContent.forEach {
31-
val res = regex.find(it)
33+
val res = extractImportRegex.find(it)
3234
if (res != null) {
3335
val lineLib = res.groupValues.last()
3436
imports.add(lineLib)
@@ -40,8 +42,6 @@ class CppExtractor : ExtractorInterface {
4042
}
4143

4244
override fun tokenize(line: String): List<String> {
43-
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
44-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
4545
var newLine = importRegex.replace(line, "")
4646
newLine = commentRegex.replace(newLine, "")
4747
return super.tokenize(newLine)
@@ -50,6 +50,7 @@ class CppExtractor : ExtractorInterface {
5050
override fun getLineLibraries(line: String,
5151
fileLibraries: List<String>): List<String> {
5252

53-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
53+
return super.getLineLibraries(line, fileLibraries, evaluator,
54+
LANGUAGE_NAME)
5455
}
5556
}

src/main/kotlin/app/extractors/ExtractorInterface.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ interface ExtractorInterface {
2020
private val classifiersCache = hashMapOf<String, Classifier>()
2121
private val modelsDir = "models"
2222
private val pbExt = ".pb"
23+
val stringRegex = Regex("""(".+?"|'.+?')""")
24+
val splitRegex =
25+
Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")
2326

2427
private fun getResource(path: String): InputStream {
2528
return ExtractorInterface::class.java.classLoader
@@ -173,11 +176,8 @@ interface ExtractorInterface {
173176
}
174177

175178
fun tokenize(line: String): List<String> {
176-
val stringRegex = Regex("""(".+?"|'.+?')""")
177179
val newLine = stringRegex.replace(line, "")
178180
//TODO(lyaronskaya): multiline comment regex
179-
val splitRegex =
180-
Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")
181181
val tokens = splitRegex.split(newLine)
182182
.filter { it.isNotBlank() && !it.contains('"') && !it.contains('\'')
183183
&& it != "-" && it != "@"}

src/main/kotlin/app/extractors/GoExtractor.kt

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ class GoExtractor : ExtractorInterface {
1414
val evaluator by lazy {
1515
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
1616
}
17+
val importRegex = Regex("""^(.*import)\s[^\n]*""")
18+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
19+
val singleImportRegex = Regex("""import\s+"(\w+)"""")
20+
val multipleImportRegex = Regex("""import[\s\t\n]+\((.+?)\)""",
21+
RegexOption.DOT_MATCHES_ALL)
1722
}
1823

1924
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,31 +29,27 @@ class GoExtractor : ExtractorInterface {
2429
override fun extractImports(fileContent: List<String>): List<String> {
2530
val imports = mutableSetOf<String>()
2631

27-
val singleImportRegex = Regex("""import\s+"(\w+)"""")
2832
fileContent.forEach {
2933
val res = singleImportRegex.find(it)
3034
if (res != null) {
3135
val lineLib = res.groupValues.last()
3236
imports.add(lineLib)
3337
}
3438
}
35-
val multipleImportRegex = Regex("""import[\s\t\n]+\((.+?)\)""",
36-
RegexOption.DOT_MATCHES_ALL)
3739
val contentJoined = fileContent.joinToString(separator = "")
3840
multipleImportRegex.findAll(contentJoined).forEach { matchResult ->
3941
imports.addAll(matchResult.groupValues.last()
4042
.split(Regex("""(\t+|\n+|\s+|")"""))
4143
.filter { it.isNotEmpty() }
4244
.map { it -> it.replace("\"", "") }
43-
.map { it -> if (it.contains("github.com")) it.split("/")[2] else it})
45+
.map { it -> if (it.contains("github.com")) it.split("/")[2]
46+
else it})
4447
}
4548

4649
return imports.toList()
4750
}
4851

4952
override fun tokenize(line: String): List<String> {
50-
val importRegex = Regex("""^(.*import)\s[^\n]*""")
51-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
5253
var newLine = importRegex.replace(line, "")
5354
newLine = commentRegex.replace(newLine, "")
5455
return super.tokenize(newLine)
@@ -57,6 +58,7 @@ class GoExtractor : ExtractorInterface {
5758
override fun getLineLibraries(line: String,
5859
fileLibraries: List<String>): List<String> {
5960

60-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
61+
return super.getLineLibraries(line, fileLibraries, evaluator,
62+
LANGUAGE_NAME)
6163
}
6264
}

src/main/kotlin/app/extractors/JavaExtractor.kt

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ class JavaExtractor : ExtractorInterface {
2323
val evaluator by lazy {
2424
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
2525
}
26+
val importRegex = Regex("""^(.*import)\s[^\n]*""")
27+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
28+
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
29+
val extractImportRegex = Regex("""import\s+(\w+[.\w+]*)""")
2630
}
2731

2832
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -60,9 +64,8 @@ class JavaExtractor : ExtractorInterface {
6064
override fun extractImports(fileContent: List<String>): List<String> {
6165
val imports = mutableSetOf<String>()
6266

63-
val regex = Regex("""import\s+(\w+[.\w+]*)""")
6467
fileContent.forEach {
65-
val res = regex.find(it)
68+
val res = extractImportRegex.find(it)
6669
if (res != null) {
6770
val importedName = res.groupValues[1]
6871
LIBRARIES.forEach { library ->
@@ -77,9 +80,6 @@ class JavaExtractor : ExtractorInterface {
7780
}
7881

7982
override fun tokenize(line: String): List<String> {
80-
val importRegex = Regex("""^(.*import)\s[^\n]*""")
81-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
82-
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
8383
var newLine = importRegex.replace(line, "")
8484
newLine = commentRegex.replace(newLine, "")
8585
newLine = packageRegex.replace(newLine, "")
@@ -89,6 +89,7 @@ class JavaExtractor : ExtractorInterface {
8989
override fun getLineLibraries(line: String,
9090
fileLibraries: List<String>): List<String> {
9191

92-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
92+
return super.getLineLibraries(line, fileLibraries, evaluator,
93+
LANGUAGE_NAME)
9394
}
9495
}

src/main/kotlin/app/extractors/JavascriptExtractor.kt

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ class JavascriptExtractor : ExtractorInterface {
1515
val evaluator by lazy {
1616
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
1717
}
18+
val splitRegex =
19+
Regex("""\s+|,|;|:|\*|\n|\(|\)|\[|]|\{|}|\+|=|\.|>|<|#|@|\$""")
20+
val multilineCommentRegex = Regex("""/\*.+?\*/""")
21+
val twoOrMoreWordsRegex = Regex("""(".+?\s.+?"|'.+?\s.+?')""")
22+
23+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
1824
}
1925

2026
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -23,13 +29,11 @@ class JavascriptExtractor : ExtractorInterface {
2329
}
2430

2531
override fun extractImports(fileContent: List<String>): List<String> {
26-
val splitRegex =
27-
Regex("""\s+|,|;|:|\*|\n|\(|\)|\\[|]|\{|}|\+|=|\.|>|<|#|@|\$""")
28-
val twoOrMoreWordsRegex = Regex("""(".+?\s.+?"|'.+?\s.+?')""")
29-
30-
val line = fileContent.joinToString(separator = " ").toLowerCase()
31-
val fileTokens = twoOrMoreWordsRegex.replace(line, "").split(splitRegex)
32-
32+
val line = fileContent.map { line -> commentRegex.replace(line, "")}
33+
.joinToString(separator = " ").toLowerCase()
34+
val fileTokens = multilineCommentRegex.replace(
35+
twoOrMoreWordsRegex.replace(line, ""), "")
36+
.split(splitRegex)
3337
return fileTokens.filter { token -> token in LIBRARIES }.distinct()
3438
}
3539

@@ -38,4 +42,9 @@ class JavascriptExtractor : ExtractorInterface {
3842
return super.getLineLibraries(line, fileLibraries, evaluator,
3943
LANGUAGE_NAME)
4044
}
45+
46+
override fun tokenize(line: String): List<String> {
47+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
48+
return super.tokenize(commentRegex.replace(line, ""))
49+
}
4150
}

src/main/kotlin/app/extractors/KotlinExtractor.kt

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ class KotlinExtractor : ExtractorInterface {
1414
val evaluator by lazy {
1515
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
1616
}
17+
val importRegex = Regex("""^(.*import)\s[^\n]*""")
18+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
19+
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
20+
val extractImportRegex = Regex("""import\s+(\w+[.\w+]*)""")
1721
}
1822

1923
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,9 +28,8 @@ class KotlinExtractor : ExtractorInterface {
2428
override fun extractImports(fileContent: List<String>): List<String> {
2529
val imports = mutableSetOf<String>()
2630

27-
val regex = Regex("""import\s+(\w+[.\w+]*)""")
2831
fileContent.forEach {
29-
val res = regex.find(it)
32+
val res = extractImportRegex.find(it)
3033
if (res != null) {
3134
val importedName = res.groupValues[1]
3235
LIBRARIES.forEach { library ->
@@ -41,9 +44,6 @@ class KotlinExtractor : ExtractorInterface {
4144
}
4245

4346
override fun tokenize(line: String): List<String> {
44-
val importRegex = Regex("""^(.*import)\s[^\n]*""")
45-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
46-
val packageRegex = Regex("""^(.*package)\s[^\n]*""")
4747
var newLine = importRegex.replace(line, "")
4848
newLine = commentRegex.replace(newLine, "")
4949
newLine = packageRegex.replace(newLine, "")
@@ -53,6 +53,7 @@ class KotlinExtractor : ExtractorInterface {
5353
override fun getLineLibraries(line: String,
5454
fileLibraries: List<String>): List<String> {
5555

56-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
56+
return super.getLineLibraries(line, fileLibraries, evaluator,
57+
LANGUAGE_NAME)
5758
}
5859
}

src/main/kotlin/app/extractors/ObjectiveCExtractor.kt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ class ObjectiveCExtractor : ExtractorInterface {
1414
val evaluator by lazy {
1515
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
1616
}
17+
val importRegex = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""")
18+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
19+
val sharpImportIncludeRegex =
20+
Regex("""#(import|include)\s+[">](\w+)[/\w+]*\.\w+[">]""")
21+
val atImportRegex = Regex("""@import\s+(\w+)""")
1722
}
1823

1924
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,10 +29,6 @@ class ObjectiveCExtractor : ExtractorInterface {
2429
override fun extractImports(fileContent: List<String>): List<String> {
2530
val imports = mutableSetOf<String>()
2631

27-
val sharpImportIncludeRegex =
28-
Regex("""#(import|include)\s+[">](\w+)[/\w+]*\.\w+[">]""")
29-
val atImportRegex = Regex("""@import\s+(\w+)""")
30-
3132
fileContent.forEach {
3233
val res = sharpImportIncludeRegex.findAll(it) +
3334
atImportRegex.findAll(it)
@@ -41,8 +42,6 @@ class ObjectiveCExtractor : ExtractorInterface {
4142
}
4243

4344
override fun tokenize(line: String): List<String> {
44-
val importRegex = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""")
45-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
4645
var newLine = importRegex.replace(line, "")
4746
newLine = commentRegex.replace(newLine, "")
4847
return super.tokenize(newLine)
@@ -51,6 +50,7 @@ class ObjectiveCExtractor : ExtractorInterface {
5150
override fun getLineLibraries(line: String,
5251
fileLibraries: List<String>): List<String> {
5352

54-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
53+
return super.getLineLibraries(line, fileLibraries, evaluator,
54+
LANGUAGE_NAME)
5555
}
5656
}

src/main/kotlin/app/extractors/PhpExtractor.kt

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ class PhpExtractor : ExtractorInterface {
1414
val evaluator by lazy {
1515
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
1616
}
17+
val importRegex = Regex("""^(.*require|require_once|include|include_once|use)\s[^\n]*""")
18+
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
19+
val useRegex = Regex("""use\s+(\w+)[\\\w+]*""")
20+
val requireIncludeRegex = Regex("""(require|require_once|include|""" +
21+
""""include_once)\s*[(]?'(\w+)[.\w+]*'[)]?""")
1722
}
1823

1924
override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -24,9 +29,6 @@ class PhpExtractor : ExtractorInterface {
2429
override fun extractImports(fileContent: List<String>): List<String> {
2530
val imports = mutableSetOf<String>()
2631

27-
val useRegex = Regex("""use\s+(\w+)[\\\w+]*""")
28-
val requireIncludeRegex = Regex("""(require|require_once|include|""" +
29-
""""include_once)\s*[(]?'(\w+)[.\w+]*'[)]?""")
3032
fileContent.forEach {
3133
val res = useRegex.findAll(it) + requireIncludeRegex.findAll(it)
3234
if (res.toList().isNotEmpty()) {
@@ -39,8 +41,6 @@ class PhpExtractor : ExtractorInterface {
3941
}
4042

4143
override fun tokenize(line: String): List<String> {
42-
val importRegex = Regex("""^(.*require|require_once|include|include_once|use)\s[^\n]*""")
43-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
4444
var newLine = importRegex.replace(line, "")
4545
newLine = commentRegex.replace(newLine, "")
4646
return super.tokenize(newLine)
@@ -49,6 +49,7 @@ class PhpExtractor : ExtractorInterface {
4949
override fun getLineLibraries(line: String,
5050
fileLibraries: List<String>): List<String> {
5151

52-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
52+
return super.getLineLibraries(line, fileLibraries, evaluator,
53+
LANGUAGE_NAME)
5354
}
5455
}

0 commit comments

Comments
 (0)