Skip to content

Commit 8474535

Browse files
yaronskaya authored and anatolystansler committed
feat: individual classifier for each library (#287)
* feat: individual classifier for each library * wip: refactor classifiers * wip: refactor * wip: refactor * wip: refactor * wip: refactor fsharp classifiers * wip: refactor imports starts with * wip: refactor * wip: move consts to the right place * wip: refactor * fix: remove submodule * feat: add loading librariesMeta, clf bounds * fix * fix: kotlin test * fix: initialization for primary constructor, formatting * fix: refactor libs meta downloading * chore: delete libraries * wip: always download libraries_meta.pb
1 parent 0cdc9e3 commit 8474535

30 files changed

+960
-919
lines changed

build.gradle

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ buildConfig {
3636
buildConfigField 'String', 'PROFILE_URL', 'https://sourcerer.io/'
3737

3838
// App version.
39-
buildConfigField 'int', 'VERSION_CODE', '11'
40-
buildConfigField 'String', 'VERSION', '0.2.6'
39+
buildConfigField 'int', 'VERSION_CODE', '12'
40+
buildConfigField 'String', 'VERSION', '0.3.0'
4141

4242
// Logging.
4343
buildConfigField 'String', 'ENV', project.hasProperty('env') ? env : 'production'
@@ -53,7 +53,7 @@ buildConfig {
5353
buildConfigField 'boolean', 'IS_GA_ENABLED', 'true'
5454

5555
// Models storage path.
56-
buildConfigField 'String', 'LIBRARY_MODELS_URL', 'https://storage.googleapis.com/sourcerer-app/library-models/v1/'
56+
buildConfigField 'String', 'LIBRARY_MODELS_URL', 'https://storage.googleapis.com/sourcerer-app/library-models/v2/'
5757

5858
// Hashing.
5959
buildConfigField 'boolean', 'COMMIT_HASHER_ENABLED', project.hasProperty('commit-hasher-enabled') ? project.property('commit-hasher-enabled').toString() : 'true'

src/main/kotlin/app/extractors/CExtractor.kt

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,15 @@
44

55
package app.extractors
66

7-
import app.model.CommitStats
8-
import app.model.DiffFile
9-
107
class CExtractor : ExtractorInterface {
118
companion object {
129
const val LANGUAGE_NAME = Lang.C
13-
val evaluator by lazy {
14-
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
15-
}
1610
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
1711
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
1812
val extractImportRegex =
1913
Regex("""#include\s+["<](\w+)[/\w+]*\.\w+[">]""")
2014
}
2115

22-
override fun extract(files: List<DiffFile>): List<CommitStats> {
23-
files.map { file -> file.language = LANGUAGE_NAME }
24-
return super.extract(files)
25-
}
26-
2716
override fun extractImports(fileContent: List<String>): List<String> {
2817
val imports = mutableSetOf<String>()
2918

@@ -44,9 +33,13 @@ class CExtractor : ExtractorInterface {
4433
return super.tokenize(newLine)
4534
}
4635

47-
override fun getLineLibraries(line: String,
48-
fileLibraries: List<String>): List<String> {
36+
override fun mapImportToIndex(import: String, lang: String,
37+
startsWith: Boolean): String? {
38+
// TODO(lyaronskaya): Add C to libraries.
39+
return super.mapImportToIndex(import, Lang.CPP, startsWith = true)
40+
}
4941

50-
return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
42+
override fun getLanguageName(): String? {
43+
return LANGUAGE_NAME
5144
}
5245
}

src/main/kotlin/app/extractors/CSharpExtractor.kt

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,56 +4,39 @@
44

55
package app.extractors
66

7-
import app.model.CommitStats
8-
import app.model.DiffFile
9-
107
class CSharpExtractor : ExtractorInterface {
118
companion object {
12-
const val LANGUAGE_NAME = Lang.CSharp
13-
val LIBRARIES = ExtractorInterface.getLibraries("cs")
14-
val evaluator by lazy {
15-
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
16-
}
9+
const val LANGUAGE_NAME = Lang.CSHARP
1710
val importRegex = Regex("""^.*using\s+(\w+[.\w+]*)""")
1811
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
1912
val extractImportRegex = Regex("""using\s+(\w+[.\w+]*)""")
2013
}
2114

22-
override fun extract(files: List<DiffFile>): List<CommitStats> {
23-
files.map { file -> file.language = LANGUAGE_NAME }
24-
return super.extract(files)
25-
}
26-
2715
override fun extractImports(fileContent: List<String>): List<String> {
2816
val imports = mutableSetOf<String>()
2917

3018
fileContent.forEach {
3119
val res = extractImportRegex.find(it)
3220
if (res != null) {
33-
val importedName = res.groupValues[1]
34-
LIBRARIES.forEach { library ->
35-
if (importedName.startsWith(library)) {
36-
imports.add(library)
37-
}
38-
}
21+
imports.add(res.groupValues[1])
3922
}
4023
}
4124

4225
return imports.toList()
4326
}
4427

4528
override fun tokenize(line: String): List<String> {
46-
val importRegex = Regex("""^.*using\s+(\w+[.\w+]*)""")
47-
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
4829
var newLine = importRegex.replace(line, "")
4930
newLine = commentRegex.replace(newLine, "")
5031
return super.tokenize(newLine)
5132
}
5233

53-
override fun getLineLibraries(line: String,
54-
fileLibraries: List<String>): List<String> {
34+
override fun mapImportToIndex(import: String, lang: String,
35+
startsWith: Boolean): String? {
36+
return super.mapImportToIndex(import, lang, startsWith = true)
37+
}
5538

56-
return super.getLineLibraries(line, fileLibraries, evaluator,
57-
LANGUAGE_NAME)
39+
override fun getLanguageName(): String? {
40+
return LANGUAGE_NAME
5841
}
5942
}
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// Copyright 2018 Sourcerer Inc. All Rights Reserved.
2+
// Author: Liubov Yaronskaya ([email protected])
3+
// Author: Anatoly Kislov ([email protected])
4+
5+
package app.extractors
6+
7+
import app.BuildConfig
8+
import app.Logger
9+
import app.model.Classifier
10+
import app.model.LibraryMeta
11+
import app.utils.FileHelper
12+
import org.apache.http.client.methods.HttpGet
13+
import org.apache.http.impl.client.HttpClientBuilder
14+
import java.io.FileOutputStream
15+
16+
class ClassifierManager {
17+
companion object {
18+
private const val CLASSIFIERS_DIR = "classifiers"
19+
private const val DATA_EXT = ".pb"
20+
private const val LIBS_META_DIR = ClassifierManager.CLASSIFIERS_DIR
21+
private const val LIBS_META_FILENAME = "libraries_meta.pb"
22+
}
23+
24+
val cache = hashMapOf<String, Classifier>()
25+
val libsMeta = getLibraryMeta()
26+
27+
/**
28+
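* Returns those of the given libraries whose classifier predicts usage in
* a line. Classifiers are downloaded and loaded to cache on first use.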
29+
*/
30+
fun estimate(line: List<String>, libraries: List<String>): List<String> {
31+
return libraries.filter { libId ->
32+
if (!cache.containsKey(libId)) {
33+
// Library not downloaded from cloud storage.
34+
if (FileHelper.notExists(libId + DATA_EXT, CLASSIFIERS_DIR)) {
35+
Logger.info { "Downloading $libId classifier" }
36+
downloadClassifier(libId)
37+
Logger.info { "Finished downloading $libId classifier" }
38+
}
39+
40+
// Library not loaded from local storage.
41+
Logger.info { "Loading $libId evaluator" }
42+
loadClassifier(libId)
43+
Logger.info { "$libId evaluator ready" }
44+
}
45+
46+
// Check line for usage of a library.
47+
val prediction = cache[libId]!!.evaluate(line)
48+
// Prediction based on two classes.
49+
val prob = prediction[cache[libId]!!.libraries.indexOf(libId)]
50+
// Libraries with no imports require a higher probability threshold.
51+
if (libId == "rb.rails") {
52+
prob > 0.8
53+
} else {
54+
prob > 0.5
55+
}
56+
}
57+
}
58+
59+
/**
60+
* Downloads libraries from cloud.
61+
*/
62+
private fun downloadClassifier(libId: String) {
63+
val file = FileHelper.getFile(libId + DATA_EXT, CLASSIFIERS_DIR)
64+
val langId = libId.split('.')[0]
65+
val url = "${BuildConfig.LIBRARY_MODELS_URL}$langId/$libId$DATA_EXT"
66+
val builder = HttpClientBuilder.create()
67+
val client = builder.build()
68+
try {
69+
client.execute(HttpGet(url)).use { response ->
70+
val entity = response.entity
71+
if (entity != null) {
72+
FileOutputStream(file).use { outstream ->
73+
entity.writeTo(outstream)
74+
outstream.flush()
75+
outstream.close()
76+
}
77+
}
78+
79+
}
80+
} catch (e: Exception) {
81+
Logger.error(e, "Failed to download $libId classifier")
82+
}
83+
}
84+
85+
/**
86+
* Loads libraries from local storage to cache.
87+
*/
88+
private fun loadClassifier(libId: String) {
89+
val bytesArray = FileHelper.getFile(libId + DATA_EXT, CLASSIFIERS_DIR)
90+
.readBytes()
91+
cache[libId] = Classifier(bytesArray)
92+
}
93+
94+
/**
95+
* Downloads libraries meta data from cloud.
96+
*/
97+
private fun downloadLibrariesMeta() {
98+
val file = FileHelper.getFile(LIBS_META_FILENAME, LIBS_META_DIR)
99+
val url = BuildConfig.LIBRARY_MODELS_URL + LIBS_META_FILENAME
100+
val builder = HttpClientBuilder.create()
101+
val client = builder.build()
102+
try {
103+
client.execute(HttpGet(url)).use { response ->
104+
val entity = response.entity
105+
if (entity != null) {
106+
FileOutputStream(file).use { outstream ->
107+
entity.writeTo(outstream)
108+
outstream.flush()
109+
outstream.close()
110+
}
111+
}
112+
}
113+
} catch (e: Exception) {
114+
Logger.error(e, "Failed to download $LIBS_META_FILENAME")
115+
}
116+
}
117+
118+
/**
119+
* Loads libraries meta data from local storage.
120+
*/
121+
private fun getLibraryMeta(): LibraryMeta {
122+
Logger.info { "Downloading $LIBS_META_FILENAME" }
123+
downloadLibrariesMeta()
124+
Logger.info { "Finished downloading $LIBS_META_FILENAME" }
125+
126+
val bytesArray = FileHelper.getFile(LIBS_META_FILENAME,
127+
LIBS_META_DIR).readBytes()
128+
return LibraryMeta(bytesArray)
129+
}
130+
}

src/main/kotlin/app/extractors/CommonExtractor.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ package app.extractors
66
import app.model.CommitStats
77
import app.model.DiffFile
88

9-
class CommonExtractor(val languageName: String) : ExtractorInterface {
9+
class CommonExtractor(private val langName: String) : ExtractorInterface {
1010
override fun extract(files: List<DiffFile>): List<CommitStats> {
11-
files.map { file -> file.language = languageName }
11+
files.map { file -> file.lang = langName }
1212
return super.extract(files)
1313
}
1414
}

src/main/kotlin/app/extractors/CppExtractor.kt

Lines changed: 8 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,14 @@
44

55
package app.extractors
66

7-
import app.model.CommitStats
8-
import app.model.DiffFile
9-
107
class CppExtractor : ExtractorInterface {
118
companion object {
12-
const val LANGUAGE_NAME = Lang.CPlusPlus
13-
val evaluator by lazy {
14-
ExtractorInterface.getLibraryClassifier(LANGUAGE_NAME)
15-
}
16-
val MULTI_IMPORT_TO_LIB =
17-
ExtractorInterface.getMultipleImportsToLibraryMap(LANGUAGE_NAME)
9+
const val LANGUAGE_NAME = Lang.CPP
1810
val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""")
1911
val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
2012
val extractImportRegex = Regex("""#include\s+["<](\w+)[/\w+]*(\.\w+)?[">]""")
2113
}
2214

23-
override fun extract(files: List<DiffFile>): List<CommitStats> {
24-
files.map { file -> file.language = LANGUAGE_NAME }
25-
return super.extract(files)
26-
}
27-
2815
override fun extractImports(fileContent: List<String>): List<String> {
2916
val imports = mutableSetOf<String>()
3017

@@ -36,15 +23,7 @@ class CppExtractor : ExtractorInterface {
3623
imports.add(lineLib)
3724
}
3825
}
39-
val libraries = imports.map { MULTI_IMPORT_TO_LIB.getOrDefault(it, it) }
40-
.map { import -> when {
41-
import.startsWith("Q") -> "Qt"
42-
import.startsWith("Lzma") -> "Lzma"
43-
import.startsWith("Ogre") -> "Ogre"
44-
else -> import
45-
}}
46-
.toSet().toList()
47-
return libraries
26+
return imports.toSet().toList()
4827
}
4928

5029
override fun tokenize(line: String): List<String> {
@@ -53,10 +32,12 @@ class CppExtractor : ExtractorInterface {
5332
return super.tokenize(newLine)
5433
}
5534

56-
override fun getLineLibraries(line: String,
57-
fileLibraries: List<String>): List<String> {
35+
override fun mapImportToIndex(import: String, lang: String,
36+
startsWith: Boolean): String? {
37+
return super.mapImportToIndex(import, lang, startsWith = true)
38+
}
5839

59-
return super.getLineLibraries(line, fileLibraries, evaluator,
60-
LANGUAGE_NAME)
40+
override fun getLanguageName(): String? {
41+
return LANGUAGE_NAME
6142
}
6243
}

src/main/kotlin/app/extractors/CssExtractor.kt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// Copyright 2018 Sourcerer Inc. All Rights Reserved.
22
// Author: Liubov Yaronskaya ([email protected])
3+
// Author: Anatoly Kislov ([email protected])
34

45
package app.extractors
56

@@ -13,7 +14,7 @@ class CssExtractor : ExtractorInterface {
1314
}
1415

1516
override fun extract(files: List<DiffFile>): List<CommitStats> {
16-
files.map { file -> file.language = LANGUAGE_NAME }
17+
files.map { file -> file.lang = LANGUAGE_NAME }
1718
val stats = FILE_EXTS.filter { it != "css" }.map { extension ->
1819
val result = files.filter { it.extension == extension }
1920
.fold(Pair(0, 0)) { total, file ->
@@ -26,10 +27,14 @@ class CssExtractor : ExtractorInterface {
2627

2728
CommitStats(numLinesAdded = result[0],
2829
numLinesDeleted = result[1],
29-
type = Extractor.TYPE_LIBRARY,
30+
type = ExtractorInterface.TYPE_LIBRARY,
3031
tech = extension)
3132
}.filter { it.numLinesAdded > 0 || it.numLinesDeleted > 0 }
3233

3334
return stats + super.extract(files)
3435
}
36+
37+
override fun getLanguageName(): String? {
38+
return LANGUAGE_NAME
39+
}
3540
}

src/main/kotlin/app/extractors/Extractor.kt

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,6 @@ import app.model.DiffFile
99

1010
class Extractor : ExtractorInterface {
1111
companion object {
12-
val TYPE_LANGUAGE = 1
13-
val TYPE_LIBRARY = 2
14-
val TYPE_KEYWORD = 3
15-
val TYPE_SYNTAX = 4
16-
val SEPARATOR = ">"
1712
val RESTRICTED_EXTS = listOf(".min.js")
1813
}
1914

0 commit comments

Comments
 (0)