Skip to content

Commit a6c9c98

Browse files
committed
Add importer for project KB MSR2019 dataset
Signed-off-by: Shivam Sandbhor <[email protected]>
1 parent e65ac68 commit a6c9c98

File tree

3 files changed

+107
-2
lines changed

3 files changed

+107
-2
lines changed

vulnerabilities/importer_yielder.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@
172172
'db_url': 'https://usn.ubuntu.com/usn-db/database-all.json.bz2'
173173
},
174174
},
175-
176175
{
177176
'name': 'github',
178177
'license': '',
@@ -182,7 +181,16 @@
182181
'endpoint': 'https://api.github.com/graphql',
183182
'ecosystems': ['MAVEN', 'NUGET', 'COMPOSER']
184183
}
185-
}
184+
},
185+
{
186+
'name': 'msr2019',
187+
'license': '',
188+
'last_run': None,
189+
'data_source': 'ProjectKBMSRDataSource',
190+
'data_source_cfg': {
191+
'etag': {}
192+
},
193+
},
186194

187195
]
188196

vulnerabilities/importers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,4 @@
3838
from vulnerabilities.importers.ubuntu_usn import UbuntuUSNDataSource
3939
from vulnerabilities.importers.github import GitHubAPIDataSource
4040
from vulnerabilities.importers.nvd import NVDDataSource
41+
from vulnerabilities.importers.project_kb_msr2019 import ProjectKBMSRDataSource
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright (c) nexB Inc. and others. All rights reserved.
2+
# http://nexb.com and https://github.com/nexB/vulnerablecode/
3+
# The VulnerableCode software is licensed under the Apache License version 2.0.
4+
# Data generated with VulnerableCode require an acknowledgment.
5+
#
6+
# You may not use this software except in compliance with the License.
7+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
8+
# Unless required by applicable law or agreed to in writing, software distributed
9+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
10+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
11+
# specific language governing permissions and limitations under the License.
12+
#
13+
# When you publish or redistribute any data created with VulnerableCode or any VulnerableCode
14+
# derivative work, you must accompany this data with the following acknowledgment:
15+
#
16+
# Generated with VulnerableCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
17+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
18+
# VulnerableCode should be considered or used as legal advice. Consult an Attorney
19+
# for any legal advice.
20+
# VulnerableCode is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/vulnerablecode/ for support and download.
22+
23+
import csv
24+
import dataclasses
25+
import urllib.request
26+
27+
# Reading a CSV file from a URL using `requests` is a bit too complicated.
28+
# Use `urllib.request` for that purpose. Need `requests` because making
29+
# a HEAD request using `urllib.request` is too complicated.
30+
import requests
31+
from packageurl import PackageURL
32+
33+
34+
from vulnerabilities.data_source import Advisory
35+
from vulnerabilities.data_source import DataSource
36+
from vulnerabilities.data_source import Reference
37+
from vulnerabilities.data_source import DataSourceConfiguration
38+
39+
40+
@dataclasses.dataclass
class ProjectKBDataSourceConfiguration(DataSourceConfiguration):
    """Configuration for the Project KB MSR 2019 importer."""

    # Mapping of dataset URL -> last seen HTTP ``ETag`` header value. The
    # importer consults and updates it to decide whether the remote file
    # changed since the previous run.
    etag: dict
43+
44+
45+
class ProjectKBMSRDataSource(DataSource):
    """Import advisories from the Project KB MSR 2019 CSV dataset.

    The dataset is a header-less CSV file; each row is unpacked into a
    vulnerability id, a project home URL, a fix commit id and a fourth
    column that is ignored.
    """

    CONFIG_CLASS = ProjectKBDataSourceConfiguration

    url = "https://raw.githubusercontent.com/SAP/project-kb/master/MSR2019/dataset/vulas_db_msr2019_release.csv"  # nopep8

    def updated_advisories(self):
        """Return batched advisories, or ``[]`` when the dataset is unchanged.

        ETags are like hashes of web responses. A (url, etag) mapping is
        kept in the configuration; when the stored ETag for ``self.url``
        matches the current one, the dataset is not processed again to
        avoid duplicate work.
        """
        if not self.create_etag(self.url):
            return []

        raw_data = self.fetch()
        advisories = self.to_advisories(raw_data)
        # NOTE(review): `batch_advisories` is not defined in this class;
        # presumably inherited from `DataSource` - confirm.
        return self.batch_advisories(advisories)

    def create_etag(self, url):
        """Record the current ETag of ``url`` and report whether to process it.

        Returns ``True`` when the response carries no ``ETag`` header or when
        the ETag differs from the one stored in ``self.config.etag`` (the
        stored value is then updated). Returns ``False`` when the stored
        ETag equals the current one.
        """
        etag = requests.head(url).headers.get("ETag")
        if not etag:
            # Without an ETag a change cannot be detected: always process.
            return True

        # `etag` is truthy here, so a missing entry (None) never matches.
        if self.config.etag.get(url) == etag:
            return False

        self.config.etag[url] = etag
        return True

    def fetch(self):
        """Download the dataset and return a ``csv.reader`` over its lines."""
        # The context manager closes the HTTP response once the lines have
        # been read, instead of leaking the connection.
        with urllib.request.urlopen(self.url) as response:
            lines = [line.decode("utf-8") for line in response.readlines()]
        return csv.reader(lines)

    @staticmethod
    def to_advisories(csv_reader):
        """Convert the rows of ``csv_reader`` into a list of ``Advisory``.

        Each non-empty row must have exactly four columns: vulnerability id,
        project home URL, fix commit id and one ignored column. The
        reference URL is built as ``<project home>/commit/<fix commit>``.
        """
        # Project KB MSR csv file has no header row
        advisories = []
        for row in csv_reader:
            if not row:
                # `csv.reader` yields an empty list for a blank line.
                continue

            vuln_id, proj_home, fix_commit, _ = row
            commit_link = proj_home + "/commit/" + fix_commit
            advisories.append(
                Advisory(
                    summary="",
                    impacted_package_urls=[],
                    vuln_references=[Reference(url=commit_link)],
                    cve_id=vuln_id,
                )
            )

        return advisories

0 commit comments

Comments
 (0)