Commit 5465642

Merge pull request #563 from clearlydefined/master
Merge master into prod branch in preparation for the next release
2 parents: 9ac0a7e + 8a04999

23 files changed: +3312 −8099 lines

.github/workflows/test.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+name: Run tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/[email protected]
+
+      - uses: actions/[email protected]
+        with:
+          node-version: 18
+          cache: 'npm'
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Run tests
+        run: npm test

.vscode/settings.json

Lines changed: 2 additions & 2 deletions
@@ -6,8 +6,8 @@
   "editor.folding": false,
   "editor.tabSize": 2,
   "editor.detectIndentation": false,
-  "editor.formatOnSave": true,
+  "editor.formatOnSave": false,
   "editor.formatOnType": true,
   "editor.insertSpaces": true,
   "files.trimTrailingWhitespace": true
-}
+}

DevDockerfile

Lines changed: 9 additions & 6 deletions
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation and others. Licensed under the MIT license.
 # SPDX-License-Identifier: MIT
 
-FROM node:16
+FROM node:18-bullseye
 ENV APPDIR=/opt/service
 
 ARG BUILD_NUMBER=0
@@ -13,7 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --no-install-su
   curl -L https://github.com/rbenv/ruby-build/archive/v20180822.tar.gz | tar -zxvf - -C /tmp/ && \
   cd /tmp/ruby-build-* && ./install.sh && cd / && \
   ruby-build -v 2.5.1 /usr/local && rm -rfv /tmp/ruby-build-* && \
-  gem install bundler --no-rdoc --no-ri
+  gem install bundler -v 2.3.26 --no-document
 
 # Scancode
 ARG SCANCODE_VERSION="30.1.0"
@@ -31,16 +31,19 @@ ENV SCANCODE_HOME=/usr/local/bin
 # the current RubyGem is 2.5.1. However, after upgrading RubyGem to 3.1.2, licensee:9.12.0 starts
 # to have hard time to find license in LICENSE file, like component npm/npmjs/-/caniuse-lite/1.0.30001344.
 # So we pin to the previous version of nokogiri and faraday.
-RUN gem install nokogiri:1.12.5 --no-rdoc --no-ri && \
-  gem install faraday:1.10.0 --no-rdoc --no-ri && \
-  gem install public_suffix:4.0.7 --no-rdoc --no-ri && \
-  gem install licensee:9.12.0 --no-rdoc --no-ri
+RUN gem install nokogiri:1.12.5 --no-document && \
+  gem install faraday:1.10.0 --no-document && \
+  gem install public_suffix:4.0.7 --no-document && \
+  gem install licensee:9.12.0 --no-document
 
 # REUSE
 RUN pip3 install setuptools
 RUN pip3 install reuse
 
+RUN git config --global --add safe.directory '*'
+
 COPY package*.json /tmp/
+COPY patches /tmp/patches
 RUN cd /tmp && npm install
 RUN mkdir -p "${APPDIR}" && cp -a /tmp/node_modules "${APPDIR}"
 

Dockerfile

Lines changed: 9 additions & 6 deletions
@@ -5,7 +5,7 @@
 #COPY fossology_init.sh fossology_init.sh
 #RUN ./fossology_init.sh
 
-FROM node:16
+FROM node:18-bullseye
 ENV APPDIR=/opt/service
 #RUN apk update && apk upgrade && \
 #    apk add --no-cache bash git openssh
@@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --no-install-su
   curl -L https://github.com/rbenv/ruby-build/archive/v20180822.tar.gz | tar -zxvf - -C /tmp/ && \
   cd /tmp/ruby-build-* && ./install.sh && cd / && \
   ruby-build -v 2.5.1 /usr/local && rm -rfv /tmp/ruby-build-* && \
-  gem install bundler --no-rdoc --no-ri
+  gem install bundler -v 2.3.26 --no-document
 
 # Scancode
 ARG SCANCODE_VERSION="30.1.0"
@@ -37,10 +37,10 @@ ENV SCANCODE_HOME=/usr/local/bin
 # the current RubyGem is 2.5.1. However, after upgrading RubyGem to 3.1.2, licensee:9.12.0 starts
 # to have hard time to find license in LICENSE file, like component npm/npmjs/-/caniuse-lite/1.0.30001344.
 # So we pin to the previous version of nokogiri and faraday.
-RUN gem install nokogiri:1.12.5 --no-rdoc --no-ri && \
-  gem install faraday:1.10.0 --no-rdoc --no-ri && \
-  gem install public_suffix:4.0.7 --no-rdoc --no-ri && \
-  gem install licensee:9.12.0 --no-rdoc --no-ri
+RUN gem install nokogiri:1.12.5 --no-document && \
+  gem install faraday:1.10.0 --no-document && \
+  gem install public_suffix:4.0.7 --no-document && \
+  gem install licensee:9.12.0 --no-document
 
 # REUSE
 RUN pip3 install setuptools
@@ -81,7 +81,10 @@ ENV CRAWLER_STORE_PROVIDER=cdDispatch+cd(azblob)+azqueue
 ENV CRAWLER_WEBHOOK_URL=https://api.clearlydefined.io/webhook
 ENV CRAWLER_AZBLOB_CONTAINER_NAME=production
 
+RUN git config --global --add safe.directory '*'
+
 COPY package*.json /tmp/
+COPY patches /tmp/patches
 RUN cd /tmp && npm install --production
 RUN mkdir -p "${APPDIR}" && cp -a /tmp/node_modules "${APPDIR}"
 

README.md

File mode changed: 100644 → 100755
Lines changed: 17 additions & 5 deletions
@@ -52,7 +52,7 @@ Here are a few example request objects.
 }
 ```
 
-The request `type` describes the crawling activity being requested. For example, "do `package` crawling". It is typically the same as the `type` in the url (see below). There are some more advanced scenarios where the two values are different but for starters, treat them as the same. The general form of a request URL is (note: it is a URL because of the underlying crawling infrastructure, the `cd` scheme is not particularly relevant)
+The request `type` describes the crawling activity being requested. For example, "do `package` crawling" (see [More on type](#more-on-type) for a description of the valid type values). It is typically the same as the `type` in the URL (see the segment descriptions below). There are some more advanced scenarios where the two values differ but, for starters, treat them as the same. The general form of a request URL is as follows (note: it is a URL because of the underlying crawling infrastructure; the `cd` scheme is not particularly relevant):
 
 ```
 cd:/type/provider/namespace/name/revision
@@ -80,6 +80,18 @@ Process the source, if any:
 
 The crawler's output is stored for use by the rest of the ClearlyDefined infrastructure -- it is not intended to be used directly by humans. Note that each tool's output is stored separately and the results of processing the component and the component source are also separated.
 
+### <a id="more-on-type"></a>More on `type`
+The `type` in the request object typically corresponds to an internal processor in CD.
+1. `component` is the most generic type. Internally, it is converted to a `package` or `source` request by the component processor.
+2. A `package` request is processed by the package processor and is further converted to a request with a specific type (`crate`, `deb`, `gem`, `go`, `maven`, `npm`, `nuget`, `composer`, `pod`, `pypi`). If the specific binary package type is already known, that type (e.g. `npm`) can be used instead of `package` in the harvest request, skipping the conversion step. For example,
+```json
+{
+  "type": "npm",
+  "url": "cd:/npm/npmjs/-/redie/0.3.0"
+}
+```
+3. `source` requests are processed by the source processor, which subsequently dispatches a `clearlydefined` typed request for the supported source types, plus one additional request per scanning tool. These are the more advanced scenarios where the request type and the coordinate type differ.
+
 # Configuration
 
 The crawler is quite configurable. Out of the box it is set up for demo-level use directly on your computer. In its full glory it can run with arbitrarily many distributed clients using an array of different queuing, caching and storage technologies.
@@ -121,7 +133,7 @@ If a CRAWLER_ID is specified, then each instance must have this setting globally
 ## Run Docker image from Docker Hub
 
 You can run the image as is from Docker (this is w/o any port forwarding, which means the only way you can interact with the crawler locally is through the queue. See below for examples of how to run with ports exposed for curl-based testing).
-`docker run --env-file ../<env_name>.env.list clearlydefined/crawler`
+`docker run --platform linux/amd64 --env-file ../<env_name>.env.list clearlydefined/crawler`
 
 See `local.env.list`, `dev.env.list` and `prod.env.list` template files.
 
@@ -133,13 +145,13 @@ See `local.env.list`, `dev.env.list` and `prod.env.list` template files.
 
 ## Build and run Docker image locally
 
-`docker build -t cdcrawler:latest .`
+`docker build --platform linux/amd64 -t cdcrawler:latest .`
 
-`docker run --rm --env-file ../dev.env.list -p 5000:5000 -p 9229:9229 cdcrawler:latest`
+`docker run --platform linux/amd64 --rm --env-file ../dev.env.list -p 5000:5000 -p 9229:9229 cdcrawler:latest`
 
 With a debugger:
 
-`docker run --rm -d --env-file ../dev.env.list -p 9229:9229 -p 5000:5000 --entrypoint node cdcrawler:latest --inspect-brk=0.0.0.0:9229 index.js`
+`docker run --platform linux/amd64 --rm -d --env-file ../dev.env.list -p 9229:9229 -p 5000:5000 --entrypoint node cdcrawler:latest --inspect-brk=0.0.0.0:9229 index.js`
 
 At this point you can attach VS Code with the built-in debugging profile (see .vscode/launch.json).
 
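The new "More on `type`" section above describes how a generic `component` request is narrowed first to `package` or `source`, and a `package` request is then narrowed again to its coordinate-specific type. The sketch below restates that routing in plain JavaScript purely as an illustration; it is not the crawler's actual processor code, and the helper names (`parseRequestUrl`, `routeRequest`) and the inline type list are assumptions for the example.

```js
// Illustrative sketch of the type narrowing described in "More on `type`".
// Not the crawler's processor code; helper names are hypothetical.
const packageTypes = ['crate', 'deb', 'gem', 'go', 'maven', 'npm', 'nuget', 'composer', 'pod', 'pypi']

// Split a request URL of the form cd:/type/provider/namespace/name/revision.
function parseRequestUrl(url) {
  const [type, provider, namespace, name, revision] = url.replace(/^cd:\//, '').split('/')
  return { type, provider, namespace, name, revision }
}

// A `component` request is narrowed to `package` (binary) or `source`;
// a `package` request would later be re-queued under the coordinate type itself (e.g. `npm`).
function routeRequest(request) {
  if (request.type !== 'component') return request
  const { type: coordinateType } = parseRequestUrl(request.url)
  const narrowed = packageTypes.includes(coordinateType) ? 'package' : 'source'
  return { ...request, type: narrowed }
}

console.log(routeRequest({ type: 'component', url: 'cd:/npm/npmjs/-/redie/0.3.0' }))
// -> { type: 'package', url: 'cd:/npm/npmjs/-/redie/0.3.0' }
```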

azure-pipelines.yml

Lines changed: 0 additions & 21 deletions
This file was deleted.

lib/utils.js

Lines changed: 34 additions & 1 deletion
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation and others. Licensed under the MIT license.
 // SPDX-License-Identifier: MIT
 const { DateTime } = require('luxon')
+const { spawn } = require('child_process')
+const { intersection } = require('lodash')
 
 const dateTimeFormats = [
   'EEE MMM d HH:mm:ss \'GMT\'ZZ yyyy' //in pom properties
@@ -30,6 +32,12 @@ function trimAllParents(paths, parents) {
   return paths.map(path => trimParents(path, parents))
 }
 
+function isGitFile(file) {
+  if (!file) return false
+  const segments = file.split(/[\\/]/g)
+  return intersection(segments, ['.git']).length > 0
+}
+
 function extractDate(dateAndTime, formats = dateTimeFormats) {
   if (!dateAndTime) return dateAndTime
   let luxonResult = DateTime.fromISO(dateAndTime)
@@ -49,4 +57,29 @@ function extractDate(dateAndTime, formats = dateTimeFormats) {
   return (instant.isBefore(validStart) || instant.isAfter(validEnd)) ? null : luxonResult
 }
 
-module.exports = { normalizePath, normalizePaths, trimParents, trimAllParents, extractDate }
+function attachListeners(child, resolve, reject) {
+  let stdoutData = [], stderrData = []
+
+  child.stdout.on('data', chunk => stdoutData.push(chunk))
+  child.stderr.on('data', chunk => stderrData.push(chunk))
+
+  child
+    .on('error', (err) => reject(err))
+    .on('close', (code) => {
+      if (code === 0) resolve(stdoutData.join(''))
+      else {
+        const errorFromChild = new Error(stderrData.join(''))
+        errorFromChild.code = code
+        reject(errorFromChild)
+      }
+    })
+}
+
+function spawnPromisified(command, args, options) {
+  const childProcess = spawn(command, args, options)
+  return new Promise((resolve, reject) => {
+    attachListeners(childProcess, resolve, reject)
+  })
+}
+
+module.exports = { normalizePath, normalizePaths, trimParents, trimAllParents, isGitFile, extractDate, spawnPromisified }
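For context, here is a minimal usage sketch of the two helpers this diff adds to `lib/utils.js`, assuming only the signatures shown above; the `find` invocation and the caller path are illustrative, not part of the change.

```js
// Hypothetical caller of the new helpers. spawnPromisified(command, args, options)
// resolves with the child's stdout (or rejects with an Error carrying the exit code),
// and isGitFile(path) reports whether a path contains a `.git` segment.
const { spawnPromisified, isGitFile } = require('./lib/utils')

async function listNonGitFiles(dir) {
  const stdout = await spawnPromisified('find', ['.', '-type', 'f'], { cwd: dir })
  return stdout.split('\n').filter(file => file && !isGitFile(file))
}

listNonGitFiles('.')
  .then(files => console.log(`${files.length} non-git files`))
  .catch(err => console.error('find failed with code', err.code, err.message))
```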
