This repository was archived by the owner on Nov 6, 2023. It is now read-only.

Commit 7efc7e6

Rewrite Alexa Labeller (#19135)
* Updates and new node app file for Labeller rewrite - Switched over old packages for up to date packages in npm. - Octokit pkg is the new Interface for github auth * Further build out parsing and files for Alexa labeller * Rewrite rest of logic flow to return labels - utilized patch to not parse each XML file - Wrote in logic to determine labelling - tested successful * Adjust app.js - naming conventions that make more sense for functions * Set new index.js * Add test foundation - Apart of the practice of integrating tests when needed for tooling - This us a bare bones file since the data needed to do tests are dynamic and need real Github configuration to complete more tests - For now there is a test running a proper label rank return since this is the most static set of data listed * Restructure and cleanup - Move process function to own module for cleaner logic flow - Amend test foundation - Clean up typos and unused modules
1 parent c511972 commit 7efc7e6

File tree

6 files changed: +1707 -208 lines changed


utils/labeller/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-FROM node:8-alpine
-LABEL maintainer="William Budington <bill@eff.org>"
+FROM node:10-alpine
+LABEL maintainer="Alexis Hancock <alexis@eff.org>"

 WORKDIR /opt
utils/labeller/index.js

Lines changed: 99 additions & 197 deletions
@@ -1,216 +1,118 @@
 "use strict";

-var fs = require('fs');
-var readline = require('readline');
-
-var GitHubApi = require('github');
-var _ = require('lodash');
-var parseXML = require('xml2js').parseString;
-var async = require('async');
-var request = require('request');
-var unzip = require('unzip');
-var ProgressBar = require('progress');
-
-var config = require('./config');
+const { Octokit } = require('@octokit/rest');
+const process = require('./process');
+const axios = require('axios');
+const unzip = require('unzipper');
+const config = require('./config');
+
+const octokit = new Octokit({
+  auth: config.github_token,
+  userAgent: 'Labeller v2'
+});
+const httpse = {
+  owner: config.github_user,
+  repo: config.github_repo
+}

-// Fetch the Alexa top 1M sites and push it to an array `alexa` via streams
-function get_alexa(alexa_cb){
+let ProgressBar = require('progress');

-  var alexa = []
-  var csv_regex = /^[0-9]+,(.+)/
+// Background process functions for logic flow below
+let Process = new process.Process(octokit, httpse);

-  request.get('https://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
-    .on('error', function(err) {
-      alexa_cb(err);
-    })
-    .pipe(unzip.Parse())
-    .on('entry', function (entry) {
+/**
+ * @description Fetch the Alexa top 1M sites and push it to an array `alexa` via streams
+ * @returns object
+ */
+function initiate() {

-      var bar = new ProgressBar('Processing Alexa Top 1M [:bar] :percent :etas', {
-        total: 100
-      });
+  let alexa = [];
+  let regex = /^[0-9]+,(.+)/
+  const alexa_csv = 'https://s3.amazonaws.com/alexa-static/top-1m.csv.zip';

-      var lineReader = require('readline').createInterface({
-        input: entry
-      });
+  // Grab Alexa data
+  axios({
+    method: 'get',
+    url: alexa_csv,
+    responseType: 'stream'
+  })
+  .then(function (response) {
+    response.data.pipe(unzip.Parse())
+    .on('entry', function (entry) {
+      let bar = new ProgressBar('Processing Alexa Top 1M [:bar] :percent :etas', {
+        total: 100
+      });

-      var x = 0;
-      lineReader.on('line', function (line) {
-        var domain = line.match(csv_regex)[1]
-        alexa.push(domain);
+      let lineReader = require('readline').createInterface({
+        input: entry,
+      });

-        if(x % 10000 == 0) bar.tick();
-        x++;
-      });
+      let x = 0;

-      lineReader.on('close', function(){
-        alexa_cb(null, alexa);
-      });
+      lineReader.on('line', function (line) {
+        let domain = line.match(regex)[1];
+        alexa.push(domain);
+        if(x % 10000 == 0) bar.tick();
+        x++;
+      });

+      lineReader.on('close', function(){
+        try {
+          get_prs(alexa);
+        } catch (error) {
+          console.log(error);
+        }
+      });
+    })
+  })
+  .catch(function (error) {
+    console.log(error);
   });
-};
-
-function get_most_recent_pr(alexa, recent_cb){
-  fs.readFile(config.state_file, function(err, data){
-    if(err){
-      fs.writeFile(config.state_file, '0', function(err){
-        if(err) return recent_cb(err);
-        recent_cb(null, [alexa, 0]);
-      });
-    } else {
-      recent_cb(null, [alexa, Number(data)]);
-    }
-  });
 }

-function github_process_prs(res, pr_cb){
-  var alexa = res[0],
-      most_recent_pr_checked = res[1];
-
-  var github = new GitHubApi();
-  var wildcard_www_regex = /^(www|\*)\.(.+)/
-
-  var httpse = {
-    user: config.github_user,
-    repo: config.github_repo
-  }
-
-  github.authenticate({
-    type: "oauth",
-    token: config.github_token || process.env.GITHUB_TOKEN
+/**
+ * @param {obj} alexa
+ * @description Returns Pull Requests to label
+ */
+function get_prs(alexa) {
+  let wildcard_www_regex = /^(www|\*)\.(.+)/
+
+  octokit.paginate(
+    "GET /repos/:owner/:repo/pulls",
+    httpse,
+  )
+  .then(prs => {
+    process_prs(alexa, prs)
   })
+  .catch(reason => {
+    console.log(reason);
+  })
+}

-  // Label all PRs which meet the criteria for labelling
-  function github_process_pr_page(first_page){
-    return function(err, pull_requests){
-      if(first_page){
-        fs.writeFile(config.state_file, pull_requests[0].number, function(err){
-          if(err) return pr_cb(err);
-        });
-      }
-
-      _.each(pull_requests, function(pull_request){
-
-        if(pull_request.number > most_recent_pr_checked){
-          github.pullRequests.getFiles(_.extend(httpse, {
-            number: pull_request.number
-          }), function(err, files){
-            if(err) return pr_cb(err);
-
-            // Rank a list of target hosts, returning the minimum alexa placing
-            function rank_targets(targets){
-              var minimum_placing = 9999999;
-
-              _.each(targets, function(host){
-                if(host.match(wildcard_www_regex)){
-                  host = host.match(wildcard_www_regex)[2];
-                }
-
-                var alexa_placing = alexa.indexOf(host);
-                if(~alexa_placing && alexa_placing < minimum_placing){
-                  minimum_placing = alexa_placing;
-                }
-              });
-
-              if(minimum_placing != 9999999){
-                return minimum_placing;
-              }
-            }
-
-            // Given the url of an HTTPSE ruleset, return a list of targets to fetch_cb
-            function fetch_url_and_parse_targets(url, fetch_cb){
-              request({url: url}, function(err, res, body){
-                if(err) return fetch_cb(err);
-
-                parseXML(body, function(err, root){
-                  if(err) return fetch_cb(err);
-
-                  fetch_cb(null, _.map(root.ruleset.target, function(target){
-                    return target.$.host;
-                  }));
-                });
-              });
-            }
-
-            var file_fetches = [];
-
-            // Out of the list of files for this PR, figure out the minimum Alexa ranking for each
-            _.each(files, function(file){
-              if(file.filename.match(/^src\/chrome\/content\/rules\//)){
-                file_fetches.push(function(file_cb){
-                  fetch_url_and_parse_targets(file.raw_url, function(err, targets){
-                    if(err) return file_cb(err);
-
-                    console.log("Processing PR: " + pull_request.number + ", file: " + file.filename);
-
-                    var ranking = rank_targets(targets);
-                    if(ranking){
-                      return file_cb(null, {
-                        alexa_placing: ranking,
-                        pr_number: pull_request.number
-                      });
-                    } else {
-                      return file_cb();
-                    }
-                  });
-                });
-              }
-            });
-
-            async.parallel(file_fetches, function(err, res){
-              if(err) pr_cb(err);
-
-              var reduced_pr_ranking = _.reduce(_.filter(res),
-                function(minimum_file_res, file_res){
-                  if(file_res.alexa_placing < minimum_file_res.alexa_placing){
-                    return file_res;
-                  }
-                  return minimum_file_res;
-                });
-
-              if(reduced_pr_ranking){
-                let label;
-                if(reduced_pr_ranking.alexa_placing < 100){
-                  label = "top-100";
-                } else if(reduced_pr_ranking.alexa_placing < 1000){
-                  label = "top-1k";
-                } else if(reduced_pr_ranking.alexa_placing < 10000){
-                  label = "top-10k";
-                } else if(reduced_pr_ranking.alexa_placing < 100000){
-                  label = "top-100k";
-                } else {
-                  label = "top-1m";
-                }
-                console.log("Applying label `" + label + "` to PR: " + reduced_pr_ranking.pr_number);
-
-                github.issues.addLabels(_.extend(httpse, {
-                  number: reduced_pr_ranking.pr_number,
-                  body: [label]
-                }), function(err, res){
-                  if(err) console.log(err);
-                });
-              }
-            });
-          });
-        }
-      });
-
-      if(github.hasNextPage(pull_requests)){
-        github.getNextPage(pull_requests, github_process_pr_page(false));
+/**
+ * @param {obj} alexa
+ * @param {obj} prs
+ * @description Labels Pull Requests
+ */
+function process_prs(alexa, prs) {
+  let filtered_prs = prs.filter(Process.labelled);
+
+  prs.forEach(pr => {
+
+    let domain_label_pairs = [];
+
+    octokit.pulls.listFiles({
+      ...httpse,
+      pull_number: pr.number,
+    }).then(files => {
+      let rank_number = Process.files(files, alexa);
+      if(rank_number !== null) {
+        let determined_label = Process.return_label(rank_number);
+        // pr is interchangeable with issue in API ¯\_(ツ)_/¯
+        Process.add_label(determined_label, pr.number);
       }
-    }
-  }
-
-  github.pullRequests.getAll(_.extend(httpse, {
-    state: "open",
-    per_page: 100
-  }), github_process_pr_page(true));
+    })
+  });
 }

-async.waterfall([
-  get_alexa,
-  get_most_recent_pr,
-  github_process_prs
-], function(err, result){
-  if(err) console.log(err);
-});
+initiate();
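The new index.js above delegates its labelling helpers to a `./process` module (presumably utils/labeller/process.js, one of the six changed files, not shown in this view). A rough sketch of what that interface might look like, inferred only from the calls above and from the commit note about using each file's patch instead of parsing whole XML rulesets, is below; every method body here is an assumption, not the committed code.

    // Hypothetical sketch of utils/labeller/process.js, inferred from index.js above.
    class Process {
      constructor(octokit, httpse) {
        this.octokit = octokit;
        this.httpse = httpse;
      }

      // Used as prs.filter(Process.labelled): keep pull requests with no labels yet.
      labelled(pr) {
        return pr.labels.length === 0;
      }

      // Scan each changed file's patch for <target host="..."/> entries instead of
      // fetching and parsing the full XML ruleset, then return the best (lowest)
      // Alexa rank found, or null if no target host appears in the list.
      files(response, alexa) {
        const files = response.data || response; // pulls.listFiles resolves with the array on .data
        let best = null;
        for (const file of files) {
          const targetRe = /target host="([^"]+)"/g;
          let match;
          while ((match = targetRe.exec(file.patch || '')) !== null) {
            const host = match[1].replace(/^(www|\*)\./, '');
            const rank = alexa.indexOf(host);
            if (rank !== -1 && (best === null || rank < best)) best = rank;
          }
        }
        return best;
      }

      // Map an Alexa rank to the labels the old index.js applied.
      return_label(rank) {
        if (rank < 100) return 'top-100';
        if (rank < 1000) return 'top-1k';
        if (rank < 10000) return 'top-10k';
        if (rank < 100000) return 'top-100k';
        return 'top-1m';
      }

      // Apply the label through the issues API (PR numbers double as issue numbers).
      add_label(label, pr_number) {
        return this.octokit.issues.addLabels({
          ...this.httpse,
          issue_number: pr_number,
          labels: [label],
        });
      }
    }

    module.exports = { Process };

Reading target hosts out of the patch keeps the labeller to a single GitHub API call per pull request, instead of fetching and XML-parsing every changed ruleset as the old code did.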
