Skip to content

Commit 1b0e339

Browse files
authored
Add scripts for downloading open source corpora. (#4193)
Add scripts for downloading open source corpora.
1 parent cdee9bc commit 1b0e339

File tree

9 files changed

+429
-0
lines changed

9 files changed

+429
-0
lines changed

.github/workflows/dart.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ jobs:
2424
- name: dart pub get (working/macros/example)
2525
run: dart pub get
2626
working-directory: working/macros/example
27+
- name: dart pub get (tools/corpus)
28+
run: dart pub get
29+
working-directory: tools/corpus/scripts
2730
- name: dart pub get (accepted/2.3/spread-collections/benchmarks)
2831
run: dart pub get
2932
working-directory: accepted/2.3/spread-collections/benchmarks

tools/corpus/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Don't commit the downloaded files.
2+
download/
3+
out/

tools/corpus/README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
This directory contains a package with scripts for downloading corpora of open
2+
source Dart code for automated analysis. There are a few scripts for
3+
downloading from various places:
4+
5+
* `clone_flutter_apps.dart`: Clones GitHub repositories linked to from
6+
[github.com/tortuvshin/open-source-flutter-apps](https://github.com/tortuvshin/open-source-flutter-apps), which is a registry of open source Flutter apps.
7+
Downloads them to `download/apps`.
8+
9+
* `clone_widget_apps.dart`: Clones GitHub repositories referenced by
10+
[itsallwidgets.com](https://itsallwidgets.com/), which is a collection of
11+
open source Flutter apps and widgets. Downloads them to `download/widgets`.
12+
13+
* `download_packages.dart`: Downloads recent packages from
14+
[pub.dev](https://pub.dev/). Downloads to `download/pub`.
15+
16+
Once a corpus is downloaded, there is another script that copies over just the
17+
`.dart` files while discarding "uninteresting" files like generated ones:
18+
19+
* `copy_corpus.dart`: Copies `.dart` files from one of the download
20+
directories. Pass `apps`, `widgets`, `pub`, etc. Can also copy sources from
21+
the Dart SDK repo (`dart`) or Flutter repo (`flutter`). For that to work,
22+
those repos must be in directories next to the language repo.
23+
24+
You can pass `--sample=<percent>` to take a random sample of a corpus. For
25+
example, `--sample=5` will copy over only 5% of the files, chosen randomly.
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import 'package:corpus/utils.dart';
2+
3+
/// Matches Markdown link targets that point directly at a GitHub repo.
///
/// Requires a trailing ")" (after an allowed trailing "/") so that only
/// Markdown link URIs that go directly to a repo match, and not URIs to paths
/// within a repo like the images in the header.
///
/// Group 1 is the user (or organization) and group 2 is the repo name. The
/// dots in the host name are escaped so that they only match a literal ".".
final _gitHubRepoPattern =
    RegExp(r'https://github\.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)/?\)');
9+
10+
/// Location of the raw Markdown README of the app registry, which is scanned
/// for links to GitHub repos.
const _readmeUri = 'https://raw.githubusercontent.com/'
    'tortuvshin/open-source-flutter-apps/refs/heads/master/README.md';
13+
14+
/// Clones the GitHub repos listed on:
///
///     https://github.com/tortuvshin/open-source-flutter-apps
///
/// Downloads them to `download/apps`.
Future<void> main(List<String> arguments) async {
  clean('download/apps');

  print('Getting README.md...');
  var readme = await httpGet(_readmeUri);

  // Find all the repo URLs. Collecting them into a set removes the duplicates
  // while keeping the order in which they first appear.
  var repoPaths = {
    for (var match in _gitHubRepoPattern.allMatches(readme))
      (user: match[1]!, repo: match[2]!),
  };

  // Skip the reference to the repo itself.
  repoPaths.remove((user: 'tortuvshin', repo: 'open-source-flutter-apps'));

  var downloader = Downloader(totalResources: repoPaths.length, concurrency: 5);
  for (var (:user, :repo) in repoPaths) {
    downloader.cloneGitHubRepo('apps', user, repo);
  }
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import 'package:corpus/utils.dart';
2+
3+
/// Matches URIs that point to GitHub repos.
///
/// Group 1 is the user (or organization) and group 2 is the repo name. The
/// dots in the host name are escaped so that they only match a literal ".".
final _gitHubRepoPattern =
    RegExp(r'https://github\.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)');
6+
7+
/// Clones the open source apps listed on itsallwidgets.com.
///
/// Reads the site's JSON feed, keeps the entries of type `app` that have a
/// `repo_url` pointing at GitHub, and clones each distinct repo into the
/// `widgets` download group.
Future<void> main(List<String> arguments) async {
  clean('download/widgets');

  print('Getting page feed...');
  var feed =
      await httpGetJson('https://itsallwidgets.com/feed?open_source=true');

  // A set, so that a repo referenced by several feed entries is cloned once.
  var repos = <({String user, String repo})>{};
  for (var entry in feed as List<Object?>) {
    // Only app entries that carry a repo URL are interesting. Matching with a
    // pattern also skips any entry that doesn't have the expected shape
    // instead of failing on a cast.
    if (entry case {'type': 'app', 'repo_url': String repoUrl}) {
      // Only know how to download from GitHub. There are a couple of BitBucket
      // ones in there.
      if (_gitHubRepoPattern.firstMatch(repoUrl) case var match?) {
        repos.add((user: match[1]!, repo: match[2]!));
      }
    }
  }

  var downloader = Downloader(totalResources: repos.length, concurrency: 10);
  for (var (:user, :repo) in repos) {
    downloader.cloneGitHubRepo('widgets', user, repo);
  }
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import 'dart:io';
2+
import 'dart:math';
3+
4+
import 'package:args/args.dart';
5+
import 'package:path/path.dart' as p;
6+
7+
/// What percentage of files should be copied over. Used to take a random
/// sample of a corpus.
///
/// Set from the `--sample` option in [main]. At 100 every file is copied.
int _samplePercent = 100;

/// Source of randomness for deciding which files end up in a sample.
final _random = Random();

/// Prefixes of paths, relative to the root of the corpus being copied, whose
/// files are skipped.
const _ignoreDirs = [
  'pkg/dev_compiler/gen/',
  'tests/co19/',
  'third_party/observatory_pub_packages/',
  'tools/sdks/',
  'out/',
  'xcodebuild/',

  // Redundant stuff in Flutter.
  'bin/cache/',

  // Redundant packages that are in the SDK.
  'analyzer-',
  'compiler_unsupported-',
  'dev_compiler-',
];

/// The corpora that can be copied, as `(name, source directory)` pairs.
///
/// Note! Assumes the Dart SDK and Flutter repos have been cloned in
/// directories next to the corpus repo. Also assumes this script has been run
/// from the root directory of this repo.
const _corpora = [
  ('apps', 'download/apps'),
  ('dart', '../../../dart/sdk'),
  ('flutter', '../../../flutter'),
  ('pub', 'download/pub'),
  ('widgets', 'download/widgets'),
];

/// File name suffixes that identify generated Dart files.
const generatedSuffixes = ['.g.dart', '.freezed.dart'];
42+
43+
/// Copies the `.dart` files of every corpus named on the command line.
///
/// The positional arguments are corpus names from [_corpora]. `--sample`
/// (`-s`) sets the percentage of files to copy and must be a whole number
/// from 0 to 100; anything else is reported on stderr and nothing is copied.
Future<void> main(List<String> arguments) async {
  var argParser = ArgParser();
  // Accepted on the command line, but not consulted by this script.
  argParser.addFlag('omit-slow');
  argParser.addOption('sample', abbr: 's', defaultsTo: '100');

  var argResults = argParser.parse(arguments);

  var samplePercent = int.tryParse(argResults['sample'] as String);
  if (samplePercent == null || samplePercent < 0 || samplePercent > 100) {
    stderr.writeln('--sample must be a whole number from 0 to 100.');
    // 64 is the conventional "command line usage error" exit code.
    exitCode = 64;
    return;
  }
  _samplePercent = samplePercent;

  // Look only at the positional arguments so that an option or its value is
  // never mistaken for a corpus name.
  var requested = argResults.rest.toSet();
  for (var (name, directory) in _corpora) {
    if (requested.contains(name)) await copyDir(directory, name);
  }
}
55+
56+
/// Copies the interesting `.dart` files under [fromDirectory] to
/// `out/<toDirectory>`, preserving their relative paths.
///
/// Skipped are: anything that is not a regular `.dart` file, generated files
/// (see [generatedSuffixes]), paths starting with one of [_ignoreDirs], and
/// paths with a component starting with ".". When [_samplePercent] is not
/// 100, only a random sample of the remaining files is copied and the output
/// directory name gets a `-<percent>` suffix.
///
/// The returned future completes once every selected file has been copied.
Future<void> copyDir(String fromDirectory, String toDirectory) async {
  // If we're taking a random sample, put that in a separate directory.
  if (_samplePercent != 100) {
    toDirectory += '-$_samplePercent';
  }

  var copied = 0;
  var inDir = Directory(fromDirectory);

  // Use `await for` rather than `listen()` with an async callback: the latter
  // doesn't wait for the callbacks, so the function could return while copies
  // were still in flight.
  await for (var entry in inDir.list(recursive: true, followLinks: false)) {
    // Links are reported as `Link` entities (they are not followed), so this
    // also skips them.
    if (entry is! File || !entry.path.endsWith('.dart')) continue;

    // Skip generated code.
    if (generatedSuffixes.any(entry.path.endsWith)) continue;

    var relative = p.relative(entry.path, from: inDir.path);

    // Skip redundant stuff.
    if (_ignoreDirs.any(relative.startsWith)) continue;

    // If the path is in a subdirectory starting with '.', ignore it.
    if (p.split(relative).any((part) => part.startsWith('.'))) continue;

    // Draw for the sample only once the file is known to be a candidate.
    if (_random.nextInt(100) >= _samplePercent) continue;

    var outPath = p.join('out', toDirectory, relative);
    await Directory(p.dirname(outPath)).create(recursive: true);
    await entry.copy(outPath);

    // Show a sign of life every hundred files.
    copied++;
    if (copied % 100 == 0) print(relative);
  }
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import 'dart:io';
2+
3+
import 'package:corpus/utils.dart';
4+
5+
/// Target number of packages to download. Also passed to the `Downloader` as
/// its total resource count.
const _totalPackages = 2000;
6+
7+
/// Downloads and extracts packages from pub.dev into `download/pub`.
///
/// Walks the paginated package index, scheduling one download per listed
/// package, until [_totalPackages] downloads have been scheduled or the index
/// has no further page. Each archive is saved as
/// `download/pub/<name>-<version>.tar.gz`, extracted with `tar` into
/// `download/pub/<name>-<version>`, and deleted once extraction succeeds.
/// A failure for one package is logged and does not stop the others.
Future<void> main(List<String> arguments) async {
  clean('download/pub');

  // Iterate through the pages (which are in most recent order) until we get
  // enough packages.
  var packagePage = 'https://pub.dev/api/packages';
  var pageNumber = 1;

  // Number of downloads handed to the downloader so far. Starts at zero so
  // that exactly `_totalPackages` downloads are scheduled.
  var scheduled = 0;

  var downloader = Downloader(totalResources: _totalPackages);
  while (true) {
    downloader.log('Getting index page $pageNumber...');
    var packages = await httpGetJson(packagePage);

    for (var package in packages['packages']) {
      downloader.withResource((logger) async {
        var name = package['name'] as String;
        var version = package['latest']['version'] as String;
        var archiveUrl = package['latest']['archive_url'] as String;

        try {
          logger.begin('Downloading $archiveUrl...');
          var archiveBytes = await httpGetBytes(archiveUrl);
          var tarFile = 'download/pub/$name-$version.tar.gz';
          await File(tarFile).writeAsBytes(archiveBytes);

          logger.log('Extracting $tarFile...');
          var outputDir = 'download/pub/$name-$version';
          await Directory(outputDir).create(recursive: true);
          var result =
              await Process.run('tar', ['-xf', tarFile, '-C', outputDir]);

          if (result.exitCode != 0) {
            // Keep the archive around so the failure can be investigated.
            logger.end('Could not extract $tarFile:\n${result.stderr}');
          } else {
            await File(tarFile).delete();
            logger.end('Finished $outputDir');
          }
        } catch (error) {
          logger.end('Error downloading $archiveUrl:\n$error');
        }
      });

      scheduled++;
      if (scheduled >= _totalPackages) return;
    }

    // The last page has no link to a following one.
    var nextUrl = packages['next_url'];
    if (nextUrl is! String) break;
    packagePage = nextUrl;
    pageNumber++;
  }
}

0 commit comments

Comments
 (0)