Skip to content

Commit af3865b

Browse files
committed
update url-check, add-and-update patterns
1 parent cc37b9a commit af3865b

File tree

2 files changed

+104
-58
lines changed

2 files changed

+104
-58
lines changed

url-check-config.json

Lines changed: 103 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -30,71 +30,117 @@
3030
}
3131
},
3232
"ignore_patterns" : {
33-
"^http[s]\\?://archive\\.org/web/": "often times out",
34-
"^http[s]\\?://twitter\\.com": "302; does not serve scripts",
35-
"^http[s]\\?://linkedin\\.com": "302; does not serve scripts",
36-
"^http[s]\\?://www\\.linkedin\\.com": "999; does not serve scripts",
37-
"^http[s]\\?://chat\\.openai\\.com": "302; does not serve scripts",
38-
"^https://github.com/org_name/codebase_name.git": "bogus example URL",
39-
"^http[s]\\?://github\\.com/.*/edit/": "may point to yet-to-exist page",
40-
"^http[s]\\?://docs\\.github\\.com/": "seems blocked as DoS protection",
41-
"^http[s]\\?://github\\.com/[-0-9A-Za-z_\\./]\\+/\\(issues\\|pull\\)/[0-9]\\+[\\.,)]*": "ignore github issues and PRs",
33+
"\\[subdomain\\]\\.publiccode\\.net": "template",
34+
"FILE_BASE}.html": "template",
35+
"http[s]\\?://archive\\.org/web/": "often times out",
36+
"http[s]\\?://twitter\\.com": "302; does not serve scripts",
37+
"http[s]\\?://linkedin\\.com": "302; does not serve scripts",
38+
"http[s]\\?://www\\.linkedin\\.com": "999; does not serve scripts",
39+
"http[s]\\?://chat\\.openai\\.com": "302; does not serve scripts",
40+
"https://github.com/org_name/codebase_name.git": "bogus example URL",
41+
"http[s]\\?://github\\.com/.*/edit/": "may point to yet-to-exist page",
42+
"http[s]\\?://docs\\.github\\.com/": "seems blocked as DoS protection",
43+
"http[s]\\?://github\\.com/[-0-9A-Za-z_\\./]\\+/\\(issues\\|pull\\)/[0-9]\\+": "ignore github issues and PRs",
4244
"plausible\\.io/js/plausible\\.js": "does not serve to scripts",
43-
"^https://github.com/publiccodenet/standard/compare/main...release": "example URL in docs/releasing",
45+
"https://github.com/publiccodenet/standard/compare/main...release": "example URL in docs/releasing",
4446
"opensource\\.org": "failed: 503 No error",
4547
"belastingdienst\\.nl/wps/wcm/connect/bldcontenten": "regular timeouts",
46-
"reclameland\\.nl/drukken/softcover-boeken": "failed: 403 No error",
47-
"^https://help.miro.com": "403 to script",
48+
"reclameland\\.nl/drukken": "failed: 403 No error",
49+
"https://help.miro.com": "403 to script",
4850
"www\\.dta\\.gov\\.au/help-and-advice": "failed: 403 No error",
49-
"^https://pixabay\\.com/": "gives 403 to curl",
50-
"^https://fonts.google.com/download?family=": "bash param in the URL",
51+
"https://pixabay\\.com/": "gives 403 to curl",
52+
"https://fonts.google.com/download?family=": "bash param in the URL",
5153
"https://standard.publiccode.net/criteria/\\\\2.html": "regex in URL",
52-
"^https://www.go-fair.org/": "gives 400s when run as GitHub workflow",
53-
"^https://support\\.google\\.com/": "gives 404 to curl",
54-
"^https://www\\.komoot\\.com/": "gives 404 to curl, works in browser",
55-
"^https://www\\.grammarly\\.com/": "HTTP/2 405, allow: POST, GET",
56-
"^https://giphy\\.com": "gives 503 to curl",
57-
"^https://www\\.lonebeard\\.com": "defunct, referenced in binary files",
58-
"^http[s]\\?://cipa\\.jp/exif": "defunct, embedded in some .jpg files",
59-
"^http://ns\\.adobe\\.com/": "defunct, embedded in .jpg",
60-
"^http://www\\.gimp\\.org/xmp/": "defunct, embedded in .jpg",
61-
"^http://www\\.inkscape\\.org/namespaces/inkscape": "defunct, in .svg",
62-
"^http[s]\\?://sodipodi\\.sourceforge\\.net/DTD/sodipodi-0\\.dtd": "defunct, in SVGs",
63-
"^http[s]\\?://www\\.omg\\.org/spec/.*/20100524": "defunct, embedded in old .bpmn files",
64-
"^http[s]\\?://bpmn.io/schema/bpmn": "unreliable",
65-
"^http[s]\\?://www\\.un\\.org/en/content/": "frequent timeout",
66-
"^http[s]\\?://arkitektur\\.digst\\.dk/node/1173": "times out",
67-
"^http[s]\\?://eur-lex\\.europa\\.eu/legal-content/EN/TXT": "timeouts",
68-
"^https://www\\.uwv\\.nl": "gives 404 to curl",
69-
"listennotes\\.com/": "frequent timeouts",
70-
"lists\\.publiccode\\.net/mailman/": "frequent timeouts",
71-
"https://wetten\\.overheid\\.nl/BWBR0025279/2013-01-01": "times out",
54+
"https://www.go-fair.org/": "gives 400s when run as GitHub workflow",
55+
"https://support\\.google\\.com/": "gives 404 to curl",
56+
"https://www\\.komoot\\.com/": "gives 404 to curl, works in browser",
57+
"https://www\\.grammarly\\.com/": "HTTP/2 405, allow: POST, GET",
58+
"https://giphy\\.com": "gives 503 to curl",
59+
"https://www\\.lonebeard\\.com": "defunct, referenced in binary files",
60+
"http[s]\\?://cipa\\.jp/exif": "defunct, embedded in some .jpg files",
61+
"http://ns\\.adobe\\.com/": "defunct, embedded in .jpg",
62+
"http://www\\.gimp\\.org/xmp/": "defunct, embedded in .jpg",
63+
"http://www\\.inkscape\\.org/namespaces/inkscape": "defunct, in .svg",
64+
"http[s]\\?://sodipodi\\.sourceforge\\.net/DTD/sodipodi-0\\.dtd": "defunct, in SVGs",
65+
"http[s]\\?://www\\.omg\\.org/spec/.*/20100524": "defunct, embedded in old .bpmn files",
66+
"http[s]\\?://bpmn.io/schema/bpmn": "unreliable",
67+
"http[s]\\?://www\\.un\\.org/en/content/": "frequent timeout",
68+
"http[s]\\?://arkitektur\\.digst\\.dk/node/1173": "times out",
69+
"http[s]\\?://eur-lex\\.europa\\.eu/legal-content/EN/TXT": "timeouts",
70+
"https://www\\.uwv\\.nl": "gives 404 to curl",
71+
"listennotes\\.com/": "frequent timeouts",
72+
"lists\\.publiccode\\.net/mailman/": "frequent timeouts",
73+
"https://wetten\\.overheid\\.nl/BWBR0025279/2013-01-01": "times out",
7274
"amsterdam\\.nl/en/": "frequent timeouts",
7375
"iso\\.org/drafting-standards\\.html": "timeouts",
74-
"https://flickr.com/e/tFzM3d9XsB": "defunct, embedded in a .jpg",
75-
"^http://www\\.instagram\\.com/lottedale": "429, embedded in .jpg",
76-
"^http[s]\\?://www\\.figma\\.com": "gives 404 to curl"
76+
"https://flickr.com/e/tFzM3d9XsB": "defunct, embedded in a .jpg",
77+
"http://www\\.instagram\\.com/lottedale": "429, embedded in .jpg",
78+
"http[s]\\?://www\\.figma\\.com": "gives 404 to curl"
7779
},
7880
"transforms" : {
79-
"sed 's@/[\\.,)]*$@/@'":
80-
"remove trailing punctuation from links ending in '/'",
81-
"sed 's@\\.net[\\.,)]*[email protected]@'":
82-
"remove trailing punctuation from links ending in '.net'",
83-
"sed 's@\\.com[\\.,)]*[email protected]@'":
84-
"remove trailing punctuation from links ending in '.com'",
85-
"sed 's@^\\(http.*\\.html\\)[\\.,)]*$@\\1@'":
86-
"remove trailing punctuation from links ending in '.html'",
87-
"sed 's@^\\(http.*\\.pdf\\)[\\.,)]*$@\\1@'":
88-
"remove trailing punctuation from links ending in '.pdf'",
89-
"sed 's@Open_air_school).$@Open_air_school@'":
90-
"remove trailing punctuation'",
91-
"sed 's@\\(nextcloud/index.php/s/[-0-9a-zA-Z]*\\)[\\.,)]*$@\\1@'":
92-
"remove trailing punctuation from nextcloud files",
93-
"sed 's@poortwachter[\\.,)]*$@poortwachter@'":
94-
"remove trailing punctuation",
95-
"sed 's@\\(://tools\\.ietf\\.org/html/rfc[0-9]*\\)[\\.,)]*$@\\1@'":
96-
"remove trailing punctuation",
97-
"sed 's@\\(publiccode\\.net/careers/marketing\\)[\\.),:]*@\\1@'":
98-
"remove trailing punctuation"
81+
"sed 's@)\\](http@\\nhttp@g'":
82+
"split double-urls",
83+
"sed 's@\\](http@\\nhttp@g'":
84+
"split double-urls",
85+
"sed 's@/[\\.,):\\!\\?\\*\u2019]*$@/@g'":
86+
"remove trailing punctuation from links ending in '/'",
87+
"sed 's@\\.net[\\.,):\\!]*[email protected]@g'":
88+
"remove trailing punctuation from links ending in '.net'",
89+
"sed 's@\\.com[\\.,):\\!]*[email protected]@g'":
90+
"remove trailing punctuation from links ending in '.com'",
91+
"sed 's@\\.org[\\.,):\\!]*[email protected]@g'":
92+
"remove trailing punctuation from links ending in '.org'",
93+
"sed 's@\\.html[\\.,):\\!]*[email protected]@g'":
94+
"remove trailing punctuation from links ending in '.html'",
95+
"sed 's@\\.json[\\.,):\\!]*[email protected]@g'":
96+
"remove trailing punctuation from links ending in '.json'",
97+
"sed 's@^\\(http.*\\.pdf\\)[\\.,):\\!]*$@\\1@g'":
98+
"remove trailing punctuation from links ending in '.pdf'",
99+
"sed 's@Open_air_school).$@Open_air_school@g'":
100+
"remove trailing punctuation'",
101+
"sed 's@\\(nextcloud/index.php/s/[-0-9a-zA-Z]*\\)[\\.,):\\!]*$@\\1@g'":
102+
"remove trailing punctuation from nextcloud files",
103+
"sed 's@poortwachter[\\.,):\\!]*$@poortwachter@g'":
104+
"remove trailing punctuation",
105+
"sed 's@\\(://tools\\.ietf\\.org/html/rfc[0-9]*\\)[\\.,):\\!]*$@\\1@g'":
106+
"remove trailing punctuation",
107+
"sed -r 's@(http://hintjens\\.com/blog:[0-9]+)[^0-9]+.*@\\1@g'":
108+
"remove anchors, trailing punctuation, parameters",
109+
"sed -r 's@(http[s]\\?://hackmd\\.io/[^#]+)#.*@\\1@g'":
110+
"remove anchors, trailing punctuation, parameters",
111+
"sed 's@)/\\[.*@@g'":
112+
"remove trailing punctuation, and following text",
113+
"sed 's@\\(https://youtu\\.be/[-A-Za-z0-9_]*\\).*@\\1@g'":
114+
"remove anchors, trailing punctuation, parameters",
115+
"sed 's@\\(https://www\\.youtube\\.com/watch?v=[-A-Za-z0-9_]*\\).*@\\1@g'":
116+
"remove anchors, trailing punctuation, parameters",
117+
"sed 's@\\(https://youtube\\.com/watch?v=[-A-Za-z0-9_]*\\).*@\\1@g'":
118+
"remove anchors, trailing punctuation, parameters",
119+
"sed 's@publiccode\\.net/organization/staff).*@publiccode.net/organization/staff@g'":
120+
"remove anchors, trailing punctuation, parameters",
121+
"sed 's@publiccode\\.net/logo/mark\\.svg.*@publiccode.net/logo/mark.svg@'":
122+
"remove anchors, trailing punctuation, parameters",
123+
"sed 's@Open_air_school)\\?@Open_air_school@g'":
124+
"remove anchors, trailing punctuation, parameters",
125+
"sed 's@Frontend)!$@Frontend@g'":
126+
"remove anchors, trailing punctuation, parameters",
127+
"sed 's@alliance)!$@alliance@g'":
128+
"remove anchors, trailing punctuation, parameters",
129+
"sed 's@export_processing)\\.$@export_processing@g'":
130+
"remove anchors, trailing punctuation, parameters",
131+
"sed 's@say)\\*\\*$@say@g'":
132+
"remove anchors, trailing punctuation, parameters",
133+
"sed 's@\\(bmj\\.com/[-a-zA-Z0-9/\\.]*\\)[)\\?]*@\\1@g'":
134+
"remove trailing punctuation",
135+
"sed 's@edit)\\.\\*\\*$@edit@g'":
136+
"remove trailing punctuation",
137+
"sed 's@\\(oeffentliche-it\\.de/[-a-zA-Z0-9/%\\.\\+]*\\)[)\\]]*$@\\1@g'":
138+
"remove trailing punctuation",
139+
"sed 's@\\(reclameland\\.nl/[-a-zA-Z0-9/%\\.\\+]*\\)[\\.)\\],:!\\?]*$@\\1@g'":
140+
"remove trailing punctuation",
141+
"sed 's@\\(/publiccodenet/[-_a-zA-Z0-9/\\.\\+]*\\)[]\\._),:!\\?]*$@\\1@g'":
142+
"remove trailing punctuation",
143+
"sed 's@\\(publiccode\\.net/[-_a-zA-Z0-9/\\.\\+]*\\)[]\\._),:!\\?]*$@\\1@g'":
144+
"remove trailing punctuation"
99145
}
100146
}

0 commit comments

Comments
 (0)