|
30 | 30 | } |
31 | 31 | }, |
32 | 32 | "ignore_patterns" : { |
33 | | - "^http[s]\\?://archive\\.org/web/": "often times out", |
34 | | - "^http[s]\\?://twitter\\.com": "302; does not serve scripts", |
35 | | - "^http[s]\\?://linkedin\\.com": "302; does not serve scripts", |
36 | | - "^http[s]\\?://www\\.linkedin\\.com": "999; does not serve scripts", |
37 | | - "^http[s]\\?://chat\\.openai\\.com": "302; does not serve scripts", |
38 | | - "^https://github.com/org_name/codebase_name.git": "bogus example URL", |
39 | | - "^http[s]\\?://github\\.com/.*/edit/": "may point to yet-to-exist page", |
40 | | - "^http[s]\\?://docs\\.github\\.com/": "seems blocked as DoS protection", |
41 | | - "^http[s]\\?://github\\.com/[-0-9A-Za-z_\\./]\\+/\\(issues\\|pull\\)/[0-9]\\+[\\.,)]*": "ignore github issues and PRs", |
| 33 | + "\\[subdomain\\]\\.publiccode\\.net": "template", |
| 34 | + "FILE_BASE}.html": "template", |
| 35 | + "http[s]\\?://archive\\.org/web/": "often times out", |
| 36 | + "http[s]\\?://twitter\\.com": "302; does not serve scripts", |
| 37 | + "http[s]\\?://linkedin\\.com": "302; does not serve scripts", |
| 38 | + "http[s]\\?://www\\.linkedin\\.com": "999; does not serve scripts", |
| 39 | + "http[s]\\?://chat\\.openai\\.com": "302; does not serve scripts", |
| 40 | + "https://github.com/org_name/codebase_name.git": "bogus example URL", |
| 41 | + "http[s]\\?://github\\.com/.*/edit/": "may point to yet-to-exist page", |
| 42 | + "http[s]\\?://docs\\.github\\.com/": "seems blocked as DoS protection", |
| 43 | + "http[s]\\?://github\\.com/[-0-9A-Za-z_\\./]\\+/\\(issues\\|pull\\)/[0-9]\\+": "ignore github issues and PRs", |
42 | 44 | "plausible\\.io/js/plausible\\.js": "does not serve to scripts", |
43 | | - "^https://github.com/publiccodenet/standard/compare/main...release": "example URL in docs/releasing", |
| 45 | + "https://github.com/publiccodenet/standard/compare/main...release": "example URL in docs/releasing", |
44 | 46 | "opensource\\.org": "failed: 503 No error", |
45 | 47 | "belastingdienst\\.nl/wps/wcm/connect/bldcontenten": "regular timeouts", |
46 | | - "reclameland\\.nl/drukken/softcover-boeken": "failed: 403 No error", |
47 | | - "^https://help.miro.com": "403 to script", |
| 48 | + "reclameland\\.nl/drukken": "failed: 403 No error", |
| 49 | + "https://help.miro.com": "403 to script", |
48 | 50 | "www\\.dta\\.gov\\.au/help-and-advice": "failed: 403 No error", |
49 | | - "^https://pixabay\\.com/": "gives 403 to curl", |
50 | | - "^https://fonts.google.com/download?family=": "bash param in the URL", |
| 51 | + "https://pixabay\\.com/": "gives 403 to curl", |
| 52 | + "https://fonts.google.com/download?family=": "bash param in the URL", |
51 | 53 | "https://standard.publiccode.net/criteria/\\\\2.html": "regex in URL", |
52 | | - "^https://www.go-fair.org/": "gives 400s when run as GitHub workflow", |
53 | | - "^https://support\\.google\\.com/": "gives 404 to curl", |
54 | | - "^https://www\\.komoot\\.com/": "gives 404 to curl, works in browser", |
55 | | - "^https://www\\.grammarly\\.com/": "HTTP/2 405, allow: POST, GET", |
56 | | - "^https://giphy\\.com": "gives 503 to curl", |
57 | | - "^https://www\\.lonebeard\\.com": "defunct, referenced in binary files", |
58 | | - "^http[s]\\?://cipa\\.jp/exif": "defunct, embedded in some .jpg files", |
59 | | - "^http://ns\\.adobe\\.com/": "defunct, embedded in .jpg", |
60 | | - "^http://www\\.gimp\\.org/xmp/": "defunct, embedded in .jpg", |
61 | | - "^http://www\\.inkscape\\.org/namespaces/inkscape": "defunct, in .svg", |
62 | | - "^http[s]\\?://sodipodi\\.sourceforge\\.net/DTD/sodipodi-0\\.dtd": "defunct, in SVGs", |
63 | | - "^http[s]\\?://www\\.omg\\.org/spec/.*/20100524": "defunct, embedded in old .bpmn files", |
64 | | - "^http[s]\\?://bpmn.io/schema/bpmn": "unreliable", |
65 | | - "^http[s]\\?://www\\.un\\.org/en/content/": "frequent timeout", |
66 | | - "^http[s]\\?://arkitektur\\.digst\\.dk/node/1173": "times out", |
67 | | - "^http[s]\\?://eur-lex\\.europa\\.eu/legal-content/EN/TXT": "timeouts", |
68 | | - "^https://www\\.uwv\\.nl": "gives 404 to curl", |
69 | | - "listennotes\\.com/": "frequent timeouts", |
70 | | - "lists\\.publiccode\\.net/mailman/": "frequent timeouts", |
71 | | - "https://wetten\\.overheid\\.nl/BWBR0025279/2013-01-01": "times out", |
| 54 | + "https://www.go-fair.org/": "gives 400s when run as GitHub workflow", |
| 55 | + "https://support\\.google\\.com/": "gives 404 to curl", |
| 56 | + "https://www\\.komoot\\.com/": "gives 404 to curl, works in browser", |
| 57 | + "https://www\\.grammarly\\.com/": "HTTP/2 405, allow: POST, GET", |
| 58 | + "https://giphy\\.com": "gives 503 to curl", |
| 59 | + "https://www\\.lonebeard\\.com": "defunct, referenced in binary files", |
| 60 | + "http[s]\\?://cipa\\.jp/exif": "defunct, embedded in some .jpg files", |
| 61 | + "http://ns\\.adobe\\.com/": "defunct, embedded in .jpg", |
| 62 | + "http://www\\.gimp\\.org/xmp/": "defunct, embedded in .jpg", |
| 63 | + "http://www\\.inkscape\\.org/namespaces/inkscape": "defunct, in .svg", |
| 64 | + "http[s]\\?://sodipodi\\.sourceforge\\.net/DTD/sodipodi-0\\.dtd": "defunct, in SVGs", |
| 65 | + "http[s]\\?://www\\.omg\\.org/spec/.*/20100524": "defunct, embedded in old .bpmn files", |
| 66 | + "http[s]\\?://bpmn.io/schema/bpmn": "unreliable", |
| 67 | + "http[s]\\?://www\\.un\\.org/en/content/": "frequent timeout", |
| 68 | + "http[s]\\?://arkitektur\\.digst\\.dk/node/1173": "times out", |
| 69 | + "http[s]\\?://eur-lex\\.europa\\.eu/legal-content/EN/TXT": "timeouts", |
| 70 | + "https://www\\.uwv\\.nl": "gives 404 to curl", |
| 71 | + "listennotes\\.com/": "frequent timeouts", |
| 72 | + "lists\\.publiccode\\.net/mailman/": "frequent timeouts", |
| 73 | + "https://wetten\\.overheid\\.nl/BWBR0025279/2013-01-01": "times out", |
72 | 74 | "amsterdam\\.nl/en/": "frequent timeouts", |
73 | 75 | "iso\\.org/drafting-standards\\.html": "timeouts", |
74 | | - "https://flickr.com/e/tFzM3d9XsB": "defunct, embedded in a .jpg", |
75 | | - "^http://www\\.instagram\\.com/lottedale": "429, embedded in .jpg", |
76 | | - "^http[s]\\?://www\\.figma\\.com": "gives 404 to curl" |
| 76 | + "https://flickr.com/e/tFzM3d9XsB": "defunct, embedded in a .jpg", |
| 77 | + "http://www\\.instagram\\.com/lottedale": "429, embedded in .jpg", |
| 78 | + "http[s]\\?://www\\.figma\\.com": "gives 404 to curl" |
77 | 79 | }, |
78 | 80 | "transforms" : { |
79 | | - "sed 's@/[\\.,)]*$@/@'": |
80 | | - "remove trailing punctuation from links ending in '/'", |
81 | | - "sed 's@\\.net[\\.,)]*$@.net@'": |
82 | | - "remove trailing punctuation from links ending in '.net'", |
83 | | - "sed 's@\\.com[\\.,)]*$@.com@'": |
84 | | - "remove trailing punctuation from links ending in '.com'", |
85 | | - "sed 's@^\\(http.*\\.html\\)[\\.,)]*$@\\1@'": |
86 | | - "remove trailing punctuation from links ending in '.html'", |
87 | | - "sed 's@^\\(http.*\\.pdf\\)[\\.,)]*$@\\1@'": |
88 | | - "remove trailing punctuation from links ending in '.pdf'", |
89 | | - "sed 's@Open_air_school).$@Open_air_school@'": |
90 | | - "remove trailing punctuation'", |
91 | | - "sed 's@\\(nextcloud/index.php/s/[-0-9a-zA-Z]*\\)[\\.,)]*$@\\1@'": |
92 | | - "remove trailing punctuation from nextcloud files", |
93 | | - "sed 's@poortwachter[\\.,)]*$@poortwachter@'": |
94 | | - "remove trailing punctuation", |
95 | | - "sed 's@\\(://tools\\.ietf\\.org/html/rfc[0-9]*\\)[\\.,)]*$@\\1@'": |
96 | | - "remove trailing punctuation", |
97 | | - "sed 's@\\(publiccode\\.net/careers/marketing\\)[\\.),:]*@\\1@'": |
98 | | - "remove trailing punctuation" |
| 81 | + "sed 's@)\\](http@\\nhttp@g'": |
| 82 | + "split double-urls", |
| 83 | + "sed 's@\\](http@\\nhttp@g'": |
| 84 | + "split double-urls", |
| 85 | + "sed 's@/[\\.,):\\!\\?\\*\u2019]*$@/@g'": |
| 86 | + "remove trailing punctuation from links ending in '/'", |
| 87 | + "sed 's@\\.net[\\.,):\\!]*$@.net@g'": |
| 88 | + "remove trailing punctuation from links ending in '.net'", |
| 89 | + "sed 's@\\.com[\\.,):\\!]*$@.com@g'": |
| 90 | + "remove trailing punctuation from links ending in '.com'", |
| 91 | + "sed 's@\\.org[\\.,):\\!]*$@.org@g'": |
| 92 | + "remove trailing punctuation from links ending in '.org'", |
| 93 | + "sed 's@\\.html[\\.,):\\!]*$@.html@g'": |
| 94 | + "remove trailing punctuation from links ending in '.html'", |
| 95 | + "sed 's@\\.json[\\.,):\\!]*$@.json@g'": |
| 96 | + "remove trailing punctuation from links ending in '.json'", |
| 97 | + "sed 's@^\\(http.*\\.pdf\\)[\\.,):\\!]*$@\\1@g'": |
| 98 | + "remove trailing punctuation from links ending in '.pdf'", |
| 99 | + "sed 's@Open_air_school).$@Open_air_school@g'": |
| 100 | + "remove trailing punctuation'", |
| 101 | + "sed 's@\\(nextcloud/index.php/s/[-0-9a-zA-Z]*\\)[\\.,):\\!]*$@\\1@g'": |
| 102 | + "remove trailing punctuation from nextcloud files", |
| 103 | + "sed 's@poortwachter[\\.,):\\!]*$@poortwachter@g'": |
| 104 | + "remove trailing punctuation", |
| 105 | + "sed 's@\\(://tools\\.ietf\\.org/html/rfc[0-9]*\\)[\\.,):\\!]*$@\\1@g'": |
| 106 | + "remove trailing punctuation", |
| 107 | + "sed -r 's@(http://hintjens\\.com/blog:[0-9]+)[^0-9]+.*@\\1@g'": |
| 108 | + "remove anchors, trailing punctuation, parameters", |
| 109 | + "sed -r 's@(http[s]\\?://hackmd\\.io/[^#]+)#.*@\\1@g'": |
| 110 | + "remove anchors, trailing punctuation, parameters", |
| 111 | + "sed 's@)/\\[.*@@g'": |
| 112 | + "remove trailing punctuation, and following text", |
| 113 | + "sed 's@\\(https://youtu\\.be/[-A-Za-z0-9_]*\\).*@\\1@g'": |
| 114 | + "remove anchors, trailing punctuation, parameters", |
| 115 | + "sed 's@\\(https://www\\.youtube\\.com/watch?v=[-A-Za-z0-9_]*\\).*@\\1@g'": |
| 116 | + "remove anchors, trailing punctuation, parameters", |
| 117 | + "sed 's@\\(https://youtube\\.com/watch?v=[-A-Za-z0-9_]*\\).*@\\1@g'": |
| 118 | + "remove anchors, trailing punctuation, parameters", |
| 119 | + "sed 's@publiccode\\.net/organization/staff).*@publiccode.net/organization/staff@g'": |
| 120 | + "remove anchors, trailing punctuation, parameters", |
| 121 | + "sed 's@publiccode\\.net/logo/mark\\.svg.*@publiccode.net/logo/mark.svg@'": |
| 122 | + "remove anchors, trailing punctuation, parameters", |
| 123 | + "sed 's@Open_air_school)\\?@Open_air_school@g'": |
| 124 | + "remove anchors, trailing punctuation, parameters", |
| 125 | + "sed 's@Frontend)!$@Frontend@g'": |
| 126 | + "remove anchors, trailing punctuation, parameters", |
| 127 | + "sed 's@alliance)!$@alliance@g'": |
| 128 | + "remove anchors, trailing punctuation, parameters", |
| 129 | + "sed 's@export_processing)\\.$@export_processing@g'": |
| 130 | + "remove anchors, trailing punctuation, parameters", |
| 131 | + "sed 's@say)\\*\\*$@say@g'": |
| 132 | + "remove anchors, trailing punctuation, parameters", |
| 133 | + "sed 's@\\(bmj\\.com/[-a-zA-Z0-9/\\.]*\\)[)\\?]*@\\1@g'": |
| 134 | + "remove trailing punctuation", |
| 135 | + "sed 's@edit)\\.\\*\\*$@edit@g'": |
| 136 | + "remove trailing punctuation", |
| 137 | + "sed 's@\\(oeffentliche-it\\.de/[-a-zA-Z0-9/%\\.\\+]*\\)[)\\]]*$@\\1@g'": |
| 138 | + "remove trailing punctuation", |
| 139 | + "sed 's@\\(reclameland\\.nl/[-a-zA-Z0-9/%\\.\\+]*\\)[\\.)\\],:!\\?]*$@\\1@g'": |
| 140 | + "remove trailing punctuation", |
| 141 | + "sed 's@\\(/publiccodenet/[-_a-zA-Z0-9/\\.\\+]*\\)[]\\._),:!\\?]*$@\\1@g'": |
| 142 | + "remove trailing punctuation", |
| 143 | + "sed 's@\\(publiccode\\.net/[-_a-zA-Z0-9/\\.\\+]*\\)[]\\._),:!\\?]*$@\\1@g'": |
| 144 | + "remove trailing punctuation" |
99 | 145 | } |
100 | 146 | } |
0 commit comments