diff --git a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/metadata.json b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/metadata.json index f229768d..5c773935 100644 --- a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/metadata.json +++ b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/metadata.json @@ -258,6 +258,126 @@ } } ] + }, + { + "@id": "wat-records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "wat-records/url", + "@type": "cr:Field", + "name": "url", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "wat-files" + }, + "extract": { + "fileProperty": "fullpath" + } + } + } + ] + }, + { + "@id": "wet-records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "wet-records/url", + "@type": "cr:Field", + "name": "url", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "wet-files" + }, + "extract": { + "fileProperty": "fullpath" + } + } + } + ] + }, + { + "@id": "robotstxt-records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "robotstxt-records/url", + "@type": "cr:Field", + "name": "url", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "robotstxt-files" + }, + "extract": { + "fileProperty": "fullpath" + } + } + } + ] + }, + { + "@id": "non200responses-records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "non200responses-records/url", + "@type": "cr:Field", + "name": "url", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "non200responses-files" + }, + "extract": { + "fileProperty": "fullpath" + } + } + } + ] + }, + { + "@id": "cc-index-records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "cc-index-records/url", + "@type": "cr:Field", + "name": "url", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "cc-index-files" + }, + "extract": { + "fileProperty": "fullpath" + } + } + } + ] + }, + { + "@id": "cc-index-table-records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "cc-index-table-records/url", + "@type": "cr:Field", + "name": "url", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "cc-index-table-files" + }, + "extract": { + "fileProperty": "fullpath" + } + } + } + ] } ] } diff --git a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/cc-index-records.jsonl b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/cc-index-records.jsonl new file mode 100644 index 00000000..e159d816 --- /dev/null +++ b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/cc-index-records.jsonl @@ -0,0 +1,3 @@ +{"cc-index-records/url": "cc-index/collections/CC-MAIN-2025-43/indexes/cdx-00000.gz"} +{"cc-index-records/url": "cc-index/collections/CC-MAIN-2025-43/indexes/cdx-00001.gz"} +{"cc-index-records/url": "cc-index/collections/CC-MAIN-2025-43/indexes/cdx-00002.gz"} diff --git a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/cc-index-table-records.jsonl b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/cc-index-table-records.jsonl new file mode 100644 index 00000000..9ab2a22d --- /dev/null +++ b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/cc-index-table-records.jsonl @@ -0,0 +1,3 @@ +{"cc-index-table-records/url": "cc-index/table/cc-main/warc/crawl=CC-MAIN-2025-43/subset=crawldiagnostics/part-00000-08d3d8a4-29d7-4627-90c5-75c34ee698ca.c000.gz.parquet"} +{"cc-index-table-records/url": "cc-index/table/cc-main/warc/crawl=CC-MAIN-2025-43/subset=crawldiagnostics/part-00001-08d3d8a4-29d7-4627-90c5-75c34ee698ca.c000.gz.parquet"} +{"cc-index-table-records/url": "cc-index/table/cc-main/warc/crawl=CC-MAIN-2025-43/subset=crawldiagnostics/part-00002-08d3d8a4-29d7-4627-90c5-75c34ee698ca.c000.gz.parquet"} diff --git a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/non200responses-records.jsonl b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/non200responses-records.jsonl new file mode 100644 index 00000000..12870c77 --- /dev/null +++ b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/non200responses-records.jsonl @@ -0,0 +1,3 @@ +{"non200responses-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/crawldiagnostics/CC-MAIN-20251005114239-20251005144239-00000.warc.gz"} +{"non200responses-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/crawldiagnostics/CC-MAIN-20251005114239-20251005144239-00001.warc.gz"} +{"non200responses-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/crawldiagnostics/CC-MAIN-20251005114239-20251005144239-00002.warc.gz"} diff --git a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/robotstxt-records.jsonl b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/robotstxt-records.jsonl new file mode 100644 index 00000000..e528fbe7 --- /dev/null +++ b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/robotstxt-records.jsonl @@ -0,0 +1,3 @@ +{"robotstxt-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/robotstxt/CC-MAIN-20251005114239-20251005144239-00000.warc.gz"} +{"robotstxt-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/robotstxt/CC-MAIN-20251005114239-20251005144239-00001.warc.gz"} +{"robotstxt-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/robotstxt/CC-MAIN-20251005114239-20251005144239-00002.warc.gz"} diff --git a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/wat-records.jsonl b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/wat-records.jsonl new file mode 100644 index 00000000..0922074d --- /dev/null +++ b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/wat-records.jsonl @@ -0,0 +1,3 @@ +{"wat-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/wat/CC-MAIN-20251005114239-20251005144239-00000.warc.wat.gz"} +{"wat-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/wat/CC-MAIN-20251005114239-20251005144239-00001.warc.wat.gz"} +{"wat-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/wat/CC-MAIN-20251005114239-20251005144239-00002.warc.wat.gz"} diff --git a/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/wet-records.jsonl b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/wet-records.jsonl new file mode 100644 index 00000000..05b865ff --- /dev/null +++ b/datasets/1.1/commoncrawl-CC-MAIN-2025-43-draft/output/wet-records.jsonl @@ -0,0 +1,3 @@ +{"wet-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/wet/CC-MAIN-20251005114239-20251005144239-00000.warc.wet.gz"} +{"wet-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/wet/CC-MAIN-20251005114239-20251005144239-00001.warc.wet.gz"} +{"wet-records/url": "crawl-data/CC-MAIN-2025-43/segments/1759648357851.76/wet/CC-MAIN-20251005114239-20251005144239-00002.warc.wet.gz"}