Skip to content

Commit 7833486

Browse files
chore(ua-blocker): update robots.json from upstream (#1590)
Co-authored-by: yusukebe <10682+yusukebe@users.noreply.github.com>
1 parent 02ab9e8 commit 7833486

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

.changeset/auto-sync-robots.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@hono/ua-blocker': patch
3+
---
4+
5+
chore(ua-blocker): sync `robots.json` with upstream

packages/ua-blocker/src/data/robots.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,13 @@
440440
"frequency": "Indexes based on 'change signals' and user configuration.",
441441
"description": "Indexes content to tailor AI experiences, generate content, answers and recommendations."
442442
},
443+
"LAIONDownloader": {
444+
"operator": "[Large-scale Artificial Intelligence Open Network](https://laion.ai/)",
445+
"respect": "[No](https://laion.ai/faq/)",
446+
"function": "AI tools and models for machine learning research.",
447+
"frequency": "Unclear at this time.",
448+
"description": "LAIONDownloader is a bot by LAION, a non-profit organization that provides datasets, tools and models to liberate machine learning research."
449+
},
443450
"LinerBot": {
444451
"operator": "Unclear at this time.",
445452
"respect": "Unclear at this time.",

packages/ua-blocker/src/generated.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ User-agent: img2dataset
6363
User-agent: ISSCyberRiskCrawler
6464
User-agent: Kangaroo Bot
6565
User-agent: KlaviyoAIBot
66+
User-agent: LAIONDownloader
6667
User-agent: LinerBot
6768
User-agent: Linguee Bot
6869
User-agent: meta-externalagent
@@ -113,10 +114,10 @@ User-agent: YouBot
113114
Disallow: /
114115
`;
115116
// prettier-ignore
116-
export const ALL_BOTS = ["AddSearchBot", "AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "AmazonBuyForMe", "atlassian-bot", "amazon-kendra", "Amazonbot", "Andibot", "Anomura", "anthropic-ai", "Applebot", "Applebot-Extended", "Awario", "bedrockbot", "bigsur.ai", "Bravebot", "Brightbot 1.0", "BuddyBot", "Bytespider", "CCBot", "ChatGPT Agent", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "Cloudflare-AutoRAG", "CloudVertexBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Datenbank Crawler", "DeepSeekBot", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "FacebookBot", "facebookexternalhit", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Gemini-Deep-Research", "Google-CloudVertexBot", "Google-Extended", "Google-Firebase", "Google-NotebookLM", "GoogleAgent-Mariner", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "IbouBot", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "KlaviyoAIBot", "LinerBot", "Linguee Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "meta-webindexer", "MistralAI-User", "MistralAI-User/1.0", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NotebookLM", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "OpenAI", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot-OCOB", "SemrushBot-SWA", "ShapBot", "Sidetrade indexer bot", "TerraCotta", "Thinkbot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "WARDBot", "Webzio-Extended", "wpbot", "YaK", "YandexAdditional", "YandexAdditionalBot", "YouBot"];
117+
export const ALL_BOTS = ["AddSearchBot", "AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "AmazonBuyForMe", "atlassian-bot", "amazon-kendra", "Amazonbot", "Andibot", "Anomura", "anthropic-ai", "Applebot", "Applebot-Extended", "Awario", "bedrockbot", "bigsur.ai", "Bravebot", "Brightbot 1.0", "BuddyBot", "Bytespider", "CCBot", "ChatGPT Agent", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "Cloudflare-AutoRAG", "CloudVertexBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Datenbank Crawler", "DeepSeekBot", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "FacebookBot", "facebookexternalhit", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Gemini-Deep-Research", "Google-CloudVertexBot", "Google-Extended", "Google-Firebase", "Google-NotebookLM", "GoogleAgent-Mariner", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "IbouBot", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "KlaviyoAIBot", "LAIONDownloader", "LinerBot", "Linguee Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "meta-webindexer", "MistralAI-User", "MistralAI-User/1.0", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NotebookLM", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "OpenAI", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot-OCOB", "SemrushBot-SWA", "ShapBot", "Sidetrade indexer bot", "TerraCotta", "Thinkbot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "WARDBot", "Webzio-Extended", "wpbot", "YaK", "YandexAdditional", "YandexAdditionalBot", "YouBot"];
117118
// prettier-ignore
118-
export const NON_RESPECTING_BOTS = ["AddSearchBot", "AmazonBuyForMe", "Andibot", "anthropic-ai", "Applebot", "Awario", "bigsur.ai", "Brightbot 1.0", "BuddyBot", "Bytespider", "Claude-Web", "CloudVertexBot", "cohere-ai", "cohere-training-data-crawler", "Datenbank Crawler", "DeepSeekBot", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "facebookexternalhit", "Factset_spyderbot", "Gemini-Deep-Research", "Google-Firebase", "Google-NotebookLM", "GoogleAgent-Mariner", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "LinerBot", "Linguee Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "meta-webindexer", "MistralAI-User", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NotebookLM", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "Thinkbot", "TikTokSpider", "Timpibot", "WARDBot", "Webzio-Extended", "wpbot", "YaK"];
119+
export const NON_RESPECTING_BOTS = ["AddSearchBot", "AmazonBuyForMe", "Andibot", "anthropic-ai", "Applebot", "Awario", "bigsur.ai", "Brightbot 1.0", "BuddyBot", "Bytespider", "Claude-Web", "CloudVertexBot", "cohere-ai", "cohere-training-data-crawler", "Datenbank Crawler", "DeepSeekBot", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "facebookexternalhit", "Factset_spyderbot", "Gemini-Deep-Research", "Google-Firebase", "Google-NotebookLM", "GoogleAgent-Mariner", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "LAIONDownloader", "LinerBot", "Linguee Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "meta-webindexer", "MistralAI-User", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NotebookLM", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "Thinkbot", "TikTokSpider", "Timpibot", "WARDBot", "Webzio-Extended", "wpbot", "YaK"];
119120
// prettier-ignore
120-
export const ALL_BOTS_REGEX = /(ADDSEARCHBOT|AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBUYFORME|ATLASSIAN-BOT|AMAZON-KENDRA|AMAZONBOT|ANDIBOT|ANOMURA|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|AWARIO|BEDROCKBOT|BIGSUR.AI|BRAVEBOT|BRIGHTBOT 1.0|BUDDYBOT|BYTESPIDER|CCBOT|CHATGPT AGENT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|CLOUDFLARE-AUTORAG|CLOUDVERTEXBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DATENBANK CRAWLER|DEEPSEEKBOT|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GEMINI-DEEP-RESEARCH|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLE-FIREBASE|GOOGLE-NOTEBOOKLM|GOOGLEAGENT-MARINER|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|IBOUBOT|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|KLAVIYOAIBOT|LINERBOT|LINGUEE BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|META-WEBINDEXER|MISTRALAI-USER|MISTRALAI-USER\/1.0|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOTEBOOKLM|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPENAI|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT-OCOB|SEMRUSHBOT-SWA|SHAPBOT|SIDETRADE INDEXER BOT|TERRACOTTA|THINKBOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WARDBOT|WEBZIO-EXTENDED|WPBOT|YAK|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/;
121+
export const ALL_BOTS_REGEX = /(ADDSEARCHBOT|AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBUYFORME|ATLASSIAN-BOT|AMAZON-KENDRA|AMAZONBOT|ANDIBOT|ANOMURA|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|AWARIO|BEDROCKBOT|BIGSUR.AI|BRAVEBOT|BRIGHTBOT 1.0|BUDDYBOT|BYTESPIDER|CCBOT|CHATGPT AGENT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|CLOUDFLARE-AUTORAG|CLOUDVERTEXBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DATENBANK CRAWLER|DEEPSEEKBOT|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GEMINI-DEEP-RESEARCH|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLE-FIREBASE|GOOGLE-NOTEBOOKLM|GOOGLEAGENT-MARINER|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|IBOUBOT|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|KLAVIYOAIBOT|LAIONDOWNLOADER|LINERBOT|LINGUEE BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|META-WEBINDEXER|MISTRALAI-USER|MISTRALAI-USER\/1.0|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOTEBOOKLM|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPENAI|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT-OCOB|SEMRUSHBOT-SWA|SHAPBOT|SIDETRADE INDEXER BOT|TERRACOTTA|THINKBOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WARDBOT|WEBZIO-EXTENDED|WPBOT|YAK|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/;
121122
// prettier-ignore
122-
export const NON_RESPECTING_BOTS_REGEX = /(ADDSEARCHBOT|AMAZONBUYFORME|ANDIBOT|ANTHROPIC-AI|APPLEBOT|AWARIO|BIGSUR.AI|BRIGHTBOT 1.0|BUDDYBOT|BYTESPIDER|CLAUDE-WEB|CLOUDVERTEXBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DATENBANK CRAWLER|DEEPSEEKBOT|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|GEMINI-DEEP-RESEARCH|GOOGLE-FIREBASE|GOOGLE-NOTEBOOKLM|GOOGLEAGENT-MARINER|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|LINERBOT|LINGUEE BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|META-WEBINDEXER|MISTRALAI-USER|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOTEBOOKLM|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|THINKBOT|TIKTOKSPIDER|TIMPIBOT|WARDBOT|WEBZIO-EXTENDED|WPBOT|YAK)/;
123+
export const NON_RESPECTING_BOTS_REGEX = /(ADDSEARCHBOT|AMAZONBUYFORME|ANDIBOT|ANTHROPIC-AI|APPLEBOT|AWARIO|BIGSUR.AI|BRIGHTBOT 1.0|BUDDYBOT|BYTESPIDER|CLAUDE-WEB|CLOUDVERTEXBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DATENBANK CRAWLER|DEEPSEEKBOT|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|GEMINI-DEEP-RESEARCH|GOOGLE-FIREBASE|GOOGLE-NOTEBOOKLM|GOOGLEAGENT-MARINER|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|LAIONDOWNLOADER|LINERBOT|LINGUEE BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|META-WEBINDEXER|MISTRALAI-USER|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOTEBOOKLM|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|THINKBOT|TIKTOKSPIDER|TIMPIBOT|WARDBOT|WEBZIO-EXTENDED|WPBOT|YAK)/;

0 commit comments

Comments
 (0)