forked from Zie619/n8n-workflows
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_categories.py
More file actions
248 lines (191 loc) · 9.73 KB
/
create_categories.py
File metadata and controls
248 lines (191 loc) · 9.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import json
import os
from pathlib import Path
import glob
import re
def load_def_categories():
"""Load the definition categories from def_categories.json"""
def_categories_path = Path("context/def_categories.json")
with open(def_categories_path, 'r', encoding='utf-8') as f:
raw_map = json.load(f)
# Normalize keys: strip non-alphanumerics and lowercase
integration_to_category = {
re.sub(r"[^a-z0-9]", "", item["integration"].lower()): item["category"]
for item in raw_map
}
return integration_to_category
def extract_tokens_from_filename(filename):
"""Extract tokens from filename by splitting on '_' and removing '.json'"""
# Remove .json extension
name_without_ext = filename.replace('.json', '')
# Split by underscore
tokens = name_without_ext.split('_')
# Convert to lowercase for matching
tokens = [token.lower() for token in tokens if token]
return tokens
def find_matching_category(tokens, integration_to_category):
"""Find the first matching category for the given tokens"""
for token in tokens:
# Normalize token same as keys
norm = re.sub(r"[^a-z0-9]", "", token.lower())
if norm in integration_to_category:
return integration_to_category[norm]
# Try partial matches for common variations
for token in tokens:
norm = re.sub(r"[^a-z0-9]", "", token.lower())
for integration_key in integration_to_category:
if norm in integration_key or integration_key in norm:
return integration_to_category[integration_key]
return ""
def categorize_by_filename(filename):
"""
Categorize workflow based on filename patterns.
Returns the most likely category or None if uncertain.
"""
filename_lower = filename.lower()
# Security & Authentication
if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):
return "Technical Infrastructure & DevOps"
# Data Processing & File Operations
if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile', 'googlefirebasecloudfirestore', 'supabase', 'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches', 'airtop', 'comparedatasets', 'spreadsheetfile', 'calcslive']):
return "Data Processing & Analysis"
# Utility & Business Process Automation
if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue', 'stickynote', 'n8ntrainingcustomerdatastore', 'n8n']):
return "Business Process Automation"
# Webhook & API related
if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http', 'rssfeedread']):
return "Web Scraping & Data Extraction"
# Form & Data Collection
if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):
return "Data Processing & Analysis"
# Local file operations
if any(word in filename_lower for word in ['localfile', 'filemaker']):
return "Cloud Storage & File Management"
# Database operations
if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):
return "Data Processing & Analysis"
# AI & Machine Learning
if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus', 'googletranslate', 'summarize']):
return "AI Agent Development"
# E-commerce specific
if any(word in filename_lower for word in ['woocommerce', 'gumroad']):
return "E-commerce & Retail"
# Social media specific
if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):
return "Social Media Management"
# Customer support
if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):
return "Communication & Messaging"
# Analytics & Tracking
if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):
return "Data Processing & Analysis"
# Development tools
if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot', 'gsuiteadmin', 'debughelper', 'bitbucket']):
return "Technical Infrastructure & DevOps"
# CRM & Sales tools
if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm']):
return "CRM & Sales"
# Marketing tools
if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist', 'sendy', 'postmark', 'mailgun']):
return "Marketing & Advertising Automation"
# Project management
if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl', 'microsofttodo', 'calendly', 'jira']):
return "Project Management"
# Communication
if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio', 'emailreadimap', 'teams', 'gotowebinar']):
return "Communication & Messaging"
# Cloud storage
if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs']):
return "Cloud Storage & File Management"
# Creative tools
if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):
return "Creative Design Automation"
# Video & content
if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):
return "Creative Content & Video Automation"
# Financial tools
if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):
return "Financial & Accounting"
# Weather & external APIs
if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):
return "Web Scraping & Data Extraction"
return ""
def main():
# Load definition categories
integration_to_category = load_def_categories()
# Get all JSON files from workflows directory
workflows_dir = Path("workflows")
json_files = glob.glob(
os.path.join(workflows_dir, "**", "*.json"),
recursive=True
)
# Process each file
search_categories = []
for json_file in json_files:
path_obj = Path(json_file)
filename = path_obj.name
tokens = extract_tokens_from_filename(filename)
category = find_matching_category(tokens, integration_to_category)
search_categories.append({
"filename": filename,
"category": category
})
# Second pass for categorization
for item in search_categories:
if not item['category']:
item['category'] = categorize_by_filename(item['filename'])
# Sort by filename for consistency
search_categories.sort(key=lambda x: x['filename'])
# Write to search_categories.json
output_path = Path("context/search_categories.json")
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(search_categories, f, indent=2, ensure_ascii=False)
print(f"Generated search_categories.json with {len(search_categories)} entries")
# Generate unique categories list for API
unique_categories = set()
for item in search_categories:
if item['category']:
unique_categories.add(item['category'])
# Always include 'Uncategorized' for workflows without categories
unique_categories.add('Uncategorized')
# Sort categories alphabetically
categories_list = sorted(list(unique_categories))
# Write unique categories to a separate file for API consumption
categories_output_path = Path("context/unique_categories.json")
with open(categories_output_path, 'w', encoding='utf-8') as f:
json.dump(categories_list, f, indent=2, ensure_ascii=False)
print(f"Generated unique_categories.json with {len(categories_list)} categories")
# Print some statistics
categorized = sum(1 for item in search_categories if item['category'])
uncategorized = len(search_categories) - categorized
print(f"Categorized: {categorized}, Uncategorized: {uncategorized}")
# Print detailed category statistics
print("\n" + "="*50)
print("CATEGORY DISTRIBUTION (Top 20)")
print("="*50)
# Count categories
category_counts = {}
for item in search_categories:
category = item['category'] if item['category'] else "Uncategorized"
category_counts[category] = category_counts.get(category, 0) + 1
# Sort by count (descending)
sorted_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)
# Display top 20
for i, (category, count) in enumerate(sorted_categories[:20], 1):
print(f"{i:2d}. {category:<40} {count:>4} files")
if len(sorted_categories) > 20:
remaining = len(sorted_categories) - 20
print(f"\n... and {remaining} more categories")
# Write tips on uncategorized workflows
print("\n" + "="*50)
print("Tips on uncategorized workflows")
print("="*50)
print("1. At the search, you'll be able to list all uncategorized workflows.")
print("2. If the workflow JSON filename has a clear service name (eg. Twilio), it could just be we are missing its category definition at context/def_categories.json.")
print("3. You can contribute to the category definitions and then make a pull request to help improve the search experience.")
# Done message
print("\n" + "="*50)
print("Done! Search re-indexed with categories.")
print("="*50)
if __name__ == "__main__":
main()