Skip to content

Commit 9492ca7

Browse files
authored
Merge pull request #329 from namehash/rename-names-to-labels
Rename names to labels
2 parents a9bd549 + 9b67902 commit 9492ca7

30 files changed

81 additions and 737573 deletions (lines changed)

models.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ class Params(BaseModel):
7575
description='if true, the results will be sorted by '
7676
'learning to rank algorithm')
7777
label_diversity_ratio: Optional[float] = \
78-
Field(0.5, examples=[0.5], ge=0.0, le=1.0, title='collection diversity parameter based on names',
79-
description='adds penalty to collections with similar names to other collections\n'
78+
Field(0.5, examples=[0.5], ge=0.0, le=1.0, title='collection diversity parameter based on labels',
79+
description='adds penalty to collections with similar labels to other collections\n'
8080
'if null, then no penalty will be added')
8181
max_per_type: Optional[int] = \
8282
Field(2, examples=[2], ge=1, title='collection diversity parameter based on collection types',
@@ -119,8 +119,8 @@ class RelatedCategoryParams(BaseModel):
119119
max_related_collections: int = Field(6, ge=0, le=10,
120120
title='max number of related collections returned. '
121121
'If 0 it effectively turns off any related collection search.')
122-
max_names_per_related_collection: int = Field(10, ge=1, le=10,
123-
title='max number of names returned in any related collection')
122+
max_labels_per_related_collection: int = Field(10, ge=1, le=10,
123+
title='max number of labels returned in any related collection')
124124
max_recursive_related_collections: int = Field(3, ge=0, le=10,
125125
title='Set to 0 to disable the "recursive related collection search". '
126126
'When set to a value between 1 and 10, '
@@ -131,8 +131,8 @@ class RelatedCategoryParams(BaseModel):
131131
description='if true, the results will be sorted by '
132132
'learning to rank algorithm')
133133
label_diversity_ratio: Optional[float] = \
134-
Field(0.5, examples=[0.5], ge=0.0, le=1.0, title='collection diversity parameter based on names',
135-
description='adds penalty to collections with similar names to other collections\n'
134+
Field(0.5, examples=[0.5], ge=0.0, le=1.0, title='collection diversity parameter based on labels',
135+
description='adds penalty to collections with similar labels to other collections\n'
136136
'if null, then no penalty will be added')
137137
max_per_type: Optional[int] = \
138138
Field(2, examples=[2], ge=1, title='collection diversity parameter based on collection types',
@@ -158,10 +158,10 @@ class GroupedLabelRequest(BaseModel):
158158
description='* cannot contain dots (.)'
159159
'\n* if enclosed in double quotes assuming label is pre-tokenized')
160160

161-
# min_primary_fraction: float = Field(0.1, title='minimal fraction of primary names',
161+
# min_primary_fraction: float = Field(0.1, title='minimal fraction of primary labels',
162162
# ge=0.0, le=1.0,
163163
# description='ensures at least `min_suggestions * min_primary_fraction` '
164-
# 'primary names will be generated')
164+
# 'primary labels will be generated')
165165
params: GroupedParams = Field(GroupedParams(), title='pipeline parameters',
166166
description='includes all the parameters for all nodes of the pipeline')
167167

@@ -179,10 +179,10 @@ class LabelRequest(BaseModel):
179179
ge=1, le=generator.config.generation.limit)
180180
max_suggestions: int = Field(100, title='maximal number of suggestions to generate',
181181
ge=1)
182-
min_primary_fraction: float = Field(0.1, title='minimal fraction of primary names',
182+
min_primary_fraction: float = Field(0.1, title='minimal fraction of primary labels',
183183
ge=0.0, le=1.0,
184184
description='ensures at least `min_suggestions * min_primary_fraction` '
185-
'primary names will be generated')
185+
'primary labels will be generated')
186186
params: Optional[Params] = Field(None, title='pipeline parameters',
187187
description='includes all the parameters for all nodes of the pipeline')
188188

namegraph/generation/collection_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def apply(self, name: InputName, interpretation: Interpretation) -> Iterable[Gen
8686
tokens.extend(emojis)
8787

8888
params = name.params if name.params is not None else dict()
89-
suggestions_limit = max(params.get('max_names_per_related_collection', 0), self.suggestions_limit)
89+
suggestions_limit = max(params.get('max_labels_per_related_collection', 0), self.suggestions_limit)
9090
logger.info(f'CollectionGenerator query: {tokens}')
9191
collections, _ = self.collection_matcher.search_for_generator(
9292
tuple(tokens),

namegraph/xgenerator.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def generate_grouped_names(
154154
self,
155155
name: str,
156156
max_related_collections: int = 5,
157-
max_names_per_related_collection: int = 5,
157+
max_labels_per_related_collection: int = 5,
158158
max_recursive_related_collections: int = 5,
159159
categories_params=None,
160160
min_total_suggestions: int = 50,
@@ -164,7 +164,7 @@ def generate_grouped_names(
164164
categories_params = categories_params or {}
165165

166166
params['max_related_collections'] = max_related_collections
167-
params['max_names_per_related_collection'] = max_names_per_related_collection
167+
params['max_labels_per_related_collection'] = max_labels_per_related_collection
168168
params['max_recursive_related_collections'] = max_recursive_related_collections
169169
params['categories_params'] = categories_params
170170
params['min_total_suggestions'] = min_total_suggestions
@@ -201,7 +201,7 @@ def generate_grouped_names(
201201
max_suggestions = category_params.max_suggestions
202202
except AttributeError: # RelatedCategoryParams
203203
min_suggestions = 0
204-
max_suggestions = 3 * category_params.max_related_collections * max(category_params.max_names_per_related_collection, self.config.collections.suggestions_limit) # 3 interpretations
204+
max_suggestions = 3 * category_params.max_related_collections * max(category_params.max_labels_per_related_collection, self.config.collections.suggestions_limit) # 3 interpretations
205205

206206
# TODO should they use the same set of suggestions (for deduplications)
207207
suggestions = meta_sampler.sample(name, 'weighted-sampling',
@@ -235,7 +235,7 @@ def is_already_sampled(suggestion: str) -> bool:
235235
max_suggestions = category_params.max_suggestions
236236
except AttributeError: # RelatedCategoryParams
237237
min_suggestions = 0
238-
max_suggestions = 3 * category_params.max_related_collections * max(category_params.max_names_per_related_collection, self.config.collections.suggestions_limit)
238+
max_suggestions = 3 * category_params.max_related_collections * max(category_params.max_labels_per_related_collection, self.config.collections.suggestions_limit)
239239

240240
futures[executor.submit(meta_sampler.sample, name, 'weighted-sampling',
241241
min_suggestions=min_suggestions, max_suggestions=max_suggestions,
@@ -265,7 +265,7 @@ def is_already_sampled(suggestion: str) -> bool:
265265
collections_id2related[suggestion.collection_id] = suggestion.related_collections or []
266266

267267
collection_suggestions = all_related_suggestions[suggestion.collection_id]
268-
if len(collection_suggestions) < max_names_per_related_collection:
268+
if len(collection_suggestions) < max_labels_per_related_collection:
269269
collection_suggestions.append(suggestion)
270270
del grouped_suggestions['related']
271271

@@ -319,7 +319,7 @@ def is_already_sampled(suggestion: str) -> bool:
319319

320320
category_params = getattr(categories_params, 'related')
321321
for category, related_suggestions in all_related_suggestions.items():
322-
max_suggestions = category_params.max_names_per_related_collection
322+
max_suggestions = category_params.max_labels_per_related_collection
323323
related_suggestions.data = related_suggestions.data[:max_suggestions]
324324

325325
# cap related collections to max_related_collections

research/elasticsearch/generate-report-only-collections.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def write(s: str):
107107
'country': 'pl'
108108
}})
109109

110-
input_names = ['fire', 'funny', 'funnyshit', 'funnyshitass', 'funnyshitshit', 'lightwalker', 'josiahadams',
110+
input_labels = ['fire', 'funny', 'funnyshit', 'funnyshitass', 'funnyshitshit', 'lightwalker', 'johndoe',
111111
'kwrobel', 'krzysztofwrobel', 'pikachu', 'mickey', 'adoreyoureyes', 'face', 'theman', 'goog',
112112
'billycorgan', '[003fda97309fd6aa9d7753dcffa37da8bb964d0fb99eba99d0770e76fc5bac91]', 'a' * 101,
113113
'dogcat', 'firepower', 'tubeyou', 'fireworks', 'hacker', 'firecar', '😊😊😊', 'anarchy',
@@ -155,7 +155,7 @@ def write(s: str):
155155
times = []
156156

157157
request_times = collections.defaultdict(list)
158-
for input_name in tqdm(input_names):
158+
for input_name in tqdm(input_labels):
159159
write(f'<h1>{input_name}</h1>')
160160

161161
write(f'<section>')
@@ -282,11 +282,11 @@ def write(s: str):
282282

283283
write(f'<h1>Mean share</h1>')
284284
for generator_name, values in sorted(stats.items(), key=lambda x: sum(x[1]), reverse=True):
285-
write(f'<p>{(100 * sum(values) / len(input_names)):.2f}% {generator_name}</p>')
285+
write(f'<p>{(100 * sum(values) / len(input_labels)):.2f}% {generator_name}</p>')
286286

287287
write(f'<h1>MRR</h1>')
288288
for generator_name, values in sorted(mrr.items(), key=lambda x: sum(x[1]), reverse=True):
289-
write(f'<p>{(sum(values) / len(input_names)):.2f} {generator_name}</p>')
289+
write(f'<p>{(sum(values) / len(input_labels)):.2f} {generator_name}</p>')
290290

291291
write(f'<h1>First position</h1>')
292292
for generator_name, values in sorted(first_position.items(), key=lambda x: sum(x[1]) / len(x[1]), reverse=False):
@@ -301,7 +301,7 @@ def write(s: str):
301301
for i, position in enumerate(positions):
302302
ap.append((i + 1) / position)
303303
map.append(sum(ap) / len(ap))
304-
maps.append((sum(map) / len(input_names), generator_name))
304+
maps.append((sum(map) / len(input_labels), generator_name))
305305

306306
for map, generator_name in sorted(maps, reverse=True):
307307
write(f'<p>{map:.2f} {generator_name}</p>')

research/elasticsearch/generate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
python search.py $1 --host $ES_HOST --port $ES_PORT --username $ES_USERNAME --password $ES_PASSWORD "apple" "apples" "bmw" "hulk" "marvel" "marvel characters" "fruit" "fruits" "britney spears" "bmw car models" "cars" "football players" "cristiano ronaldo" "planets" "countries" "france" "switzerland" "bmw vehicles" "greek gods" "zeus" "athena" "fire" "funny" "funny shit" "funny shit ass" "funny shit shit" "light walker" "josiah adams" "k wrobel" "krzysztof wrobel" "pikachu" "mickey" "adore your eyes" "face" "the man" "goog" "billy corgan" "003 fda 97309 fd 6 a a 9 d 7753 dc ffa 37 da 8 bb 964 d 0 fb 99 eb a 99 d 0770 e 76 fc 5 bac 91" "aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaa a" "dog cat" "firepower" "tube you" "fireworks" "hacker" "fire car" "" "anarchy" "pray for ukraine" "krakow dragon" "fifty six" "" "" "asd" "bartek" "hongkong" "hongkonger" "tyler" "as df as df as df 3453212345" "nine inch nails" "krakow" "joe biden" "european union" "roger federer" "suzuki" "pirates" "doge" "eth corner" "google" "apple" "001" "stop doing fake bids its honestly lame my guy" "kfc so good" "wallet" "" "porno" "sex" "slut wife" "god" "im expensive" "htaccess" "nike" "80000" "starbucks" "ukraine" "" "sony" "kevin" "discord" "monaco" "market" "sports bet" "vol o dy myr ze lensky" "coffee" "gold" "hodl" "yeezy" "brantly" "jeezy" "vitalik" "example registration" "py me" "avalanche" "messy" "messi" "king messi" "abc" "testing" "superman" "facebook" "test" "name hash" "test b" "happy people" "muscle" "billy bob" "quo" "circle ci" "bitcoin mine" "power outage" "shooting arrow at the sky" "pink floyd" "highest mountains"
1+
python search.py $1 --host $ES_HOST --port $ES_PORT --username $ES_USERNAME --password $ES_PASSWORD "apple" "apples" "bmw" "hulk" "marvel" "marvel characters" "fruit" "fruits" "britney spears" "bmw car models" "cars" "football players" "cristiano ronaldo" "planets" "countries" "france" "switzerland" "bmw vehicles" "greek gods" "zeus" "athena" "fire" "funny" "funny shit" "funny shit ass" "funny shit shit" "light walker" "john doe" "k wrobel" "krzysztof wrobel" "pikachu" "mickey" "adore your eyes" "face" "the man" "goog" "billy corgan" "003 fda 97309 fd 6 a a 9 d 7753 dc ffa 37 da 8 bb 964 d 0 fb 99 eb a 99 d 0770 e 76 fc 5 bac 91" "aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaaaa aaaa a" "dog cat" "firepower" "tube you" "fireworks" "hacker" "fire car" "" "anarchy" "pray for ukraine" "krakow dragon" "fifty six" "" "" "asd" "bartek" "hongkong" "hongkonger" "tyler" "as df as df as df 3453212345" "nine inch nails" "krakow" "joe biden" "european union" "roger federer" "suzuki" "pirates" "doge" "eth corner" "google" "apple" "001" "stop doing fake bids its honestly lame my guy" "kfc so good" "wallet" "" "porno" "sex" "slut wife" "god" "im expensive" "htaccess" "nike" "80000" "starbucks" "ukraine" "" "sony" "kevin" "discord" "monaco" "market" "sports bet" "vol o dy myr ze lensky" "coffee" "gold" "hodl" "yeezy" "brantly" "jeezy" "vitalik" "example registration" "py me" "avalanche" "messy" "messi" "king messi" "abc" "testing" "superman" "facebook" "test" "name hash" "test b" "happy people" "muscle" "billy bob" "quo" "circle ci" "bitcoin mine" "power outage" "shooting arrow at the sky" "pink floyd" "highest mountains"

0 commit comments

Comments
 (0)