22import os .path
33from datetime import datetime
44import re
5- from typing import Optional
5+ from typing import Optional , Dict , Set
66
77import bcrypt
88from flask import url_for
1414
1515from . import thumbnail
1616from .database import Base
17+ from .str_similarity import meaningful_words , words_similarity
1718
1819
1920class Following (Base ): # type: ignore
@@ -235,6 +236,7 @@ class Mod(Base): # type: ignore
235236 followings = relationship ('Following' , back_populates = 'mod' )
236237 # List of users that follow this mod
237238 followers = association_proxy ('followings' , 'user' )
239+ similar_mods = association_proxy ('similarities' , 'other_mod' )
238240
def background_thumb(self) -> Optional[str]:
    """Return the result of ``thumbnail.get_or_create`` for this object.

    NOTE(review): presumably a path/URL of the background thumbnail, or
    None when there is none -- confirm against the thumbnail module.
    """
    thumb = thumbnail.get_or_create(self)
    return thumb
@@ -251,6 +253,20 @@ def background_url(self, protocol: Optional[str], cdn_domain: Optional[str]) ->
251253 else :
252254 return url_for ('mods.mod_background' , mod_id = self .id , mod_name = self .name )
253255
def get_author_names(self) -> Set[str]:
    """Return the usernames of ``self.user`` and of every shared author.

    The set is built on the first call and stored on the instance as
    ``_author_names``; later calls hand back that stored set without
    looking at ``user`` or ``shared_authors`` again.
    """
    if hasattr(self, '_author_names'):
        return self._author_names
    names: Set[str] = {self.user.username}
    names.update(author.username for author in self.shared_authors)
    self._author_names = names
    return names
261+
def get_words(self, prop_name: str) -> Set[str]:
    """Return the ``meaningful_words`` of the attribute named ``prop_name``.

    Only parse the strings once to speed up mass-compares: the result for
    each attribute name is kept in ``self._words`` and reused afterwards.

    :param prop_name: name of the attribute on this object to parse
    :return: the set produced by ``meaningful_words`` for that attribute
    """
    if not hasattr(self, '_words'):
        self._words: Dict[str, Set[str]] = {}
    try:
        return self._words[prop_name]
    except KeyError:
        pass
    # getattr's default only applies when the attribute is missing; an
    # attribute that exists but holds None would otherwise reach
    # meaningful_words as None, so fall back to '' for any falsy value.
    # NOTE(review): assumes these text attributes can be None (e.g. NULL
    # columns) -- confirm against the column definitions.
    text = getattr(self, prop_name, None) or ''
    words = self._words[prop_name] = meaningful_words(text)
    return words
269+
def __repr__(self) -> str:
    """Return a debug representation showing the mod's id and name."""
    shown = (self.id, self.name)
    return '<Mod %r %r>' % shown
256272
@@ -300,6 +316,36 @@ def __repr__(self) -> str:
300316 return '<SharedAuthor %r>' % self .user_id
301317
302318
class ModSimilarity(Base):  # type: ignore
    """Stored similarity score between a main mod and one other mod.

    Rows are keyed by the (main_mod_id, other_mod_id) pair. The score is
    computed in the constructor from the two mods' author names and from
    the words of each attribute listed in WORD_PROPS.
    """
    __tablename__ = 'mod_similarity'
    __table_args__ = (
        PrimaryKeyConstraint('main_mod_id', 'other_mod_id', name='pk_mods'),
    )

    # Score assigned by __init__; declared first because the backref and
    # the index below both order by it.
    similarity = Column(Float(precision=5), nullable=False)

    # The mod this row belongs to; exposed on Mod as ``similarities``,
    # ordered by descending score.
    main_mod_id = Column(Integer,
                         ForeignKey('mod.id', ondelete='CASCADE'),
                         nullable=False)
    main_mod = relationship(
        'Mod',
        foreign_keys=main_mod_id,
        backref=backref('similarities',
                        passive_deletes=True,
                        order_by=similarity.desc()),
    )

    # The mod that main_mod is being compared against.
    other_mod_id = Column(Integer,
                          ForeignKey('mod.id', ondelete='CASCADE'),
                          nullable=False)
    other_mod = relationship('Mod', foreign_keys=other_mod_id)

    Index('ix_mod_similarity_main_mod_similarity',
          main_mod_id,
          similarity.desc())

    # Attribute names of Mod whose words take part in the score.
    WORD_PROPS = ['name', 'short_description', 'description']

    def __init__(self, main_mod: Mod, other_mod: Mod) -> None:
        """Record both mod ids and compute the similarity score.

        The score is 0.1 times the similarity of the two author-name
        sets plus the summed word similarity of every WORD_PROPS
        attribute, so author overlap weighs a tenth of one text attribute.
        """
        self.main_mod_id = main_mod.id
        self.other_mod_id = other_mod.id
        author_score = words_similarity(main_mod.get_author_names(),
                                        other_mod.get_author_names())
        text_score = sum(
            words_similarity(main_mod.get_words(prop_name),
                             other_mod.get_words(prop_name))
            for prop_name in self.WORD_PROPS
        )
        self.similarity = 0.1 * author_score + text_score

    def __repr__(self) -> str:
        """Return a debug representation showing both mod ids."""
        return f'<Mod Similarity {self.main_mod_id} {self.other_mod_id}>'
347+
348+
303349class DownloadEvent (Base ): # type: ignore
304350 __tablename__ = 'downloadevent'
305351 id = Column (Integer , primary_key = True )
0 commit comments