@@ -136,7 +136,7 @@ def get_files(doc_keys=None, drop=True):
 def gen_features(X, wday, yday, i, j, n, tf=None, u=None, n_features=1400):
     from sklearn.feature_extraction import FeatureHasher
     from sklearn.decomposition import TruncatedSVD
-    from scipy.spatial.distance import cosine
+    from scipy.spatial.distance import cosine, cdist
 
     from collections import Counter
     import numpy
@@ -182,11 +182,17 @@ def s_from_w(s):
 
     max_sim = wday * 0
 
+    i = numpy.array(i)
+    j = numpy.array(j)
+
+
     for K, _ in enumerate(max_sim):
         if i[K] > 0:
-            max_sim[K] = max((1 - cosine(SX[K, :], SX[K2, :])) ** 2 for K2 in range(K) if j[K] == j[K2])
+            max_sim[K] = (1 - cdist(SX[[K], :], SX[(j == j[K]) & (i < i[K]), :], metric='cosine') ** 2).max()
 
+
     i = numpy.array(i, ndmin=2)
+
     i_scaled = i / numpy.array(n, ndmin=2)
 
 
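The new line in the hunk above swaps a per-pair `cosine` generator for a single vectorized `cdist` call per row. A minimal sketch of that pattern, with made-up data and variable names borrowed from the diff (`SX`, `i`, `j`, `max_sim`):

```python
import numpy
from scipy.spatial.distance import cdist

SX = numpy.random.rand(6, 4)             # reduced feature rows (e.g. TruncatedSVD output)
i = numpy.array([0, 1, 2, 0, 1, 2])      # position of each item within its source
j = numpy.array([0, 0, 0, 1, 1, 1])      # source id of each item
max_sim = numpy.zeros(len(i))

for K, _ in enumerate(max_sim):
    if i[K] > 0:
        earlier = (j == j[K]) & (i < i[K])   # earlier items from the same source
        # cdist returns cosine distances; 1 - distance is cosine similarity
        sims = 1 - cdist(SX[[K], :], SX[earlier, :], metric='cosine')
        max_sim[K] = (sims ** 2).max()       # squared similarity, as in the old per-pair loop
```

One boolean mask per row selects all admissible comparisons at once, so the inner Python generator disappears.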
@@ -271,7 +277,7 @@ def objective(params):
     return MODEL
 
 
-def score_index(model_key="model.pickle"):
+def score_index(model_key="model.pickle", save=True):
     import xgboost as xgb
     import numpy
 
@@ -291,21 +297,37 @@ def score_index(model_key="model.pickle"):
     # Remove links that were already clicked
     print("Removing %d already clicked links" % sum(Y))
     Y = numpy.array(Y)
+    orig2 = [o for i, o in enumerate(orig) if Y[i] == 0]
     X = xgb.DMatrix(X[Y == 0, :], Y[Y == 0])
 
     yhat = r.predict(X)
 
-    bandit_max = min(1, max(yhat) * (yhat.shape[0] + 1) / yhat.shape[0])
 
     for i, _ in enumerate(yhat):
-        orig[i] = orig[i].replace("<div", f"<div data-score={yhat[i]} ", 1)
+        orig2[i] = orig2[i].replace("<div", f"<div data-score0={yhat[i]} ", 1)
         # Five percent greedy-epsilon bandit
         if numpy.random.uniform() < .05:
-            yhat[i] = numpy.random.uniform() * bandit_max
-            orig[i] = orig[i].replace("<div", "<div data-bandit=1", 1)
+            yhat[i] = numpy.random.choice(yhat)
+            #orig2[i] = orig2[i].replace("<div", "<div data-bandit=1", 1)
 
+
+    # Rescore with positioning
+    index2 = Y * 9999
+    index2[Y == 0] = numpy.argsort(yhat)  # rescore per actual position.
+    index[3] = index2
+    X, _, _ = gen_features(*index, tf=tf, u=u)
+    X = xgb.DMatrix(X[Y == 0, :], Y[Y == 0])
+    yhat2 = r.predict(X)
 
-    scores, lines = list(zip(*sorted(zip(-yhat, orig))))
+    for i, _ in enumerate(yhat):
+        orig2[i] = orig2[i].replace("<div", f"<div data-score1={yhat2[i]} ", 1)
+        # Five percent greedy-epsilon bandit
+        if numpy.random.uniform() < .05:
+            yhat2[i] = numpy.random.choice(yhat2)
+            orig2[i] = orig2[i].replace("<div", "<div data-bandit=1 ", 1)
+
+
+    scores, lines = list(zip(*sorted(zip(-(yhat + yhat2) / 2, orig2))))
 
 
     body, _, = fetch_s3(s3_client, "index.html")
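The added block above scores every unclicked link, occasionally explores, then rescores with the first-pass ranking fed back in as a position feature and orders the page by the average of the two passes. A rough, self-contained sketch of that flow with made-up data (the real second pass re-runs `gen_features`; a random stand-in is used here, and the variable names only mirror the diff):

```python
import numpy

yhat = numpy.random.rand(8)                       # first-pass model scores
orig2 = [f"<div>story {k}</div>" for k in range(8)]

for k, _ in enumerate(yhat):
    orig2[k] = orig2[k].replace("<div", f"<div data-score0={yhat[k]} ", 1)
    if numpy.random.uniform() < .05:              # five percent greedy-epsilon bandit
        yhat[k] = numpy.random.choice(yhat)       # explore: reuse a random existing score

# The second pass would rebuild features with numpy.argsort(yhat) as the
# position column; a random stand-in keeps this sketch runnable.
yhat2 = numpy.random.rand(8)

# Final order: descending by the mean of the two passes.
scores, lines = zip(*sorted(zip(-(yhat + yhat2) / 2, orig2)))
```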
@@ -315,7 +337,9 @@ def score_index(model_key="model.pickle"):
     yesterdays_href = re.search('(?<=<a href=")[0-9a-f]*[.]html(?=">yesterday\'s news</a>)', body).group(0)
 
     new_index = neal_news.build_new_index(lines, d, yesterdays_href)
-    neal_news.update_index(s3_client, new_index)
+
+    if save:
+        neal_news.update_index(s3_client, new_index)
 
 
 