import os
from abc import ABC, abstractmethod
from typing import Union, Optional, List

import joblib
import numpy as np
import statsmodels.api as sm
from mushroom_rl.core import Agent
from scipy.optimize import minimize

from .utils import GLMLink
from .mushroom_rl import PricingMushroomBaseAgent
from ..obsprocessors import FlattenTimeDimNumpy
from ...envs.base import BaseEnvironment
from ...envs.actionprocessors import ClipAction
from ...utils import MDPInfo

# %% ../../../nbs/30_agents/42_DP_agents/13_UCB_agent.ipynb 4
class UCBPolicy:
    """UCB-style pricing policy for a GLM demand model.

    Demand is modelled as g(x·alpha + p * x·beta), where ``x`` is the feature
    vector of the current observation, ``p`` the posted price, and ``g`` a link
    object exposing ``g.g`` (mean function) and ``g.v`` (variance function).
    The policy keeps a regularized design matrix over the augmented features
    ``[x, x * p]``, samples parameter vectors from an ellipsoidal confidence
    region around the current estimate, and posts the price that maximizes
    optimistic revenue over those samples (optimism in the face of
    uncertainty). The first two rounds post externally supplied exploration
    prices ``ex_prices``.
    """

    def __init__(self,
                 lam: float,                       # ridge weight on the design matrix
                 reg: float,                       # L2 penalty in the parameter fit
                 environment_info: MDPInfo,        # provides feature dim and action bounds
                 obsprocessors: Optional[List[object]] = None,
                 actionprocessors: Optional[List[object]] = None,
                 agent_name: Optional[str] = None,
                 ex_prices=None,                   # prices for the first two (exploration) rounds
                 alpha=None,                       # initial intercept coefficients (defaults to zeros)
                 beta=None,                        # initial price-sensitivity coefficients (defaults to zeros)
                 price_function=None,              # (x, alpha, beta) -> price; must return an np array
                 g=None):                          # GLM link object with .g (mean) and .v (variance)
        d = environment_info.observation_space['features'].shape[0]
        if alpha is None:
            alpha = np.zeros(d)
        if beta is None:
            beta = np.zeros(d)
        if isinstance(ex_prices, list):
            ex_prices = np.array(ex_prices)
        # NOTE(review): a diff hunk boundary hides a few original lines here;
        # environment_info and agent_name are clearly stored (both are read
        # elsewhere in the class) — confirm against file history.
        self.environment_info = environment_info
        self.agent_name = agent_name
        self.ex_prices = ex_prices
        self.alpha = alpha
        self.beta = beta
        self.actionprocessors = actionprocessors or []
        self.obsprocessors = obsprocessors or []
        self.price_function = price_function
        self.g = g
        self.lam = lam
        self.reg = reg
        self.t = 0                                # number of fitted rounds so far
        # History of augmented features [x, x*p] and observed demands.
        self.X = np.empty((0, 2 * d))
        self.Y = np.empty((0, 1))
        self.mode = "train"
        # Always clip the posted price into the environment's action bounds.
        self.actionprocessors.append(
            ClipAction(environment_info.action_space.low,
                       environment_info.action_space.high))

    def draw_action(self, observation):
        """Return the price for ``observation`` as a float32 np array.

        Rounds 0 and 1 post the fixed exploration prices; afterwards the
        price is chosen by optimistic revenue maximization over samples from
        the confidence region, then passed through all action processors.
        """
        x = observation['features']
        if self.t in [0, 1]:
            price = self.ex_prices[self.t]
        else:
            M = self.compute_uncertainty_M(x)
            samples = self.sample_from_confidence_region(
                np.concatenate([self.alpha, self.beta]), M)
            alpha, beta = self.max_rev(samples, x)
            price = self.price_function(x, alpha, beta)
        for processor in self.actionprocessors:
            price = processor(price)
        return np.array(price, dtype=np.float32)

    def fit(self, X, Y, action):
        """Record one (features, demand, price) transition and refit parameters."""
        self.t += 1
        # Augmented regressor for the GLM: [x, x * price].
        Z = np.concatenate([X, X * action])
        self.X = np.vstack([self.X, Z])
        self.Y = np.vstack([self.Y, Y])
        self.parameter_update()

    def parameter_update(self):
        """Refit (alpha, beta) by penalized variance-weighted least squares.

        Minimizes sum_i w_i * (g(z_i·theta) - y_i)^2 + reg * ||theta||^2 with
        weights w_i = 1 / v(g(z_i·theta)). No-op until two observations exist;
        on optimizer failure the previous estimate is kept.
        """
        if self.X.shape[0] < 2:
            return

        def loss(theta):
            preds = self.g.g(self.X @ theta)
            errors = preds - self.Y.flatten()
            weights = 1 / self.g.v(preds)
            return np.sum((errors ** 2) * weights) + self.reg * np.linalg.norm(theta) ** 2

        theta0 = np.concatenate([self.alpha, self.beta])
        res = minimize(loss, theta0, method='L-BFGS-B')
        if res.success:
            theta_hat = res.x
            d = self.environment_info.observation_space['features'].shape[0]
            self.alpha = theta_hat[:d]
            self.beta = theta_hat[d:]

    def sample_design_matrix(self):
        """Return the ridge-regularized design matrix lam*I + X^T X (2d x 2d)."""
        d = self.environment_info.observation_space['features'].shape[0]
        I = self.lam * np.identity(2 * d)
        if self.X.shape[0] == 0:
            return I
        return I + self.X.T @ self.X

    def compute_uncertainty_M(self, x_t):
        """Project the design-matrix uncertainty onto the current features.

        Builds the 2 x 2d block matrix B = [[x_t, 0], [0, x_t]] and returns
        (B M^{-1} B^T)^{-1}, the precision of (x·alpha, x·beta) at x_t.
        """
        M = self.sample_design_matrix()
        block_matrix = np.block([
            [x_t, np.zeros_like(x_t)],
            [np.zeros_like(x_t), x_t]
        ])
        return np.linalg.inv(block_matrix @ np.linalg.inv(M) @ block_matrix.T)

    def sample_from_confidence_region(self, theta_hat, M, N=50):
        """Draw N points on an ellipsoid around ``theta_hat``.

        Directions are uniform on the unit sphere, scaled by the Cholesky
        factor of M^{-1} and by 1/d (d = feature dimension). Returns an
        (N, len(theta_hat)) array. Non-deterministic (uses np.random).
        """
        L = np.linalg.cholesky(np.linalg.inv(M))
        u = np.random.randn(len(theta_hat), N)
        u /= np.linalg.norm(u, axis=0)                 # unit directions, one per column
        scale = 1 / self.environment_info.observation_space['features'].shape[0]
        return (theta_hat[:, np.newaxis] + scale * (L @ u)).T

    def max_rev(self, samples, x):
        """Return the (alpha, beta) from ``samples`` with maximal revenue at x.

        Revenue of a candidate is price * g(x·alpha + price * x·beta) with
        price = price_function(x, alpha, beta). Returns (None, None) only if
        ``samples`` is empty or every revenue is -inf.
        """
        max_val = -np.inf
        best_alpha, best_beta = None, None
        for theta in samples:
            alpha = theta[:x.shape[0]]
            beta = theta[x.shape[0]:]
            price = self.price_function(x, alpha, beta)
            rev = price * self.g.g(np.dot(x, alpha) + price * np.dot(x, beta))
            if rev > max_val:
                max_val = rev
                best_alpha, best_beta = alpha, beta
        return best_alpha, best_beta

    def update_task(self, env):
        """Reset history and action bounds for a new environment/task."""
        self.environment_info = env.mdp_info
        d = self.environment_info.observation_space['features'].shape[0]
        self.X = np.empty((0, 2 * d))
        self.Y = np.empty((0, 1))
        # The clip processor was appended last in __init__; swap in new bounds.
        self.actionprocessors[-1] = ClipAction(self.environment_info.action_space.low,
                                               self.environment_info.action_space.high)
        self.t = 0

    def reset(self):
        """No per-episode state to reset."""
        pass


# %% ../../../nbs/30_agents/42_DP_agents/13_UCB_agent.ipynb 5