
Commit 547420a

Updated UCB to reflect the QLE optimization
1 parent a9fe00f commit 547420a

File tree

3 files changed: +159 −130 lines


ddopai/_modidx.py

Lines changed: 2 additions & 0 deletions
@@ -346,6 +346,8 @@
                     'ddopai/agents/dynamic_pricing/UCB.py'),
 'ddopai.agents.dynamic_pricing.UCB.UCBPolicy.fit': ( '30_agents/42_DP_agents/ucb_agent.html#ucbpolicy.fit',
                     'ddopai/agents/dynamic_pricing/UCB.py'),
+'ddopai.agents.dynamic_pricing.UCB.UCBPolicy.max_rev': ( '30_agents/42_DP_agents/ucb_agent.html#ucbpolicy.max_rev',
+                    'ddopai/agents/dynamic_pricing/UCB.py'),
 'ddopai.agents.dynamic_pricing.UCB.UCBPolicy.parameter_update': ( '30_agents/42_DP_agents/ucb_agent.html#ucbpolicy.parameter_update',
                     'ddopai/agents/dynamic_pricing/UCB.py'),
 'ddopai.agents.dynamic_pricing.UCB.UCBPolicy.reset': ( '30_agents/42_DP_agents/ucb_agent.html#ucbpolicy.reset',

ddopai/agents/dynamic_pricing/UCB.py

Lines changed: 78 additions & 65 deletions
@@ -11,36 +11,34 @@
 from abc import ABC, abstractmethod
 from typing import Union, Optional, List
 import numpy as np
-import joblib
 import os
-import statsmodels.api as sm
 from .utils import GLMLink
 from ...envs.base import BaseEnvironment
 from .mushroom_rl import PricingMushroomBaseAgent
 from mushroom_rl.core import Agent
 from ...utils import MDPInfo
 from ..obsprocessors import FlattenTimeDimNumpy
 from ...envs.actionprocessors import ClipAction
-
+from scipy.optimize import minimize
 
 # %% ../../../nbs/30_agents/42_DP_agents/13_UCB_agent.ipynb 4
-class UCBPolicy():
+class UCBPolicy:
     def __init__(self,
                  lam: float,
                  reg: float,
                  environment_info: MDPInfo,
-                 obsprocessors: Optional[List[object]] = None,
-                 actionprocessors: Optional[List[object]] = None,
-                 agent_name: str | None = None,
-                 ex_prices: np.ndarray | None = None,
-                 alpha: np.ndarray | None = None,
-                 beta: np.ndarray | None = None,
-                 price_function = None,
-                 g = None,
-                 ):
-        assert type(alpha) == type(beta), "alpha and beta must be of the same type"
+                 obsprocessors=None,
+                 actionprocessors=None,
+                 agent_name=None,
+                 ex_prices=None,
+                 alpha=None,
+                 beta=None,
+                 price_function=None,
+                 g=None):
+
         if alpha is None:
             alpha = np.zeros(environment_info.observation_space['features'].shape[0])
+        if beta is None:
             beta = np.zeros(environment_info.observation_space['features'].shape[0])
         if isinstance(ex_prices, list):
             ex_prices = np.array(ex_prices)
@@ -50,85 +48,100 @@ def __init__(self,
         self.ex_prices = ex_prices
         self.alpha = alpha
         self.beta = beta
-        self.actionprocessors = actionprocessors
-        self.price_function = price_function # Needs to return an np array
+        self.actionprocessors = actionprocessors or []
+        self.obsprocessors = obsprocessors or []
+        self.price_function = price_function
+        self.g = g
         self.lam = lam
         self.reg = reg
-        self.g = g
         self.t = 0
         self.X = np.empty((0, environment_info.observation_space['features'].shape[0] * 2))
         self.Y = np.empty((0, 1))
         self.mode = "train"
         self.actionprocessors.append(ClipAction(environment_info.action_space.low, environment_info.action_space.high))
 
-    def draw_action(self, observation: np.ndarray):
+    def draw_action(self, observation):
+        x = observation['features']
         if self.t in [0, 1]:
-            prices = self.ex_prices[self.t]
+            price = self.ex_prices[self.t]
         else:
-            X = observation['features']
-            M = self.compute_uncertainty_M(X)
+            M = self.compute_uncertainty_M(x)
             samples = self.sample_from_confidence_region(np.concatenate([self.alpha, self.beta]), M)
-            alpha, beta = self.max_rev(samples, X)
-            price = self.price_function(X, alpha, beta)
-
+            alpha, beta = self.max_rev(samples, x)
+            price = self.price_function(x, alpha, beta)
         for processor in self.actionprocessors:
             price = processor(price)
-
         return np.array(price, dtype=np.float32)
-
+
+    def fit(self, X, Y, action):
+        self.t += 1
+        Z = np.concatenate([X, X * action])
+        self.X = np.vstack([self.X, Z])
+        self.Y = np.vstack([self.Y, Y])
+        self.parameter_update()
+
+    def parameter_update(self):
+        if self.X.shape[0] < 2:
+            return
+        def loss(theta):
+            preds = self.g.g(self.X @ theta)
+            errors = preds - self.Y.flatten()
+            weights = 1 / self.g.v(preds)
+            return np.sum((errors**2) * weights) + self.reg * np.linalg.norm(theta)**2
+
+        theta0 = np.concatenate([self.alpha, self.beta])
+        res = minimize(loss, theta0, method='L-BFGS-B')
+        if res.success:
+            theta_hat = res.x
+            d = self.environment_info.observation_space['features'].shape[0]
+            self.alpha = theta_hat[:d]
+            self.beta = theta_hat[d:]
+
     def sample_design_matrix(self):
-        I = np.identity(2*self.environment_info.observation_space['features'].shape[1])
-        I_lamdba = self.lam * I
+        d = self.environment_info.observation_space['features'].shape[0]
+        I = self.lam * np.identity(2 * d)
         if self.X.shape[0] == 0:
-            return I_lamdba
-        matrix = np.sum([np.outer(x, x.T) for x in self.X], axis=0)
-        return I_lamdba + matrix
-
-    def sample_from_confidence_region(self, theta_hat, M, N=50):
-        L = np.linalg.cholesky(np.linalg.inv(M))
-        u = np.random.randn(len(theta_hat), N)
-        u /= np.linalg.norm(u, axis=0)
-        samples = theta_hat[:, np.newaxis] + 1/self.environment_info.observation_space['features'].shape[1] * L @ u
-        return samples.T
-
+            return I
+        return I + self.X.T @ self.X
+
     def compute_uncertainty_M(self, x_t):
         M = self.sample_design_matrix()
+        d = x_t.shape[0]
         block_matrix = np.block([
             [x_t, np.zeros_like(x_t)],
             [np.zeros_like(x_t), x_t]
         ])
-        M_inverse = np.linalg.inv(M)
+        return np.linalg.inv(block_matrix @ np.linalg.inv(M) @ block_matrix.T)
+
+    def sample_from_confidence_region(self, theta_hat, M, N=50):
+        L = np.linalg.cholesky(np.linalg.inv(M))
+        u = np.random.randn(len(theta_hat), N)
+        u /= np.linalg.norm(u, axis=0)
+        return (theta_hat[:, np.newaxis] + (1 / self.environment_info.observation_space['features'].shape[0]) * (L @ u)).T
+
+    def max_rev(self, samples, x):
+        max_val = -np.inf
+        best_alpha, best_beta = None, None
+        for theta in samples:
+            alpha = theta[:x.shape[0]]
+            beta = theta[x.shape[0]:]
+            price = self.price_function(x, alpha, beta)
+            rev = price * self.g.g(np.dot(x, alpha) + price * np.dot(x, beta))
+            if rev > max_val:
+                max_val = rev
+                best_alpha, best_beta = alpha, beta
+        return best_alpha, best_beta
 
-        projected_matrix = block_matrix @ M_inverse @ block_matrix.T
-        projected_matrix_inverse = np.linalg.inv(projected_matrix)
-        return projected_matrix_inverse
-
-    def fit(self, X, Y, action):
-        assert self.mode == "train"
-        self.t += 1
-        X = np.concatenate([X, X * action])
-        self.X = np.vstack([self.X, X])
-        self.Y = np.vstack([self.Y, Y])
-        self.parameter_update()
-
-    def parameter_update(self):
-        if self.X.shape[0] < 2:
-            return
-        model = sm.GLM(self.Y, self.X, family=sm.families.Binomial())
-        results = model.fit()
-        self.alpha = results.params[:self.environment_info.observation_space['features'].shape[0]]
-        self.beta = results.params[self.environment_info.observation_space['features'].shape[0]:]
-
     def update_task(self, env):
         self.environment_info = env.mdp_info
-        self.X = np.empty((0, self.environment_info.observation_space['features'].shape[0] * 2))
+        d = self.environment_info.observation_space['features'].shape[0]
+        self.X = np.empty((0, 2 * d))
         self.Y = np.empty((0, 1))
         self.actionprocessors[-1] = ClipAction(self.environment_info.action_space.low, self.environment_info.action_space.high)
-        self.M = [[np.power(x,2)+i for x in range(0, int(np.sqrt(self.environment_info.horizon)))] for i in range(0, 2)]
-        self.t = 0
-
+        self.t = 0
+
     def reset(self):
-        return
+        pass
 
 
 # %% ../../../nbs/30_agents/42_DP_agents/13_UCB_agent.ipynb 5
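
The substantive change is in parameter_update: the statsmodels Binomial GLM fit is replaced by a ridge-regularized quasi-likelihood estimation (QLE) objective minimized with scipy's L-BFGS-B. Below is a minimal standalone sketch of that objective, assuming a logistic link; the GLMLink object is assumed to expose a mean function as g.g and a variance function as g.v (as the diff's calls suggest), and the np.clip guard against near-zero variances is an addition, not part of the diff.

import numpy as np
from scipy.optimize import minimize

class LogisticLink:
    """Hypothetical stand-in for ddopai's GLMLink: a logistic link with
    mean function g and variance function v, mirroring the g.g / g.v
    calls inside the new UCBPolicy.parameter_update."""
    def g(self, z):
        return 1.0 / (1.0 + np.exp(-z))   # mean: mu = g(z)
    def v(self, mu):
        return mu * (1.0 - mu)            # variance function v(mu)

def qle_fit(X, Y, reg=0.1, link=LogisticLink()):
    # Ridge-regularized quasi-likelihood loss, as in the new parameter_update:
    # sum_t (g(z_t' theta) - y_t)^2 / v(g(z_t' theta)) + reg * ||theta||^2
    def loss(theta):
        preds = link.g(X @ theta)
        errors = preds - Y.flatten()
        weights = 1.0 / np.clip(link.v(preds), 1e-8, None)  # guard: not in the diff
        return np.sum(errors**2 * weights) + reg * np.linalg.norm(theta)**2
    theta0 = np.zeros(X.shape[1])
    return minimize(loss, theta0, method='L-BFGS-B').x

# Smoke test on synthetic binary demand data.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
theta_true = np.array([0.5, -1.0, 0.8, 0.2])
Y = rng.binomial(1, 1.0 / (1.0 + np.exp(-X @ theta_true))).reshape(-1, 1)
print(qle_fit(X, Y))   # should land near theta_true

With v held at 1 this reduces to ridge-penalized nonlinear least squares; the 1/v weights simply downweight observations whose predicted mean carries high variance, which is what makes it a quasi-likelihood rather than a plain least-squares fit.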

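The action step is restructured around the same parameters: compute_uncertainty_M projects the regularized design matrix onto the current context, sample_from_confidence_region draws candidate parameter vectors on the resulting ellipsoid via a Cholesky factor, and the new max_rev keeps the candidate promising the highest revenue. A rough standalone sketch of that pipeline follows; the clipped linear price rule and the sigmoid mean function are hypothetical stand-ins for the price_function and g that the caller supplies to UCBPolicy.

import numpy as np

def sample_confidence_region(theta_hat, M, N=50, scale=1.0):
    # Map unit directions through the Cholesky factor of M^{-1},
    # as in UCBPolicy.sample_from_confidence_region.
    L = np.linalg.cholesky(np.linalg.inv(M))
    u = np.random.randn(len(theta_hat), N)
    u /= np.linalg.norm(u, axis=0)           # points on the unit sphere
    return (theta_hat[:, None] + scale * (L @ u)).T

def max_rev(samples, x, price_function, g):
    # Optimistic step: among sampled (alpha, beta) pairs, keep the one
    # whose implied price promises the highest expected revenue.
    best_val, best_pair = -np.inf, None
    d = x.shape[0]
    for theta in samples:
        alpha, beta = theta[:d], theta[d:]
        p = price_function(x, alpha, beta)
        rev = p * g(x @ alpha + p * (x @ beta))
        if rev > best_val:
            best_val, best_pair = rev, (alpha, beta)
    return best_pair

# Illustrative inputs (all hypothetical): a context x, the ridge-only
# design matrix lam * I (no data yet), and a clipped linear price rule.
d = 3
x = np.array([1.0, 0.4, -0.2])
M = 0.5 * np.identity(2 * d)
theta_hat = np.zeros(2 * d)
price_rule = lambda x, a, b: float(np.clip(-(x @ a) / (2 * (x @ b) + 1e-8), 0.0, 10.0))
sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

samples = sample_confidence_region(theta_hat, M)
alpha, beta = max_rev(samples, x, price_rule, sigmoid)
print(price_rule(x, alpha, beta))

Because the design matrix grows as lam * I + X'X (see the rewritten sample_design_matrix), the sampled ellipsoid shrinks along well-explored directions as fit accumulates data, which is what narrows the price exploration over time.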