|
24 | 24 |
|
25 | 25 |
|
# %% ../../../nbs/30_agents/42_DP_agents/12_TS_agent.ipynb 4
class TSPolicy:
    """
    Minimal Thompson-Sampling agent for the linear-demand model

        D = x'alpha + p * (x'beta) + noise

    * Gaussian prior  theta ~ N(0, lam^-1 * I)
    * Incremental ridge update (Sherman-Morrison) of
      M_t = lam*I + sum_s z_s z_s'  and  theta_hat_t = M_t^-1 q_t
    * One Gaussian posterior draw per round, priced by p* = -a / (2b)
    """

    # ---------- ctor ---------------------------------------------------------
    def __init__(self,
                 lam: float,
                 environment_info: "MDPInfo",
                 price_function,            # takes (x, a, b) -> price
                 actionprocessors=None,
                 warm_start_prices=None,
                 init_scale=None):
        """
        Parameters
        ----------
        lam : float
            Ridge / prior precision lambda (0 disables regularisation).
        environment_info : MDPInfo
            Supplies the feature dimension and the action-space bounds.
        price_function : callable
            price_function(x, a, b) returns the quadratic-optimal price
            (usually -a / (2*b)).
        actionprocessors : list, optional
            Post-processors applied to the raw price; a ClipAction to the
            action-space bounds is always appended last.
        warm_start_prices : iterable, optional
            Deterministic prices played for the first k rounds; may be empty.
        init_scale : float, optional
            Exploration std-multiplier; defaults to sqrt(d) / 25.
        """
        d_feat = environment_info.observation_space['features'].shape[0]
        self.d_param = 2 * d_feat
        self.lam = lam
        # Remember the user's choice so update_task() can recompute the default
        # for a new feature dimension.
        self._init_scale = init_scale
        # `is not None` (not `or`) so an explicit init_scale=0.0 is honoured.
        self.scale = init_scale if init_scale is not None else np.sqrt(d_feat) / 25.0

        # Incremental posterior: M_inv tracks (lam*I + sum z z')^-1, q = sum z*D.
        self.M_inv = np.eye(self.d_param) / lam if lam else np.eye(self.d_param)
        self.q = np.zeros(self.d_param)

        # Current point estimate theta_hat = (alpha, beta).
        self.alpha = np.zeros(d_feat)
        self.beta = np.zeros(d_feat)

        # misc
        self.env_info = environment_info
        self.environment_info = environment_info   # alias kept in sync with update_task()
        self.price_fn = price_function
        self.t = 0
        self.warm_p = np.asarray(warm_start_prices) if warm_start_prices is not None else np.empty(0)

        # Processors: copy the caller's list so appending the clip does not
        # mutate their object; the clip to the feasible price range is last.
        self.actionprocessors = list(actionprocessors) if actionprocessors else []
        self.actionprocessors.append(
            ClipAction(environment_info.action_space.low,
                       environment_info.action_space.high)
        )

    # ---------- draw_action ---------------------------------------------------
    def draw_action(self, observation):
        """Return one price: a warm-start price if scheduled, else a Thompson draw."""
        x = observation['features']

        if self.t < self.warm_p.size:
            # Deterministic warm-start phase.
            p = self.warm_p[self.t]
        else:
            # One posterior sample theta ~ N(theta_hat, scale^2 * M^-1).
            L = np.linalg.cholesky(self.M_inv)
            noise = np.random.randn(self.d_param)
            theta = self.M_inv @ self.q + self.scale * (L @ noise)

            a = x @ theta[:x.size]
            b = x @ theta[x.size:]
            # Keep the demand slope negative and the intercept positive so the
            # quadratic revenue is concave and p* = -a/(2b) stays finite/positive.
            b = np.minimum(np.array([-0.01]), b)
            a = np.maximum(np.array([0.01]), a)

            p = self.price_fn(np.ones_like(a), a, b)   # usually -a / (2b)

        for proc in self.actionprocessors:
            p = proc(p)
        return np.array(p, dtype=np.float32)

    # ---------- fit -----------------------------------------------------------
    def fit(self, X, D, price):
        """Rank-one (Sherman-Morrison) posterior update with sample (x, p, D)."""
        z = np.concatenate([X, X * price])          # regressor, length 2d
        Mz = self.M_inv @ z
        self.M_inv -= np.outer(Mz, Mz) / (1.0 + z @ Mz)
        # Re-symmetrise: round-off drift would otherwise eventually break the
        # Cholesky factorisation in draw_action (no-op in exact arithmetic).
        self.M_inv = 0.5 * (self.M_inv + self.M_inv.T)
        self.q += z * float(D)

        theta_hat = self.M_inv @ self.q
        split = theta_hat.size // 2
        self.alpha, self.beta = theta_hat[:split], theta_hat[split:]

        self.t += 1

    # ---------- helpers -------------------------------------------------------
    def reset(self):
        """No per-episode state to clear; the posterior persists across episodes."""
        pass

    def update_task(self, env):
        """Start fresh on a new MDP / feature dimension."""
        self.environment_info = env.mdp_info
        self.env_info = env.mdp_info               # keep both aliases in sync
        d_feat = self.environment_info.observation_space['features'].shape[0]
        # BUG FIX: d_param must track the new dimension, otherwise
        # draw_action's randn(self.d_param) mismatches the rebuilt M_inv.
        self.d_param = 2 * d_feat
        self.d = self.d_param                      # legacy alias
        # Recompute the default exploration scale unless the user fixed one.
        self.scale = self._init_scale if self._init_scale is not None else np.sqrt(d_feat) / 25.0
        self.M_inv = np.eye(self.d_param) / self.lam if self.lam else np.eye(self.d_param)
        self.q = np.zeros(self.d_param)
        self.alpha = np.zeros(d_feat)
        self.beta = np.zeros(d_feat)
        self.actionprocessors[-1] = ClipAction(self.environment_info.action_space.low, self.environment_info.action_space.high)
        self.t = 0
132 | 128 |
|
133 | 129 | # %% ../../../nbs/30_agents/42_DP_agents/12_TS_agent.ipynb 5 |
134 | 130 | class TSCoreAgent(Agent): |
|
0 commit comments