 
 from maro.rl.model import DiscretePolicyNet, FullyConnected, VNet
 from maro.rl.policy import DiscretePolicyGradient
-from maro.rl.training.algorithms import (
-    DiscreteActorCriticTrainer, DiscreteActorCriticParams, DiscretePPOParams, DiscretePPOTrainer,
-)
+from maro.rl.training.algorithms import DiscreteActorCriticTrainer, DiscreteActorCriticParams
 
 actor_net_conf = {
     "hidden_dims": [256, 128, 64],
     "activation": torch.nn.Tanh,
     "softmax": True,
     "batch_norm": False,
-    "head": True
+    "head": True,
 }
 critic_net_conf = {
     "hidden_dims": [256, 128, 64],
     "output_dim": 1,
     "activation": torch.nn.LeakyReLU,
     "softmax": False,
     "batch_norm": True,
-    "head": True
+    "head": True,
 }
 actor_learning_rate = 0.001
 critic_learning_rate = 0.001
@@ -64,7 +62,7 @@ def apply_gradients(self, grad: Dict[str, torch.Tensor]) -> None:
     def get_state(self) -> dict:
         return {
             "network": self.state_dict(),
-            "optim": self._optim.state_dict()
+            "optim": self._optim.state_dict(),
         }
 
     def set_state(self, net_state: dict) -> None:
@@ -99,7 +97,7 @@ def apply_gradients(self, grad: Dict[str, torch.Tensor]) -> None:
     def get_state(self) -> dict:
         return {
             "network": self.state_dict(),
-            "optim": self._optim.state_dict()
+            "optim": self._optim.state_dict(),
         }
 
     def set_state(self, net_state: dict) -> None:
@@ -121,7 +119,6 @@ def get_ac(state_dim: int, name: str) -> DiscreteActorCriticTrainer:
     return DiscreteActorCriticTrainer(
         name=name,
         params=DiscreteActorCriticParams(
-            device="cpu",
             get_v_critic_net_func=lambda: MyCriticNet(state_dim),
             reward_discount=.0,
             grad_iters=10,
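
Note: the get_state/set_state pair touched in this diff bundles the network weights and the optimizer state into a single dict, so checkpointing is one save/load call. A minimal sketch, assuming only the method signatures shown in the hunks above (the state_dim value and the file name are illustrative, not from the source):

# Persist the combined {"network": ..., "optim": ...} dict returned by get_state(),
# then restore it into a fresh instance with set_state().
critic = MyCriticNet(state_dim=10)
torch.save(critic.get_state(), "critic_checkpoint.pt")

restored = MyCriticNet(state_dim=10)
restored.set_state(torch.load("critic_checkpoint.pt"))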