vd2: swap+pool

junzhezhang · junzhezhang · commit 2fb3f027c0e4 · 2018-09-19T11:10:52.000+08:00
new class of pool: SwapPool
important APIs: PoolOpt(), Malloc(), Free()
PoolOpt() takes in M/F sequences including those induced by swapping
cross-iteration variables and last iteration case solved.

record down MF after swap done, for one iteration
diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
@@ -130,7 +130,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         dev = device.get_default_device()
     else:
         print('Using GPU')
-        dev = device.create_cuda_gpu_on(1)
+        dev = device.create_cuda_gpu_on(0)
 
     net.to_device(dev)
     opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
@@ -153,7 +153,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         fileTimeLog.write('Epoch %d: ' % epoch)
         fileTimeLog.write(str(int(round(time.time()*1000))))
         fileTimeLog.write('\n')
-        for b in range(15): #num_train_batch):
+        for b in range(20): #num_train_batch):
             print ("start of iteration %d: " %b)
             #time.sleep(1)
             fileTimeLog.write('iteration %d: ' % b)
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
@@ -365,6 +365,7 @@ class SwapGPU : public Device {
   //vec_block
   vector<string>vec_block; //iteration 0-3
   vector<string>vec_block_fresh; //iteration 4 5 6
+  vector<string>vec_block_mf; //itr 8 9 10
   vector<double>global_load; // from begining
   vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
   vector<onePieceMsg>vec_run;
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
@@ -50,6 +50,8 @@ class DeviceMemPool {
   virtual void Malloc(void** ptr, const size_t size)  = 0;
   virtual void Free(void* ptr)  = 0;
   virtual void Append(string blockInfo) = 0;
+
+  virtual void PoolOpt(vector<string> &vec_mf) = 0;
   
   virtual void SwapOut(void* data_) = 0;
   virtual void SwapIn(void* data_) = 0;
@@ -74,7 +76,9 @@ class CnMemPool : public DeviceMemPool {
 
   void Malloc(void** ptr, const size_t size);
   void Free(void* ptr);
-    void Append(string blockInfo){}
+  void Append(string blockInfo){}
+
+  void PoolOpt(vector<string> &vec_mf) override {}
     
   void SwapOut(void* data_) override {}
   void SwapIn(void* data_) override {}
@@ -102,7 +106,9 @@ class CudaMemPool : public DeviceMemPool {
  public:
   void Malloc(void** ptr, const size_t size) override;
   void Free(void* ptr) override;
-void Append(string blockInfo){}
+  void Append(string blockInfo){}
+
+  void PoolOpt(vector<string> &vec_mf) override {}
 
   void SwapOut(void* data_) override {}
   void SwapIn(void* data_) override {}
@@ -134,9 +140,11 @@ class SmartMemPool: public DeviceMemPool {
     void getMaxLoad(void);
     std::pair<size_t, size_t> GetMemUsage() override;
     void Append(string blockInfo);
+
+    void PoolOpt(vector<string> &vec_mf) override {}
     
-  void SwapOut(void* data_) override {}
-  void SwapIn(void* data_) override {}
+    void SwapOut(void* data_) override {}
+    void SwapIn(void* data_) override {}
 protected:
     void Init();
 private:
@@ -196,19 +204,22 @@ struct SwapMeta{
     void* d_ptr; //not used for
 };
 
-class Swap : public DeviceMemPool {
+class SwapPool : public DeviceMemPool {
 public:
-    Swap(const MemPoolConf &conf); //constructor
+    SwapPool(const MemPoolConf &conf); //constructor
     //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
     void Malloc(void** ptr, const size_t size);
     void Free(void* ptr);
-    ~Swap();
+    ~SwapPool();
     void getMaxLoad(void);
     std::pair<size_t, size_t> GetMemUsage() override;
     void Append(string blockInfo);
     
     void SwapOut(void* data_);
     void SwapIn(void* data_);
+
+    //PoolOpt() construct pool based on MF info after Swap constructed.
+    void PoolOpt(vector<string> &vec_mf);
 protected:
     void Init();
 private:
@@ -219,8 +230,15 @@ class Swap : public DeviceMemPool {
     std::mutex mtx_; 
     vector<string> vec_block;
     size_t swapLimit = 1<<23; //8MB
-    map<void*,swapLookUpElement>Table_id2LookUpElement; //old TODO(junzhe) remove
-    map<void*,pair<SwapMeta,SwapMeta>>Table_Meta;
+    int poolFlag = 0;
+    int pc = 0;
+    int maxLen_mf = 0;
+    void* ptrPool = nullptr;
+    map<void*,int>Table_p2r; //ptr for arrival idx, for look up Table during free
+    map<int,lookUpElement>Table_r2v; //r-> vertex
+    vector<pair<int,lookUpElement>>Vec_r2Ver; //Table_r2Ver No need anymore, replaced by Table_r2v TODO(junzhe)
+    // map<void*,swapLookUpElement>Table_id2LookUpElement; //old TODO(junzhe) remove
+    // map<void*,pair<SwapMeta,SwapMeta>>Table_Meta;
 };
 
 #endif
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
@@ -48,7 +48,7 @@ const int kNumCudaStream = 1;
 CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
   MemPoolConf conf;
   conf.add_device(id);
-  pool_ = std::make_shared<Swap>(conf);
+  pool_ = std::make_shared<CnMemPool>(conf);
   Setup();
 }
 
diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc
@@ -128,7 +128,7 @@ Platform::CreateCudaGPUsOn(const vector<int> &devices, size_t init_size) {
     conf.add_device(device);
     CHECK_LE(bytes, Platform::GetGPUMemSize(device).first);
   }
-  auto pool = std::make_shared<CnMemPool>(conf);
+  auto pool = std::make_shared<SwapPool>(conf);
 
   vector<shared_ptr<Device> > ret;
   for (auto device : devices) {
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
@@ -42,7 +42,6 @@ const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
 
 ///functions to be used
 ///Section for structs and respective sorting function:
-// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm
 
 
 
@@ -924,7 +923,8 @@ SwapGPU::SwapGPU(int id) : Device(id, kNumCudaStream) {
 
   MemPoolConf conf;
   conf.add_device(id);
-  pool_ = std::make_shared<Swap>(conf);
+  //TODO(junzhe) note that it has been <Swap> for building SwapGPU, which doesnt matter.
+  pool_ = std::make_shared<SwapPool>(conf); 
   Setup();
 
 }
@@ -987,6 +987,26 @@ void* SwapGPU::Malloc(int size) {
   if (size > 0) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Malloc((void**)&ptr, size);
+
+    ///append vec_block_mf
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+      && ((gc - maxLen) >= three_more_globeCounter)){
+      string tempStr1 ="Malloc ";
+      stringstream strm2;
+      strm2<<ptr;
+      string tempStr2 = strm2.str();
+      stringstream strm3;
+      strm3<<size;
+      string tempStr3 = strm3.str();
+      string temp = tempStr1+tempStr2+" "+tempStr3;
+      vec_block_mf.push_back(temp);
+    }
+    //record mf semantics after swap plan done
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"Malloc "<<ptr<<" "<<size;
+      file_mf_one_itr<<endl;
+    }
     // TODO(wangwei) remove the memset.
     CUDA_CHECK(cudaMemset(ptr, 0, size));
   }
@@ -1000,6 +1020,21 @@ void SwapGPU::Free(void* ptr) {
   if (ptr != nullptr) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Free(ptr);
+    ///append vec_block_mf
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+      && ((gc - maxLen) >= three_more_globeCounter)){
+      string tempStr1 ="Free ";
+      stringstream strm2;
+      strm2<<ptr;
+      string tempStr2 = strm2.str();
+      string temp = tempStr1+tempStr2;
+      vec_block_mf.push_back(temp);
+    }
+
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"Free "<<ptr<<endl;
+    }
   }
 
   //cout<<"free done"<<endl; 
@@ -1115,6 +1150,21 @@ void SwapGPU::DeploySwap_exec(int r_gc){
     last_meta.block_->update_data(nullptr);
     // cout<<"to free data_"<<last_meta.data_<<endl;
     pool_->Free(last_meta.data_);
+    ///append vec_block_mf
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+      && ((gc - maxLen) >= three_more_globeCounter)){
+      string tempStr1 ="Free ";
+      stringstream strm2;
+      strm2<<last_meta.data_;
+      string tempStr2 = strm2.str();
+      string temp = tempStr1+tempStr2;
+      vec_block_mf.push_back(temp);
+    }
+
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"Free "<<last_meta.data_<<" SwapOut(Sync)"<<endl;
+    }
     last_meta.data_ = nullptr; //not really needed TODO(junzhe)
     cout<<"----sync out "<<sync_idx<<endl;
     Table_meta.find(sync_idx)->second = last_meta;
@@ -1213,8 +1263,24 @@ void SwapGPU::Append(string blockInfo){
 
   //test moved from start of malloc/free to end of append, only gc+1 changed
   Test_sched_switch_swap();
-  //NOTE: this gc++ includes read/write and AppendLayer as well, in addition to malloc/free.
+  //NOTE: this gc includes read/write and AppendLayer as well, in addition to malloc/free.
   gc++;
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) == three_more_globeCounter)){
+    cout<<"==================to call PoolOpt"<<endl;
+    fstream file_mf_8910("mf_8910.csv", ios::in|ios::out|ios::app);
+    for (int i = 0; i< vec_block_mf.size();i++){
+      file_mf_8910<<vec_block_mf[i]<<endl;
+    }
+    cout<<"len of vec_block_mf: "<<vec_block_mf.size()<<endl;
+    pool_->PoolOpt(vec_block_mf);
+    cout<<"==================to call PoolOpt done"<<endl;
+  }
+
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) 
+    && ((gc - three_more_globeCounter)%maxLen == 0)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"-----new itr------"<<endl;
+  }
 
 }
 
@@ -1297,6 +1363,23 @@ void SwapGPU::SwapIn_idx(const int r_idx){
   //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
   void* ptr = nullptr;
   pool_->Malloc((void**)&ptr, meta.size);
+  ///append vec_block_mf
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+    && ((gc - maxLen) >= three_more_globeCounter)){
+    string tempStr1 ="Malloc ";
+    stringstream strm2;
+    strm2<<ptr;
+    string tempStr2 = strm2.str();
+    stringstream strm3;
+    strm3<<meta.size;
+    string tempStr3 = strm3.str();
+    string temp = tempStr1+tempStr2+" "+tempStr3;
+    vec_block_mf.push_back(temp);
+  }
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+    file_mf_one_itr<<"Malloc "<<ptr<<" "<<meta.size<<" swapIn"<<endl;
+  }
   //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
   //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
   //void* to_rm_ptr = meta.data_;
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc

Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@ const int kNumCudaStream = 1;`
`48`	`48`	`CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {`
`49`	`49`	`MemPoolConf conf;`
`50`	`50`	`conf.add_device(id);`
`51`		`- pool_ = std::make_shared<Swap>(conf);`
	`51`	`+ pool_ = std::make_shared<CnMemPool>(conf);`
`52`	`52`	`Setup();`
`53`	`53`	`}`
`54`	`54`
Original file line number	Diff line number	Diff line change
`@@ -128,7 +128,7 @@ Platform::CreateCudaGPUsOn(const vector<int> &devices, size_t init_size) {`
`128`	`128`	`conf.add_device(device);`
`129`	`129`	`CHECK_LE(bytes, Platform::GetGPUMemSize(device).first);`
`130`	`130`	`}`
`131`		`- auto pool = std::make_shared<CnMemPool>(conf);`
	`131`	`+ auto pool = std::make_shared<SwapPool>(conf);`
`132`	`132`
`133`	`133`	`vector<shared_ptr<Device> > ret;`
`134`	`134`	`for (auto device : devices) {`