Add multi armed bandits environment. Add bernoulli distribution wrapper

pockerman · pockerman · commit badf71e9049d · 2025-03-08T09:59:50.000Z
diff --git a/src/rlenvs/envs/multi_armed_bandits/multi_armed_banditis.cpp b/src/rlenvs/envs/multi_armed_bandits/multi_armed_banditis.cpp
@@ -0,0 +1,108 @@
+#include "rlenvs/envs/multi_armed_bandits/multi_armed_bandits.h"
+#include "rlenvs/envs/time_step_type.h"
+
+#include <exception>
+#include <stdexcept>
+#include <vector>
+
+namespace rlenvscpp{
+namespace envs{
+namespace bandits{
+
+/// \brief Definition of the static environment name
+const std::string MultiArmedBandits::name{"MultiArmedBandits"};
+
+///
+/// \brief Default constructor. Forwards the id 0 and the environment
+/// name to the base class and starts with no bandits; the bandits
+/// are created in make()
+///
+MultiArmedBandits::MultiArmedBandits()
+	: base_type(0, "MultiArmedBandits"), bandits_()
+{}
+
+///
+/// \brief Build the environment.
+/// \param version The version of the environment to build
+/// \param options The build options. Recognised keys:
+///  - "p" (required) std::vector<real_t> with one success probability per bandit
+///  - "success_reward" (optional) real_t, defaults to 1.0
+///  - "fail_reward" (optional) real_t, defaults to 0.0
+/// \throws std::logic_error if the option "p" is missing
+/// \throws std::bad_any_cast if an option holds a value of a different type
+///
+void
+MultiArmedBandits::make(const std::string& version,
+                        const std::unordered_map<std::string, std::any>& options){
+
+	const auto p_itr = options.find("p");
+	if(p_itr == options.end()){
+		throw std::logic_error("option p is missing");
+	}
+
+	// bind by reference to avoid copying the probabilities out of the std::any
+	const auto& probabilities = std::any_cast<const std::vector<real_t>&>(p_itr -> second);
+
+	// drop bandits from a previous make() so that the arms are not duplicated
+	bandits_.clear();
+	bandits_.reserve(probabilities.size());
+	for(const auto probability : probabilities){
+		bandits_.emplace_back(probability);
+	}
+
+	const auto success_reward_itr = options.find("success_reward");
+	success_reward_ = (success_reward_itr != options.end())
+	                  ? std::any_cast<real_t>(success_reward_itr -> second)
+	                  : 1.0;
+
+	// every lookup is checked against its own iterator; testing the
+	// success iterator here would dereference end() when only
+	// "success_reward" is supplied
+	const auto fail_reward_itr = options.find("fail_reward");
+	fail_reward_ = (fail_reward_itr != options.end())
+	               ? std::any_cast<real_t>(fail_reward_itr -> second)
+	               : 0.0;
+
+	this -> set_version_(version);
+	this -> set_make_options_(options);
+	this -> make_created_();
+}
+
+///
+/// \brief Reset the environment.
+/// \param seed The seed stored for the subsequent sampling in step()
+/// \param options Not used by this environment
+/// \return A FIRST time step with zero reward, a Null observation
+/// and a discount of 1.0
+///
+MultiArmedBandits::time_step_type
+MultiArmedBandits::reset(uint_t seed,
+                         const std::unordered_map<std::string, std::any>& /*options*/){
+
+	// remember the seed; step() passes it on to the sampled bandit
+	seed_ = seed;
+
+	// NOTE(review): unlike step(), the returned time step is not stored
+	// via get_current_time_step_() -- confirm this is intended
+	time_step_type first_step(TimeStepTp::FIRST, 0.0, Null(), 1.0);
+	return first_step;
+}
+
+///
+/// \brief Pull the bandit with index action.
+/// \param action The index of the bandit to sample
+/// \return A LAST time step carrying the success reward when the
+/// sampled bandit succeeds and the fail reward otherwise
+/// \throws std::logic_error if action is not a valid bandit index
+///
+MultiArmedBandits::time_step_type
+MultiArmedBandits::step(const action_type& action){
+
+	if(action >= bandits_.size()){
+		throw std::logic_error("Invalid action index");
+	}
+
+	// BernoulliDist::sample(seed) builds a fresh engine from the seed on
+	// every call. Passing the same seed each time would make an arm
+	// return the same outcome on every pull, so advance the seed after
+	// each draw. The sequence stays reproducible for a given reset() seed.
+	const auto success = bandits_[action].sample(seed_++);
+	const real_t reward = success ? success_reward_ : fail_reward_;
+
+	this -> get_current_time_step_() = MultiArmedBandits::time_step_type(TimeStepTp::LAST,
+	                                                                     reward,
+	                                                                     Null(),
+	                                                                     1.0);
+
+	return this -> get_current_time_step_();
+}
+
+///
+/// \brief Close the environment. Removes all the bandits and then
+/// delegates to the base class close()
+///
+void
+MultiArmedBandits::close(){
+
+	bandits_.clear();
+	base_type::close();
+}
+
+	
+}
+}
+}
diff --git a/src/rlenvs/envs/multi_armed_bandits/multi_armed_bandits.h b/src/rlenvs/envs/multi_armed_bandits/multi_armed_bandits.h
@@ -0,0 +1,160 @@
+#ifndef MULTI_ARMED_BANDITS_H
+#define MULTI_ARMED_BANDITS_H
+
+#include "rlenvs/rlenvs_types_v2.h"
+#include "rlenvs/envs/env_base.h"
+#include "rlenvs/envs/time_step.h"
+#include "rlenvs/utils/maths/statistics/distributions/bernoulli_dist.h"
+
+#include <vector>
+#include <string>
+#include <any>
+#include <unordered_map>
+
+
+namespace rlenvscpp{
+namespace envs{
+namespace bandits{
+	
+///
+/// \brief struct MultiArmedBanditsSpace. Bundles the types that
+/// describe the state-action space of the MultiArmedBandits environment
+///
+struct MultiArmedBanditsSpace
+{
+	/// \brief The type describing the state space (Null placeholder)
+	using state_space = Null;
+
+	/// \brief The type of the state (Null placeholder)
+	using state_type = Null;
+
+	/// \brief The type describing the action space (Null placeholder)
+	using action_space = Null;
+
+	/// \brief The type of the action executed in the environment
+	using action_type = uint_t;
+};
+	
+///
+/// \brief class MultiArmedBandits. Environment for simulating armed-bandits.
+/// The bandits are represented as Bernoulli distributions. At each step
+/// only one bandit can be executed and every step returns a LAST time step
+///
+class MultiArmedBandits final: public EnvBase<TimeStep<Null>, MultiArmedBanditsSpace>{
+
+public:
+
+	///
+	/// \brief The name of the environment
+	///
+	static const std::string name;
+
+	///
+	/// \brief The base type
+	///
+	typedef EnvBase<TimeStep<Null>, MultiArmedBanditsSpace> base_type;
+
+	///
+	/// \brief The time step type we return every time a step in the
+	/// environment is performed
+	///
+	typedef typename base_type::time_step_type time_step_type;
+
+	///
+	/// \brief The type describing the state space for the environment
+	///
+	typedef typename base_type::state_space_type state_space_type;
+
+	///
+	/// \brief The type of the action space for the environment
+	///
+	typedef typename base_type::action_space_type action_space_type;
+
+	///
+	/// \brief The type of the action to be undertaken in the environment
+	///
+	typedef typename base_type::action_type action_type;
+
+	///
+	/// \brief The type of the state of the environment
+	///
+	typedef typename base_type::state_type state_type;
+
+	///
+	/// \brief MultiArmedBandits Constructor
+	///
+	MultiArmedBandits();
+
+	///
+	/// \brief make. Builds the environment.
+	/// \param version. the version of the environment to build
+	/// \param options. Options to use for building the environment.
+	/// The key "p" is required and holds a std::vector<real_t> with one
+	/// success probability per bandit. The keys "success_reward" and
+	/// "fail_reward" are optional and hold a real_t
+	/// \throws std::logic_error if the option "p" is missing
+	///
+	virtual void make(const std::string& version,
+	                  const std::unordered_map<std::string, std::any>& options)override final;
+
+	///
+	/// \brief close the environment. Removes all the bandits
+	///
+	virtual void close()override final;
+
+	///
+	/// \brief Reset the environment
+	/// \param seed. The seed to use for sampling the bandits
+	/// \param options. Options to use for resetting the environment.
+	/// Currently not used
+	/// \return A FIRST time step with zero reward
+	///
+	virtual time_step_type reset(uint_t seed,
+	                             const std::unordered_map<std::string, std::any>& options)override final;
+
+	///
+	/// \brief step in the environment by performing the given action
+	/// \param action. The index of the bandit to sample
+	/// \return An instance of time_step_type
+	/// \throws std::logic_error if action is not a valid bandit index
+	///
+	virtual time_step_type step(const action_type& action)override final;
+
+private:
+
+	///
+	/// \brief The seed to use. Initialised so that step() never reads
+	/// an indeterminate value when reset() has not been called yet
+	///
+	uint_t seed_{0};
+
+	///
+	/// \brief The reward to return on success.
+	/// Default is one
+	///
+	real_t success_reward_{1.0};
+
+	///
+	/// \brief The reward to return on fail.
+	/// Default is zero
+	///
+	real_t fail_reward_{0.0};
+
+	///
+	/// \brief Every bandit is represented as a Bernoulli distribution
+	///
+	std::vector<utils::maths::stats::BernoulliDist> bandits_;
+
+};
+	
+}	
+}
+}
+
+
+
+#endif
diff --git a/src/rlenvs/utils/maths/statistics/distributions/bernoulli_dist.cpp b/src/rlenvs/utils/maths/statistics/distributions/bernoulli_dist.cpp
@@ -0,0 +1,65 @@
+#include "rlenvs/utils/maths/statistics/distributions/bernoulli_dist.h"
+
+
+namespace rlenvscpp {
+namespace utils{
+namespace maths {
+namespace stats {
+
+
+///
+/// \brief Constructor. Stores p and builds the wrapped distribution from it
+/// \param p The parameter of the distribution
+///
+// NOTE(review): p is declared as result_type, the type returned by
+// sample(). If that is a boolean type the probability is truncated
+// before it reaches dist_ -- confirm against the header
+BernoulliDist::BernoulliDist(result_type p)
+	: p_(p), dist_(p)
+{}
+
+///
+/// \brief Draw one sample using an engine seeded from std::random_device
+/// \return The sampled value
+///
+BernoulliDist::result_type
+BernoulliDist::sample() const{
+
+	std::random_device entropy{};
+	std::mt19937 engine{entropy()};
+	const auto outcome = dist_(engine);
+	return outcome;
+}
+
+
+///
+/// \brief Draw one sample using a fresh engine built from the given seed.
+/// The same seed therefore always produces the same sample
+/// \param seed The seed of the engine
+/// \return The sampled value
+///
+BernoulliDist::result_type
+BernoulliDist::sample(uint_t seed) const{
+
+	// cast explicitly: brace-initialising the engine from a wider uint_t is a
+	// narrowing conversion where the engine's result_type is 32 bits wide
+	std::mt19937 gen(static_cast<std::mt19937::result_type>(seed));
+	return dist_(gen);
+}
+
+
+///
+/// \brief Draw size samples using one engine seeded from std::random_device
+/// \param size The number of samples to draw
+/// \return The samples in the order they were drawn
+///
+std::vector<BernoulliDist::result_type>
+BernoulliDist::sample_many(uint_t size) const{
+
+	std::vector<BernoulliDist::result_type> draws(size);
+	std::random_device entropy{};
+	std::mt19937 engine{entropy()};
+
+	// auto&& also binds to proxy references
+	for(auto&& slot : draws){
+		slot = dist_(engine);
+	}
+
+	return draws;
+}
+
+
+///
+/// \brief Draw size samples using one engine built from the given seed
+/// \param size The number of samples to draw
+/// \param seed The seed of the engine
+/// \return The samples in the order they were drawn
+///
+std::vector<BernoulliDist::result_type>
+BernoulliDist::sample_many(uint_t size, uint_t seed) const{
+
+	std::vector<BernoulliDist::result_type> draws(size);
+	std::mt19937 engine(seed);
+
+	for(auto slot = draws.begin(); slot != draws.end(); ++slot){
+		*slot = dist_(engine);
+	}
+
+	return draws;
+}
+	
+}
+}
+}
+}
diff --git a/src/rlenvs/utils/maths/statistics/distributions/bernoulli_dist.h b/src/rlenvs/utils/maths/statistics/distributions/bernoulli_dist.h