re-enable folding

terhorst · terhorst · commit c3c97b96cb4b · 2017-01-22T12:43:40.000-08:00
diff --git a/include/bin_key.h b/include/bin_key.h
@@ -7,32 +7,28 @@ struct bin_key
     template <typename Derived1>
     static std::set<block_key> run(
             const block_key &key, 
-            const Eigen::MatrixBase<Derived1> &na,
-            const bool);
+            const Eigen::MatrixBase<Derived1> &na);
 
     template <typename Derived1, typename Derived2>
     static std::set<block_key> run(
         const Eigen::MatrixBase<Derived1> &key, 
-        const Eigen::MatrixBase<Derived2> &na,
-        const bool);
+        const Eigen::MatrixBase<Derived2> &na);
 };
 
 template <size_t P>
 template <typename Derived1>
 std::set<block_key> bin_key<P>::run(
         const block_key &key, 
-        const Eigen::MatrixBase<Derived1> &na,
-        const bool enabled)
+        const Eigen::MatrixBase<Derived1> &na)
 {
-    return bin_key<P>::run(key.vals, na, enabled);
+    return bin_key<P>::run(key.vals, na);
 }
 
 template <>
 template <typename Derived1, typename Derived2>
 std::set<block_key> bin_key<1>::run(
         const Eigen::MatrixBase<Derived1> &key, 
-        const Eigen::MatrixBase<Derived2> &na,
-        const bool enabled)
+        const Eigen::MatrixBase<Derived2> &na)
 {
     Vector<int> tmp = key;
     std::set<block_key> init, ret;
@@ -45,32 +41,17 @@ std::set<block_key> bin_key<1>::run(
         }
     else
         init.emplace(tmp);
-    if (not enabled)
-        return init;
-    for (const block_key &k : init)
-    {
-        const int nseg = k(0) + k(1);
-        const int nb = k(2);
-        for (int aa = std::max(0, nseg - nb); aa <= std::min(na(0), nseg); ++aa)
-        {
-            const int bb = nseg - aa;
-            tmp(0) = aa;
-            tmp(1) = bb;
-            ret.emplace(tmp);
-        }
-    }
-    return ret;
+    return init;
 }
 
 template <size_t P>
 template <typename Derived1, typename Derived2>
 std::set<block_key> bin_key<P>::run(
         const Eigen::MatrixBase<Derived1> &key, 
-        const Eigen::MatrixBase<Derived2> &na,
-        const bool enabled)
+        const Eigen::MatrixBase<Derived2> &na)
 {
-    std::set<block_key> bk1 = bin_key<1>::run(key.head(3), na.head(1), enabled);
-    std::set<block_key> bk2 = bin_key<P - 1>::run(key.tail(3 * (P - 1)), na.tail(P - 1), enabled);
+    std::set<block_key> bk1 = bin_key<1>::run(key.head(3), na.head(1));
+    std::set<block_key> bk2 = bin_key<P - 1>::run(key.tail(3 * (P - 1)), na.tail(P - 1));
     std::set<block_key> ret;
     Vector<int> v(3 * P);
     for (const block_key& b1 : bk1)
diff --git a/include/block_key.h b/include/block_key.h
@@ -10,6 +10,7 @@ struct block_key
     Vector<int> vals;
 
     int operator()(int k) const { return vals(k); }
+    int& operator()(int k) { return vals.coeffRef(k); }
 
     int size() const { return vals.size(); }
 
diff --git a/include/inference_manager.h b/include/inference_manager.h
@@ -30,7 +30,7 @@ class InferenceManager
 
     void setParams(const ParameterVector &params);
 
-    bool saveGamma, folded;
+    bool saveGamma;
     std::vector<double> hidden_states;
     std::map<block_key, Vector<adouble> > emission_probs;
     std::vector<Matrix<double>*> getXisums();
@@ -87,11 +87,11 @@ class NPopInferenceManager : public InferenceManager
             const std::vector<int*> observations,
             const std::vector<double> hidden_states,
             ConditionedSFS<adouble> *csfs,
-            const bool binning) :
+            const bool fold) :
         InferenceManager(P,
                 (na.tail(na.size() - 1).array() + 1).prod() * (n.array() + 1).prod(),
                 obs_lengths, observations, hidden_states, csfs),
-                n(n), na(na), tensordims(make_tensordims()), bins(construct_bins(binning))
+                n(n), na(na), tensordims(make_tensordims()), bins(construct_bins(fold))
     {}
 
     virtual ~NPopInferenceManager() = default;
@@ -100,6 +100,8 @@ class NPopInferenceManager : public InferenceManager
     protected:
     // Virtual overrides
     void recompute_emission_probs();
+    bool is_monomorphic(const block_key&);
+    block_key folded_key(const block_key&);
     FixedVector<int, 2 * P> make_tensordims();
     block_key bk_to_map_key(const block_key &bk);
 
diff --git a/smcpp/_smcpp.pxd b/smcpp/_smcpp.pxd
@@ -50,7 +50,6 @@ cdef extern from "inference_manager.h":
         vector[adouble] Q() except +
         bool debug
         bool saveGamma
-        bool folded
         vector[double] hidden_states
         vector[pMatrixD] getGammas()
         vector[pMatrixD] getXisums()
diff --git a/smcpp/_smcpp.pyx b/smcpp/_smcpp.pyx
@@ -147,13 +147,6 @@ cdef class _PyInferenceManager:
     def __dealloc__(self):
         del self._im
 
-    property folded:
-        def __get__(self):
-            return self._im.folded
-
-        def __set__(self, bint f):
-            self._im.folded = f
-
     property observations:
         def __get__(self):
             return self._observations
@@ -299,11 +292,11 @@ cdef class _PyInferenceManager:
 
 cdef class PyOnePopInferenceManager(_PyInferenceManager):
 
-    def __cinit__(self, int n, observations, hidden_states, im_id):
+    def __cinit__(self, int n, observations, hidden_states, im_id, bool fold):
         # This is needed because cinit cannot be inherited
         self.__my_cinit__(observations, hidden_states, im_id)
         with nogil:
-            self._im = new OnePopInferenceManager(n, self._Ls, self._obs_ptrs, self._hs, False)
+            self._im = new OnePopInferenceManager(n, self._Ls, self._obs_ptrs, self._hs, fold)
 
     @property
     def pid(self):
@@ -322,7 +315,7 @@ cdef class PyTwoPopInferenceManager(_PyInferenceManager):
     cdef TwoPopInferenceManager* _im2
     cdef int _a1
 
-    def __cinit__(self, int n1, int n2, int a1, int a2, observations, hidden_states, im_id):
+    def __cinit__(self, int n1, int n2, int a1, int a2, observations, hidden_states, im_id, bool fold):
         # This is needed because cinit cannot be inherited
         assert a1 + a2 == 2
         assert a1 in [1, 2]
@@ -331,7 +324,7 @@ cdef class PyTwoPopInferenceManager(_PyInferenceManager):
         self.__my_cinit__(observations, hidden_states, im_id)
         assert a1 in [1, 2], "a2=2 is not supported"
         with nogil:
-            self._im2 = new TwoPopInferenceManager(n1, n2, a1, a2, self._Ls, self._obs_ptrs, self._hs, False)
+            self._im2 = new TwoPopInferenceManager(n1, n2, a1, a2, self._Ls, self._obs_ptrs, self._hs, fold)
             self._im = self._im2
 
     @targets("model update")
diff --git a/smcpp/analysis.py b/smcpp/analysis.py
@@ -154,7 +154,7 @@ def _init_hidden_states(self, prior_model, M):
             )
         logger.debug("%d hidden states:\n%s" % (len(self._hidden_states), str(self._hidden_states)))
 
-    def _init_inference_manager(self, folded):
+    def _init_inference_manager(self, fold):
         ## Create inference object which will be used for all further calculations.
         logger.debug("Creating inference manager...")
         d = {}
@@ -166,10 +166,12 @@ def _init_inference_manager(self, folded):
             k = (pid, n, a)
             data = [contig.data for contig in d[k]]
             if len(pid) == 1:
-                im = _smcpp.PyOnePopInferenceManager(n[0], data, self._hidden_states, k)
+                im = _smcpp.PyOnePopInferenceManager(n[0], data, 
+                        self._hidden_states, k, fold)
             else:
                 assert len(pid) == 2
-                im = _smcpp.PyTwoPopInferenceManager(n[0], n[1], a[0], a[1], data, self._hidden_states, k)
+                im = _smcpp.PyTwoPopInferenceManager(n[0], n[1], a[0], a[1], 
+                        data, self._hidden_states, k, fold)
             im.model = self._model
             im.theta = self._theta
             im.rho = self._rho
@@ -276,7 +278,7 @@ def __init__(self, files, args):
 
         if not args.no_initialize:
             self._hidden_states = np.array([0., np.inf])
-            self._init_inference_manager(False)
+            self._init_inference_manager(args.fold)
             self._init_optimizer(args, files, args.outdir, args.block_size,
                     args.algorithm, args.tolerance, learn_rho=False)
             self._optimizer.run(1)
@@ -286,7 +288,7 @@ def __init__(self, files, args):
 
         # Continue initializing
         self._init_hidden_states(args.prior_model, args.M)
-        self._init_inference_manager(False)
+        self._init_inference_manager(args.fold)
         self._init_optimizer(args, files, args.outdir, args.block_size,
                 args.algorithm, args.tolerance, learn_rho=True)
 
diff --git a/smcpp/commands/command.py b/smcpp/commands/command.py
@@ -21,9 +21,9 @@ def add_common_estimation_args(parser):
     data.add_argument('--no-filter', help="do not drop contigs with extreme heterozygosity. "
                                           "(not recommended unless data set is small)",
                       action="store_true", default=False)
-    # data.add_argument("--folded", action="store_true", default=False,
-    #                         help="use folded SFS for emission probabilites. "
-    #                              "useful if polarization is not known.")
+    data.add_argument("--fold", action="store_true", default=False,
+                            help="use folded SFS for emission probabilities "
+                                 "(if polarization is not known).")
 
     optimizer = parser.add_argument_group("Optimization parameters")
     optimizer.add_argument(
diff --git a/src/inference_manager.cpp b/src/inference_manager.cpp
@@ -23,7 +23,7 @@ InferenceManager::InferenceManager(
         const std::vector<int*> observations,
         const std::vector<double> hidden_states,
         ConditionedSFS<adouble> *csfs) :
-    saveGamma(false), folded(false),
+    saveGamma(false),
     hidden_states(hidden_states),
     npop(npop),
     sfs_dim(sfs_dim),
@@ -271,7 +271,42 @@ block_key NPopInferenceManager<P>::bk_to_map_key(const block_key &bk)
 }
 
 template <size_t P>
-std::map<block_key, std::map<block_key, double> > NPopInferenceManager<P>::construct_bins(const bool binning)
+bool NPopInferenceManager<P>::is_monomorphic(const block_key &bk)  // true iff every population p has bk(3p) == na(p) and bk(3p + 1) == bk(3p + 2)
+{
+    for (unsigned int p = 0; p < P; ++p)
+    {
+        const int ind = 3 * p;  // population p owns key entries ind, ind + 1, ind + 2 (presumably (a, b, nb) -- confirm against block_key users)
+        if (bk(ind) != na(p) or bk(ind + 1) != bk(ind + 2))  // any population failing either equality disqualifies the key
+            return false;
+    }
+    return true;  // all P populations passed both checks
+}
+
+template <size_t P>
+block_key NPopInferenceManager<P>::folded_key(const block_key &bk)  // returns a copy of bk with each population's first two entries complemented
+{
+    block_key ret = bk;  // work on a copy; bk itself is never modified
+    for (unsigned int p = 0; p < P; ++p)
+    {
+        const int ind = 3 * p;  // population p owns key entries ind, ind + 1, ind + 2
+        ret(ind) = na(p) - bk(ind);  // first entry complemented against na(p)
+        ret(ind + 1) = bk(ind + 2) - bk(ind + 1);  // second entry complemented against the third entry
+        ret(ind + 2) = bk(ind + 2);  // third entry carried over unchanged
+    }
+    if (is_monomorphic(ret))  // complemented key has ret(3p) == na(p) and ret(3p + 1) == ret(3p + 2) for every p
+    {
+        for (unsigned int p = 0; p < P; ++p)
+        {
+            const int ind = 3 * p;
+            ret(ind) = 0;  // reset the first entry of each population
+            ret(ind + 1) = ret(ind + 2);  // NOTE(review): already equal here because is_monomorphic(ret) held, so this is a no-op -- confirm 0 was not intended
+        }
+    }
+    return ret;
+}
+
+template <size_t P>
+std::map<block_key, std::map<block_key, double> > NPopInferenceManager<P>::construct_bins(const bool fold)
 {
     std::map<block_key, std::map<block_key, double> > ret;
     for (auto ob : obs)
@@ -283,7 +318,14 @@ std::map<block_key, std::map<block_key, double> > NPopInferenceManager<P>::const
             if (ret.count(bk) == 0)
             {
                 std::map<block_key, double> m;
-                for (const block_key &kbin : bin_key<P>::run(bk, na, binning))
+                std::set<block_key> new_keys;
+                for (const block_key &k : bin_key<P>::run(bk, na))
+                {
+                    new_keys.emplace(k);
+                    if (fold) 
+                        new_keys.emplace(folded_key(k));
+                }
+                for (const block_key &kbin : new_keys)
                     for (const auto &p : marginalize_key<P>::run(kbin.vals, n, na))
                         m[bk_to_map_key(p.first)] += p.second;
                 ret[bk] = m;
@@ -341,23 +383,6 @@ void NPopInferenceManager<P>::recompute_emission_probs()
 #pragma omp parallel for
     for (auto it = bpm_keys.begin(); it < bpm_keys.end(); ++it)
     {
-        // std::set<block_key> keys;
-        // keys.insert(key);
-        /*
-        if (this->folded)
-        {
-            Vector<int> new_key(it->size());
-            for (size_t p = 0; p < P; ++p)
-            {
-                int a = key(3 * p);
-                int b = key(3 * p + 1);
-                int nb = key(3 * p + 2);
-                new_key(1 + 2 * p) = nb - b;
-                new_key(2 + 2 * p) = nb;
-            }
-            keys.emplace(new_key);
-        }
-        */
         const block_key k = *it;
         std::array<std::set<FixedVector<int, 3> >, P> s;
         Vector<adouble> tmp(M);
@@ -379,8 +404,12 @@ void NPopInferenceManager<P>::recompute_emission_probs()
                 tmp = e2.col(a.sum() % 2);
         }
         else
+        {
             for (const auto &p : bins.at(k))
+            {
                 tmp += p.second * tensorRef(p.first);
+            }
+        }
         if (tmp.maxCoeff() > 1.0 or tmp.minCoeff() <= 0.0)
         {
             std::cout << k << std::endl;
@@ -421,13 +450,13 @@ OnePopInferenceManager::OnePopInferenceManager(
             const std::vector<int> obs_lengths,
             const std::vector<int*> observations,
             const std::vector<double> hidden_states,
-            const bool binning) :
+            const bool fold) :
         NPopInferenceManager(
                 FixedVector<int, 1>::Constant(n),
                 FixedVector<int, 1>::Constant(2),
                 obs_lengths, observations, hidden_states, 
                 new OnePopConditionedSFS<adouble>(n),
-                binning) {}
+                fold) {}
 
 JointCSFS<adouble>* create_jcsfs(int n1, int n2, int a1, int a2, const std::vector<double> &hidden_states)
 {
@@ -442,13 +471,13 @@ TwoPopInferenceManager::TwoPopInferenceManager(
             const std::vector<int> obs_lengths,
             const std::vector<int*> observations,
             const std::vector<double> hidden_states,
-            const bool binning) :
+            const bool fold) :
         NPopInferenceManager(
                 (FixedVector<int, 2>() << n1, n2).finished(),
                 (FixedVector<int, 2>() << a1, a2).finished(),
                 obs_lengths, observations, hidden_states, 
                 create_jcsfs(n1, n2, a1, a2, hidden_states),
-                binning), a1(a1), a2(a2)
+                fold), a1(a1), a2(a2)
 {
     if (a1 + a2 != 2)
         throw std::runtime_error("configuration not supported");
diff --git a/test/integration/test.sh b/test/integration/test.sh
@@ -4,13 +4,17 @@ set -e
 $SMC vcf2smc example/example.vcf.gz /tmp/example.1.smc.gz 1 msp1:msp_0,msp_1
 $SMC vcf2smc example/example.vcf.gz /tmp/example.2.smc.gz 1 msp2:msp_2
 $SMC vcf2smc example/example.vcf.gz /tmp/example.12.smc.gz 1 msp1:msp_0,msp_1 msp2:msp_2
-$SMC estimate -o /tmp/out/1 --theta .00025 --em-iterations 1 /tmp/example.1.smc.gz
+$SMC estimate -o /tmp/out/1 --theta .00025 --fold --em-iterations 1 /tmp/example.1.smc.gz
 $SMC estimate -o /tmp/out/2 --theta .00025 --em-iterations 1 /tmp/example.2.smc.gz
-$SMC estimate -o /tmp/out/12 --theta .00025 --em-iterations 1 /tmp/example.12.smc.gz
+$SMC estimate -o /tmp/out/12 --fold --theta .00025 --em-iterations 1 /tmp/example.12.smc.gz
 $SMC split -o /tmp/out/split --em-iterations 1 \
     /tmp/out/1/model.final.json \
     /tmp/out/2/model.final.json \
     /tmp/example.*.smc.gz
+$SMC split --fold -o /tmp/out/split --em-iterations 1 \
+    /tmp/out/1/model.final.json \
+    /tmp/out/2/model.final.json \
+    /tmp/example.*.smc.gz
 $SMC plot -c -g 29 --logy /tmp/1.png /tmp/out/1/model.final.json
 $SMC plot /tmp/2.pdf /tmp/out/2/model.final.json
 $SMC plot -c --logy /tmp/12.png /tmp/out/12/model.final.json