Skip to content

Commit 1894eba

Browse files
arthur gilly authored
and claude committed
Fix Digby exponent and variance formula, add simulation notebook
- Change default Digby tetrachoric exponent from 0.75 to pi/4 (0.7854), reducing bias in correlation estimates (e.g. from -0.02 to -0.003 at rho=0.5) - Add --digby-exponent CLI option to override the exponent - Fix missing factor of 2 in produit_reciproque_asymetrique(), which caused underestimation of the corrected z-score and beta standard errors, inflating type 1 error rates under sample overlap - Remove ~300 lines of dead code (commented-out legacy meta-analysis and 2-study prototype with undefined tetrachoric() call) - Add simulation notebook (sim_compare_fixes.ipynb) comparing old vs new configurations across overlap levels Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 097b928 commit 1894eba

File tree

2 files changed

+218
-306
lines changed

2 files changed

+218
-306
lines changed

src/metacarpa.cpp

Lines changed: 9 additions & 306 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
namespace ublas = boost::numeric::ublas;
2626
bool VERBOSE=false;
27+
double DIGBY_EXPONENT=0.7853981633974483; // pi/4
2728
/* Matrix inversion routine.
2829
Uses lu_factorize and lu_substitute in uBLAS to invert a matrix */
2930
template<class T>
@@ -189,7 +190,7 @@ inline long double produit_reciproque_asymetrique(boost::numeric::ublas::matrix<
189190
ret+= addendum;
190191
}
191192
}
192-
return ret;
193+
return 2*ret;
193194
}
194195

195196
// inline std::vector<long double> produit(std::vector<long double>& u, std::vector<long double>& v) {
@@ -665,7 +666,7 @@ class studies_correlation {
665666
for(unsigned int j=i+1;j<nstudies;j++){
666667
long double alpha=m(i,j).get_alpha();
667668
if(VERBOSE){info("between study ",i,"and",j, " the alpha is ",alpha);}
668-
correlations[mask](i,j)=(pow(alpha, 0.75)-1)/(pow(alpha, 0.75)+1);
669+
correlations[mask](i,j)=(pow(alpha, DIGBY_EXPONENT)-1)/(pow(alpha, DIGBY_EXPONENT)+1);
669670
if(correlations[mask](i,j)==-1){correlations[mask](i,j)=0;}
670671
}
671672
}
@@ -970,6 +971,7 @@ NB:\tMETACARPA currently supports only one header line in input files, which is
970971
("stop,x", "Stop METACARPA after generating the matrix.")
971972
("debug,d", "Toggles an extremely verbose output, for debugging purposes only.")
972973
("use-beta-sign", "Use sign of beta for dichotomization in correlation matrix calculation (as described in Southam et al. 2017), instead of the default p-value transform.")
974+
("digby-exponent", po::value<double>(), "Exponent used in Digby's tetrachoric correlation approximation: rho = (alpha^e - 1)/(alpha^e + 1). Default: pi/4 = 0.7854.")
973975

974976

975977
// ("ss1,1", po::value<int>(), "Sample size for study 1 (in order of join).")
@@ -1069,6 +1071,11 @@ if(vm.count("use-beta-sign")){
10691071
info("Using sign of beta for dichotomization (Southam et al. 2017 method).");
10701072
}
10711073

1074+
if(vm.count("digby-exponent")){
1075+
DIGBY_EXPONENT=vm["digby-exponent"].as<double>();
1076+
info("Using Digby exponent = "+to_string(DIGBY_EXPONENT));
1077+
}
1078+
10721079
if (vm.count("input") && vm.count("output")) {
10731080
ifiles=vm["input"].as<vector<string>>();
10741081
if(ifiles.size()==1){error("Only one input file. Nothing to do.");}
@@ -1600,307 +1607,3 @@ info("Goodbye.");
16001607
return 0;
16011608

16021609
}
1603-
1604-
1605-
1606-
// DEFUNCT CODE
1607-
// OLD BODY OF META_ANALYSE (WITH COMMENTS)
1608-
1609-
1610-
// ord.chrpos=working_rs[0];
1611-
// ord.rsid=working_id[0];
1612-
// ord.a1=working_a1[0];
1613-
// ord.a2=working_a2[0];
1614-
1615-
// unsigned int j=0;
1616-
// for(unsigned short int i=0; i<ifiles.size();i++){
1617-
// if((working_mask & (1<<i)) == 0){
1618-
// ord.status.push_back('?');
1619-
// }else{
1620-
// //if((mask & mask-1)==0){error("mask is a power of two: "+to_string(mask)+" whereas i="+to_string(i)+" and maskcheck is "+to_string(1<<i));}
1621-
// char effect_dir=working_betas[j] > 0? '+' : '-';
1622-
// j++;
1623-
// ord.status.push_back(effect_dir);
1624-
// }
1625-
// }
1626-
1627-
// if(bit_count(working_mask)>1){
1628-
// // compute weights
1629-
// int sum_ss=somme(working_weights);
1630-
// i=0;
1631-
// //for(long double l : working_weights){working_weights[i]/=sum_ss;i++;}
1632-
1633-
// for(long double l : working_weights){working_weights[i]=sqrt(working_weights[i]);working_weights[i]/=sqrt(sum_ss);i++;}
1634-
1635-
1636-
// // compute transforms and sum thereof
1637-
// i=0;
1638-
// vector<cpp_dec_float_1000> zs(working_weights.size());
1639-
// vector<cpp_dec_float_1000> zs_fess(working_weights.size());
1640-
// for(long double l : working_ps){
1641-
// zs[i]=z_transform_fess(working_ps[i], working_betas[i],working_rs[i]);
1642-
// zs_fess[i]=z_transform_fess(working_ps[i], working_betas[i],working_rs[i]);
1643-
// i++;
1644-
// }
1645-
// ord.z=somme(produit(zs, working_weights));
1646-
1647-
1648-
// ord.z_fess=somme(produit(zs_fess, working_weights));
1649-
// // compute p-value SD
1650-
// // the first product in the line below sums to 1 by definition
1651-
// //ord.zse=sqrt(somme(produit(working_weights, working_weights))+produit_reciproque_asymetrique(correlations.getmat(working_mask), working_weights));
1652-
// ord.zse=sqrt(1+produit_reciproque_asymetrique(correlations.getmat(working_mask), working_weights));
1653-
1654-
1655-
// //compute matrix for beta weights
1656-
// //info("Working mask ", working_mask);
1657-
// ublas::matrix<long double> c=correlations.getmat(working_mask);
1658-
// for(i=0;i<c.size1();i++){
1659-
// for(unsigned short int j=0;j<c.size1();j++){
1660-
// if(i>j){c(i,j)=c(j,i);}else if (i==j){c(i,j)=1;}
1661-
// c(i,j)=c(i,j)*working_betase[i]*working_betase[j];
1662-
// }
1663-
// }
1664-
// // calculate weights
1665-
// // * sum of columns/total sum of matrix
1666-
// ublas::matrix<long double> inverse(c.size1(),c.size1());
1667-
1668-
// if(!InvertMatrix(c,inverse)){correlations.print(c);error("Could not invert variance/covariance matrix.");}
1669-
1670-
// working_weights=colsum(inverse);
1671-
// long double totsum=somme(working_weights);
1672-
// for(i=0;i<working_weights.size();i++) working_weights[i]/=totsum;
1673-
1674-
// // calculate betase
1675-
// std::vector<long double> working_var=produit(working_betase, working_betase);
1676-
// ord.betase=sqrt(somme(produit(produit(working_weights, working_weights), working_var)) + produit_reciproque_asymetrique(c, working_weights));
1677-
1678-
// // meta-beta
1679-
// ord.beta=somme(produit(working_weights, working_betas));
1680-
// // compute meta-analysis p-value
1681-
// boost::math::normal_distribution<long double> correctednormal(0.0, ord.zse);
1682-
// boost::math::normal_distribution<cpp_dec_float_1000> correctednormal_p(0.0, ord.zse);
1683-
// boost::math::normal_distribution<long double> wald(0.0, 1);
1684-
// boost::math::normal_distribution<cpp_dec_float_1000> wald_p(0.0, 1);
1685-
1686-
// try{
1687-
// ord.p=2*(cdf(correctednormal, -1*abs(static_cast<long double>(ord.z))));
1688-
// //ord.p_uncorrected=2*(1-cdf(wald, abs(static_cast<long double>(ord.z))));
1689-
// //ord.p=1-cdf(correctednormal, -1*abs(static_cast<long double>(ord.z)));
1690-
// // ord.p_uncorrected=1-cdf(wald, -1*abs(static_cast<long double>(ord.z)));
1691-
// ord.p_fess=2*cdf(wald, -1*abs(static_cast<long double>(ord.z_fess)));
1692-
// //ord.p=1-cdf(correctednormal, static_cast<long double>(ord.z));
1693-
// // ord.p_uncorrected=1-cdf(wald, static_cast<long double>(ord.z));
1694-
// ord.p_wald=2*cdf(wald, -1*abs(ord.beta/ord.betase));
1695-
// if(ord.p==0){
1696-
1697-
// //info("Warning, position "+minimum+" meta-analyses below long double precision. Analysing with multiprecision...");
1698-
// //ord.p =1-cdf(correctednormal_p, -1*abs(ord.z));
1699-
// //ord.p =1-cdf(correctednormal_p, ord.z);
1700-
// // ord.p_uncorrected=1-cdf(wald_p, abs(ord.z));
1701-
// ord.p_wald=2*cdf(wald_p, -1*abs(ord.beta/ord.betase));
1702-
// ord.p=2*(cdf(correctednormal_p, -1*abs(ord.z)));
1703-
// ord.p_fess=2*cdf(wald_p, -1*abs(ord.z_fess));
1704-
// //ord.p_uncorrected=2*(1-cdf(correctednormal_p, abs(ord.z)));
1705-
1706-
// }
1707-
// }catch(exception e){
1708-
// // ord.p=1-cdf(correctednormal_p, -1*abs(ord.z));
1709-
// //ord.p=1-cdf(correctednormal_p, ord.z);
1710-
// // ord.p_uncorrected=1-cdf(wald_p, -1*abs(ord.z));
1711-
// ord.p=2*(cdf(correctednormal, -1*abs(ord.z)));
1712-
// //ord.p_uncorrected=2*(1-cdf(wald_p, abs(ord.z)));
1713-
// ord.p_wald=2*cdf(wald_p, -1*abs(ord.beta/ord.betase));
1714-
// ord.p_fess=2*cdf(wald_p, -1*abs(ord.z_fess));
1715-
// }
1716-
1717-
// }else{
1718-
// // SNP present in only 1 study, skip all calculations
1719-
// ord.rsid=working_id[0];
1720-
// ord.beta=working_betas[0];
1721-
// ord.betase=working_betase[0];
1722-
// ord.p=-1;
1723-
// ord.p_wald=-1;
1724-
// ord.p_fess=-1;
1725-
// ord.z=-1;
1726-
// ord.zse=-1;
1727-
// ord.p_uncorrected=-1;
1728-
// ord.a1=working_a1[0];
1729-
// ord.a2=working_a2[0];
1730-
// }
1731-
// ord.print(ofs);
1732-
/*
1733-
map <string, position_info, cmp_rsid> thisstudy;
1734-
bool firstline=true;
1735-
info("Calculating z-scores and binomial transformation for file "+filename+"...");
1736-
string tempLine;
1737-
boost::math::normal_distribution<long double> standardnormal;
1738-
boost::math::normal_distribution<cpp_dec_float_1000> standardnormal_p;
1739-
means[i]=0;
1740-
while ( getline(inputFile, tempLine, '\n') ) {
1741-
if(firstline==true){firstline=false;continue;}
1742-
vector <string> line;
1743-
stringstream ss(tempLine);
1744-
string temp;
1745-
while (getline(ss, temp, '\t')) {
1746-
line.push_back(temp);
1747-
}
1748-
string chrpos=line[0]+":"+line[2];
1749-
position_info thisposition;
1750-
//thisposition.beta=stold(line[7]);
1751-
//means[i]=means[i]+thisposition.beta;
1752-
//meancount[i]++;
1753-
//thisposition.sdbeta=stold(line[8]);
1754-
thisposition.pvalue=stold(line[13]);
1755-
thisposition.rsid=line[1];
1756-
try
1757-
{
1758-
thisposition.zf=(cpp_dec_float_1000)(quantile(standardnormal, 1-thisposition.pvalue/2)*sign(thisposition.beta));
1759-
thisposition.bpval=(thisposition.zf<=0);
1760-
thisposition.need_precision=false;
1761-
}catch(exception &ex){
1762-
thisposition.need_precision=true;
1763-
info("\tPosition "+chrpos+" could not be transformed with normal machine precision. Using arbitrary precision (p="+line[13]+")");
1764-
cpp_dec_float_1000 p1=1-(cpp_dec_float_1000)thisposition.pvalue/2;
1765-
thisposition.zf=quantile(standardnormal_p, p1)*sign(thisposition.beta);
1766-
thisposition.bpval=(thisposition.zf<=0);
1767-
}
1768-
1769-
thisstudy[chrpos]=thisposition;
1770-
if(where_present.count(chrpos)==0){
1771-
where_present[chrpos]=current_mask;
1772-
}else{
1773-
where_present[chrpos]=where_present[chrpos] | current_mask;
1774-
}
1775-
1776-
}
1777-
studydata.push_back(thisstudy);
1778-
means[i]=means[i]/meancount[i];
1779-
// This is the mean beta for the whole of the file
1780-
// It can be quite different from the mean calculated on overlapping positions only
1781-
info("Mean effect for file "+s+" is "+to_string(means[i])+" ("+to_string(meancount[i])+" rows)");
1782-
i++;
1783-
});
1784-
int removed=0;
1785-
1786-
1787-
1788-
// All studies have been read. Do we have positions only present in one study?
1789-
// typedef map <string, unsigned short int, cmp_rsid>::iterator it_type;
1790-
// for(it_type it =where_present.begin(); it != where_present.end();){
1791-
// unsigned short int mask=it->second;
1792-
// if((mask & mask-1)==0){removed++;}
1793-
// ++it;
1794-
// }
1795-
// info(to_string(removed)+" positions were only genotyped in one study. Sadly, they will be lost.");
1796-
1797-
1798-
// Time to compute relatedness matrix.
1799-
// This calculates the correlations using the binomially tansformed betas.
1800-
// It is the main innovation introduced by Province and Borecki
1801-
info("Calculating p-value correlation matrix...");
1802-
vector<vector<long double>> cormat(ifiles.size(), vector<long double>(ifiles.size(), 0));
1803-
i=0;
1804-
vector<long double> sds(ifiles.size(), 0);
1805-
vector<bool> done(ifiles.size(), false);
1806-
vector<int> counts(ifiles.size(), 0);
1807-
for_each(ifiles.begin(), ifiles.end(), [&](string s)-> void {
1808-
unsigned short int j=i+1;
1809-
unsigned short int expj=1<<j;
1810-
unsigned short int expi=1<<i;
1811-
for_each(next(ifiles.begin(),i+1), ifiles.end(), [&](string t)-> void {
1812-
// get all positions that have at least these two studies in common
1813-
info("between file "+s+" and file "+t);
1814-
if (i==j){cormat[i][j]=1;}else{
1815-
typedef map <string, unsigned short int, cmp_rsid>::iterator it_type;
1816-
vector<bool> a;
1817-
vector<bool> b;
1818-
for(it_type it =where_present.begin(); it != where_present.end();){
1819-
unsigned short int mask=it->second;
1820-
string chrpos=it->first;
1821-
// Here we calculate SDs for entire studies...
1822-
// info("Treating position "+chrpos+" with mask "+to_string(mask));
1823-
if(!done[j] && (mask & expj)==expj){sds[j]=sds[j]+(studydata[j][chrpos].beta-means[j])*(studydata[j][chrpos].beta-means[j]);counts[j]++;}
1824-
if(!done[i] && (mask & expi)==expi){sds[i]=sds[i]+(studydata[i][chrpos].beta-means[i])*(studydata[i][chrpos].beta-means[i]);counts[i]++;}
1825-
if((mask & expj)==expj && (mask & expi)==expi){
1826-
// info("Position "+chrpos+" is present in both studies.");
1827-
a.push_back(studydata[i][chrpos].bpval);
1828-
b.push_back(studydata[j][chrpos].bpval);
1829-
1830-
}
1831-
++it;
1832-
}
1833-
done[j]=true;
1834-
done[i]=true;
1835-
1836-
cormat[i][j]=tetrachoric(a,b);
1837-
1838-
}
1839-
info("\t...is "+to_string(cormat[i][j]));
1840-
j++;
1841-
});
1842-
i++;
1843-
});
1844-
studies_correlation sc=studies_correlation();
1845-
sc.add_mask((unsigned short int)3);
1846-
bool v[]={true, true};
1847-
sc.update_mask((unsigned short int)3, v);
1848-
v[1]=false;
1849-
sc.update_mask((unsigned short int)3, v);
1850-
v[0]=false;
1851-
sc.update_mask((unsigned short int)3, v);
1852-
v[0]=true;
1853-
sc.update_mask((unsigned short int)3, v);
1854-
v[1]=true;
1855-
sc.update_mask((unsigned short int)3, v);
1856-
v[0]=false;
1857-
sc.update_mask((unsigned short int)3, v);
1858-
v[1]=false;
1859-
sc.update_mask((unsigned short int)3, v);
1860-
sc.update_mask((unsigned short int)3, v);
1861-
v[0]=true;
1862-
v[1]=true;
1863-
sc.update_mask((unsigned short int)3, v);
1864-
v[0]=false;
1865-
sc.update_mask((unsigned short int)3, v);
1866-
v[1]=true;
1867-
v[0]=true;
1868-
sc.update_mask((unsigned short int)3, v);
1869-
v[0]=false;
1870-
sc.update_mask((unsigned short int)3, v);
1871-
v[1]=false;
1872-
sc.update_mask((unsigned short int)3, v);
1873-
sc.update_mask((unsigned short int)3, v);
1874-
1875-
1876-
1877-
1878-
sc.compute_correlations();
1879-
1880-
sc.write("test_serial");
1881-
1882-
studies_correlation scr=studies_correlation();
1883-
scr.read("test_serial");
1884-
info(scr.getmat((unsigned short int)3));
1885-
*/
1886-
1887-
1888-
1889-
1890-
1891-
1892-
1893-
1894-
1895-
1896-
1897-
1898-
1899-
1900-
1901-
1902-
1903-
1904-
1905-
1906-

0 commit comments

Comments
 (0)