|
15 | 15 | df_pca: TypeAlias = Annotated[
|
16 | 16 | pd.DataFrame,
|
17 | 17 | """
|
18 |
| - A dataframe of sample metadata, with columns "PC1", "PC2", "PC3", |
19 |
| - etc., added. |
| 18 | + A dataframe of projections along principal components, one row per sample. The columns are: |
| 19 | + `sample_id` is the identifier of the sample, |
| 20 | + `partner_sample_id` is the identifier of the sample used by the partners who contributed it, |
| 21 | + `contributor` is the partner who contributed the sample, |
| 22 | + `country` is the country the sample was collected in, |
| 23 | + `location` is the location the sample was collected in, |
| 24 | + `year` is the year the sample was collected, |
| 25 | + `month` is the month the sample was collected, |
| 26 | + `latitude` is the latitude of the location the sample was collected in, |
| 27 | + `longitude` is the longitude of the location the sample was collected in, |
| 28 | + `sex_call` is the sex of the sample, |
| 29 | + `sample_set` is the sample set containing the sample, |
| 30 | + `release` is the release containing the sample, |
| 31 | + `quarter` is the quarter of the year the sample was collected, |
| 32 | + `study_id` is the identifier of the study the sample set containing the sample came from, |
| 33 | + `study_url` is the URL of the study the sample set containing the sample came from, |
| 34 | + `terms_of_use_expiry_date` is the date the terms of use for the sample expire, |
| 35 | + `terms_of_use_url` is the URL of the terms of use for the sample, |
| 36 | + `unrestricted_use` indicates whether the sample can be used without restrictions (e.g., if the terms of use have expired), |
| 37 | + `mean_cov` is the mean value of the coverage, |
| 38 | + `median_cov` is the median value of the coverage, |
| 39 | + `modal_cov` is the mode of the coverage, |
| 40 | + `mean_cov_2L` is the mean value of the coverage on 2L, |
| 41 | + `median_cov_2L` is the median value of the coverage on 2L, |
| 42 | + `mode_cov_2L` is the mode of the coverage on 2L, |
| 43 | + `mean_cov_2R` is the mean value of the coverage on 2R, |
| 44 | + `median_cov_2R` is the median value of the coverage on 2R, |
| 45 | + `mode_cov_2R` is the mode of the coverage on 2R, |
| 46 | + `mean_cov_3L` is the mean value of the coverage on 3L, |
| 47 | + `median_cov_3L` is the median value of the coverage on 3L, |
| 48 | + `mode_cov_3L` is the mode of the coverage on 3L, |
| 49 | + `mean_cov_3R` is the mean value of the coverage on 3R, |
| 50 | + `median_cov_3R` is the median value of the coverage on 3R, |
| 51 | + `mode_cov_3R` is the mode of the coverage on 3R, |
| 52 | + `mean_cov_X` is the mean value of the coverage on X, |
| 53 | + `median_cov_X` is the median value of the coverage on X, |
| 54 | + `mode_cov_X` is the mode of the coverage on X, |
| 55 | + `frac_gen_cov` is the fraction of the genome covered, |
| 56 | + `divergence` is the divergence, |
| 57 | + `contam_pct` is the percentage of contamination, |
| 58 | + `contam_LLR` is the log-likelihood ratio of contamination, |
| 59 | + `aim_species_fraction_arab` is the fraction of the gambcolu vs. arabiensis AIMs that indicated arabiensis (this column is only present for *Ag3*), |
| 60 | + `aim_species_fraction_colu` is the fraction of the gambiae vs. coluzzii AIMs that indicated coluzzii (this column is only present for *Ag3*), |
| 61 | + `aim_species_fraction_colu_no2l` is the fraction of the gambiae vs. coluzzii AIMs that indicated coluzzii, not including the chromosome arm 2L which contains an introgression (this column is only present for *Ag3*), |
| 62 | + `aim_species_gambcolu_arabiensis` is the taxonomic group assigned by the gambcolu vs. arabiensis AIMs (this column is only present for *Ag3*), |
| 63 | + `aim_species_gambiae_coluzzii` is the taxonomic group assigned by the gambiae vs. coluzzii AIMs (this column is only present for *Ag3*), |
| 64 | + `aim_species` is the taxonomic group assigned by the combination of both AIMs analyses (this column is only present for *Ag3*), |
| 65 | + `country_iso` is the ISO code of the country the sample was collected in, |
| 66 | + `admin1_name` is the name of the first administrative level the sample was collected in, |
| 67 | + `admin1_iso` is the ISO code of the first administrative level the sample was collected in, |
| 68 | + `admin2_name` is the name of the second administrative level the sample was collected in, |
| 69 | + `taxon` is the taxon assigned to the sample by the combination of the AIMs analysis and the cohort analysis, |
| 70 | + `cohort_admin1_year` is the cohort the sample belongs to when samples are grouped by first administrative level and year, |
| 71 | + `cohort_admin1_month` is the cohort the sample belongs to when samples are grouped by first administrative level and month, |
| 72 | + `cohort_admin1_quarter` is the cohort the sample belongs to when samples are grouped by first administrative level and quarter, |
| 73 | + `cohort_admin2_year` is the cohort the sample belongs to when samples are grouped by second administrative level and year, |
| 74 | + `cohort_admin2_month` is the cohort the sample belongs to when samples are grouped by second administrative level and month, |
| 75 | + `cohort_admin2_quarter` is the cohort the sample belongs to when samples are grouped by second administrative level and quarter, |
| 76 | + `PC?` is the projection along principal component ? (? being an integer between 1 and the number of components). There are as many such columns as components, |
| 77 | + `pca_fit` is whether this sample was used for fitting. |
20 | 78 | """,
|
21 | 79 | ]
|
22 | 80 |
|
|
0 commit comments