-
Notifications
You must be signed in to change notification settings - Fork 0
consistent strategy doc strings #447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -13,14 +13,36 @@ use std::any::type_name; | |||||||
| use std::collections::{HashMap, HashSet}; | ||||||||
|
|
||||||||
| #[derive(Debug)] | ||||||||
| /// # Description | ||||||||
| /// | ||||||||
| /// Given a column whose cells contains ages (e.g. subject age, age of death, age of onset) | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please, refer to our types everywhere.
Suggested change
|
||||||||
| /// this strategy converts integer entries to ISO8601 durations: 47 -> P47Y | ||||||||
| /// | ||||||||
| /// NOTE: the integers must be between 0 and 150. | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would:
Suggested change
or maybe leave it out here, since its already explained in the error section. |
||||||||
| /// | ||||||||
| /// If an entry is already in ISO8601 duration format, it will be left unchanged. | ||||||||
| /// | ||||||||
| /// If there are cell values which are neither ISO8601 durations nor integers | ||||||||
| /// # Example | ||||||||
| /// | ||||||||
| /// The table | ||||||||
| /// ```text | ||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think, this is:
Suggested change
|
||||||||
| /// PatientId, age_at_last_encounter | ||||||||
| /// P001, 47 | ||||||||
| /// P002, P56Y12M3D | ||||||||
| /// ``` | ||||||||
| /// | ||||||||
| /// will be mapped to | ||||||||
| /// ```text | ||||||||
| /// PatientId, age_at_last_encounter | ||||||||
| /// P001, P47Y | ||||||||
| /// P002, P56Y12M3D | ||||||||
| /// ``` | ||||||||
| /// | ||||||||
| /// # Errors | ||||||||
| /// | ||||||||
| /// If there are cell values which are neither ISO8601 durations nor integers (between 0 and 150) | ||||||||
| /// an error will be returned. | ||||||||
| /// | ||||||||
| pub struct AgeToIso8601Strategy { | ||||||||
| min_age: i32, | ||||||||
| max_age: i32, | ||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -8,15 +8,47 @@ use polars::datatypes::{DataType, PlSmallStr}; | |||||
| use polars::prelude::{ChunkApply, Column}; | ||||||
| use std::borrow::Cow; | ||||||
|
|
||||||
| /// Given a collection of contextualised dataframes, this strategy will apply all the aliases | ||||||
| /// found in the SeriesContexts. | ||||||
| /// For example if a Contextualised Dataframe has a SeriesContext consisting of a SubjectSex column | ||||||
| /// and a ToString AliasMap which converts "M" to "Male" and "F" to "Female" | ||||||
| /// # Description | ||||||
| /// | ||||||
| /// Given a collection of `ContextualiseDataframes`, this strategy will apply all the aliases | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also here:
Suggested change
|
||||||
| /// found in the `SeriesContexts`. | ||||||
| /// | ||||||
| /// For example if a `ContextualisedDataframe` has a `SeriesContext` consisting of a `subject_sex` column | ||||||
| /// and a `ToString` `AliasMap`, which converts "M" to "Male" and "F" to "Female", | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you mean by ToString? Do you mean the trait ToString? Could be confusing. |
||||||
| /// then the strategy will apply those aliases to each cell. | ||||||
| /// | ||||||
| /// # NOTE | ||||||
| /// - This does not transform the headers of the Dataframe. | ||||||
| /// - Only non-null cells may be aliased. | ||||||
| /// - Non-null cells may be aliased to null. | ||||||
| /// | ||||||
| /// # Example | ||||||
| /// | ||||||
| /// If the `SeriesContext` for the `age_at_last_encounter` column | ||||||
| /// has a `ToInt` `AliasMap` with a single alias `Less than 1 year` -> `0` then the table | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| /// | ||||||
| /// ```text | ||||||
| /// PatientId, age_at_last_encounter | ||||||
| /// P001, 4 | ||||||
| /// P002, Less than 1 year | ||||||
| /// ``` | ||||||
| /// | ||||||
| /// is mapped to | ||||||
| /// ```text | ||||||
| /// PatientId, age_at_last_encounter | ||||||
| /// P001, 4 | ||||||
| /// P002, 0 | ||||||
| /// ``` | ||||||
| /// | ||||||
| /// # Errors | ||||||
| /// | ||||||
| /// Errors will be thrown if: | ||||||
| /// - Any columns to be aliased cannot be cast to String datatype. | ||||||
|
Comment on lines
+39
to
+46
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also here: [ |
||||||
| /// - Once the aliases have been applied, | ||||||
| /// the column cannot be cast to the desired `OutputDataType` of the `AliasMap`. | ||||||
| /// For example if it is a `ToInt` `AliasMap`, yet there remain strings in the column after | ||||||
| /// the aliases have been applied. | ||||||
| /// | ||||||
| #[derive(Debug)] | ||||||
| pub struct AliasMapStrategy; | ||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -21,12 +21,42 @@ use std::collections::{HashMap, HashSet}; | |||||
|
|
||||||
| #[allow(dead_code)] | ||||||
| #[derive(Debug, Default)] | ||||||
| /// # Description | ||||||
| /// | ||||||
| /// This strategy finds columns whose cells contain dates, and converts these dates | ||||||
| /// to a certain age of the patient, by leveraging the patient's date of birth. | ||||||
| /// | ||||||
| /// If there is no data on a certain patient's date of birth, | ||||||
| /// yet there is a date corresponding to this patient, | ||||||
| /// then an error will be thrown. | ||||||
| /// | ||||||
| /// # Example | ||||||
| /// | ||||||
| /// The table | ||||||
| /// | ||||||
| /// ```text | ||||||
| /// PatientId, DOB, TimeAtLastEncounter | ||||||
| /// P001, 1990, 1995 | ||||||
| /// P002, 1992, | ||||||
| /// P003, 2000, 2004 | ||||||
| /// P004,, | ||||||
| /// ``` | ||||||
| /// | ||||||
| /// is mapped to | ||||||
| /// ```text | ||||||
| /// PatientId, DOB, TimeAtLastEncounter | ||||||
| /// P001, 1990, 5 | ||||||
| /// P002, 1992, | ||||||
| /// P003, 2000, 4 | ||||||
| /// P004,, | ||||||
| /// ``` | ||||||
| /// | ||||||
| /// # Errors | ||||||
| /// | ||||||
| /// An error will be thrown if | ||||||
| /// 1. A DOB is before to a date for a patient, leading to a negative age. | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here you use numbers on AliasMapStrategy you used dashes.
Suggested change
or might even link the context type. |
||||||
| /// 2. There exists a date which cannot be converted to an age due to missing DOB data. | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I didn't know. This will be a major annoyance, as soon as we have large datasets.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. For those situations we have a few options:
Probably 2 is best. But I would point out that if you are creating a cohort of Phenopackets, you probably either want just ages in a field, or just dates. To have a mix is strange. So strictness did make sense. And DOB data probably is less likely to be missing than other data. Miraculously the strategy seemed to work with the i_data. |
||||||
| /// | ||||||
| pub struct DateToAgeStrategy; | ||||||
|
|
||||||
| impl Strategy for DateToAgeStrategy { | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,24 +12,54 @@ use std::any::type_name; | |
| use std::collections::HashSet; | ||
| use std::sync::Arc; | ||
|
|
||
| /// # Description | ||
| /// | ||
| /// This strategy will find every column whose context is HpoOrDisease | ||
| /// And split it into two separate columns: a Hpo column and a disease column. | ||
| /// | ||
| /// Hpo is prioritised: the strategy will find all Hpo labels and IDs, and then put them into the | ||
| /// Hpo column. All other cells will be assumed to refer to disease. | ||
| /// | ||
| /// # Fields | ||
| /// | ||
| /// * `hpo_bidict_lib` - This should contain BiDictLibrary for the version of HPO that you want to use. | ||
| /// * `disease_bidict_lib` - All non-HPO cells will be processed by this disease BiDictLibrary. | ||
|
Comment on lines
+23
to
+26
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think Fields is a common section. At least not to the standard. Maybe we should drop it. |
||
| /// | ||
| /// # Example | ||
| /// | ||
| /// The table | ||
| /// | ||
| /// ```text | ||
| /// PatientId, conditions | ||
| /// P001, HP:1234567 | ||
| /// P002, Arachnodactyly | ||
| /// P003, Marfan Syndrome | ||
| /// ``` | ||
| /// is mapped to | ||
| /// | ||
| /// ```text | ||
| /// PatientId, conditions_hpo, conditions_disease | ||
| /// P001, HP:1234567, | ||
| /// P002, Arachnodactyly, | ||
| /// P003,,Marfan Syndrome | ||
| /// ``` | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// A `TransformError::MappingError` will be thrown if any cells in the HpoOrDisease column | ||
| /// are not a label or ID in either the `hpo_bidict_lib` or the `disease_bidict_lib`. | ||
| #[derive(Debug)] | ||
| pub struct HpoDiseaseSplitterStrategy { | ||
| hpo_dict_lib: Arc<BiDictLibrary>, | ||
| disease_dict_lib: Arc<BiDictLibrary>, | ||
| hpo_bidict_lib: Arc<BiDictLibrary>, | ||
| disease_bidict_lib: Arc<BiDictLibrary>, | ||
| } | ||
|
|
||
| impl HpoDiseaseSplitterStrategy { | ||
| #[allow(unused)] | ||
| pub fn new(hpo_dict_lib: Arc<BiDictLibrary>, disease_dict_lib: Arc<BiDictLibrary>) -> Self { | ||
| pub fn new(hpo_bidict_lib: Arc<BiDictLibrary>, disease_bidict_lib: Arc<BiDictLibrary>) -> Self { | ||
| Self { | ||
| hpo_dict_lib, | ||
| disease_dict_lib, | ||
| hpo_bidict_lib, | ||
| disease_bidict_lib, | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -67,10 +97,10 @@ impl Strategy for HpoDiseaseSplitterStrategy { | |
| for hpo_or_disease_opt in hpo_or_disease_col.str()?.iter() { | ||
| match hpo_or_disease_opt { | ||
| Some(hpo_or_disease) => { | ||
| if self.hpo_dict_lib.lookup(hpo_or_disease).is_some() { | ||
| if self.hpo_bidict_lib.lookup(hpo_or_disease).is_some() { | ||
| new_hpo_col_data.push(AnyValue::String(hpo_or_disease)); | ||
| new_disease_col_data.push(AnyValue::Null); | ||
| } else if self.disease_dict_lib.lookup(hpo_or_disease).is_some() { | ||
| } else if self.disease_bidict_lib.lookup(hpo_or_disease).is_some() { | ||
| new_hpo_col_data.push(AnyValue::Null); | ||
| new_disease_col_data.push(AnyValue::String(hpo_or_disease)) | ||
| } else { | ||
|
|
@@ -171,8 +201,8 @@ mod tests { | |
| .unwrap(); | ||
|
|
||
| let strategy = HpoDiseaseSplitterStrategy { | ||
| hpo_dict_lib: Arc::new(BiDictLibrary::new("hpo", vec![Box::new(HPO_DICT.clone())])), | ||
| disease_dict_lib: Arc::new(BiDictLibrary::new( | ||
| hpo_bidict_lib: Arc::new(BiDictLibrary::new("hpo", vec![Box::new(HPO_DICT.clone())])), | ||
| disease_bidict_lib: Arc::new(BiDictLibrary::new( | ||
| "disease", | ||
| vec![Box::new(MONDO_BIDICT.clone())], | ||
| )), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The first line of a doc string will be used in different places as a headline for this class on the doc website of crates.io.
For example:
https://docs.rs/securiety/0.2.8/securiety/curie/index.html
https://github.com/SmartMonkey-git/securiety/blob/c6eeb0b7aac212d331b0876dc6e284500ad1c2ad/src/curie.rs#L3
Can use see how the first line is used?
So, if we keep this we will see something like:
AgeToIso8601Strategy Description
So, the first line should be a single sentence summary.