Skip to content

Commit a158c0e

Browse files
committed
Updates on the tables and cross-cutting stage
1 parent cc92aa8 commit a158c0e

File tree

15 files changed

+8197
-8366
lines changed

15 files changed

+8197
-8366
lines changed

src/data/table1.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {renderUrlCell} from "../utils/renderUrlCell";
22

33
export const TABLE_1 = [
44
{
5-
"id": "Mr1",
5+
"id": "MR1",
66
"Phase Category": "Requirement definition",
77
"Subcategory": "Metric selection",
88
"Description": "The goal of the model that is going to be trained should be considered when selecting the most appropriate metrics for assessing the model performance.",
@@ -13,7 +13,7 @@ export const TABLE_1 = [
1313
"External URL(S) in post(s)-link": "Imbalanced learn-Sklearn"
1414
},
1515
{
16-
"id": "Mr2",
16+
"id": "MR2",
1717
"Phase Category": "Requirement definition",
1818
"Subcategory": "Retraining model",
1919
"Description": "It is important to identify the needs/requirements of model retraining.",
@@ -23,7 +23,7 @@ export const TABLE_1 = [
2323
"STE Post(s)-link": "STO/56859324",
2424
},
2525
{
26-
"id": "Mr3",
26+
"id": "MR3",
2727
"Phase Category": "Requirement definition",
2828
"Subcategory": "External services",
2929
"Description": "If an ML model is published as a cloud service, specifically, when a classification/prediction task uses ML cloud-based services, it is important to define the use case and the model requirements in order to identify how frequently the service should be invoked.",
@@ -33,7 +33,7 @@ export const TABLE_1 = [
3333
"STE Post(s)-link": "STO/56859324",
3434
},
3535
{
36-
"id": "Mr4",
36+
"id": "MR4",
3737
"Phase Category": "Requirement definition",
3838
"Subcategory": "Probabilistic model",
3939
"Description": "When using probabilistic forecasting in a decision system, it is necessary to decouple the probabilistic model optimization from the probability threshold selection.",

src/data/table10.ts

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {renderUrlCell} from "../utils/renderUrlCell";
22

33
export const TABLE_10 = [
44
{
5-
"id": "Me1",
5+
"id": "ME1",
66
"Phase Category": "Validation",
77
"Subcategory": "Aspects to consider when validating a model",
88
"Description": "Use adversarial inputs for testing a model in order to ensure robustness.",
@@ -14,7 +14,7 @@ export const TABLE_10 = [
1414
"References to practice terminology-link": "",
1515
},
1616
{
17-
"id": "Me2",
17+
"id": "ME2",
1818
"Phase Category": "Validation",
1919
"Subcategory": "Hyper parameter tuning",
2020
"Description": "After the best hyper-parameters are found, the model with those \“optimum\" hyper-parameters should be tested in the set built for testing.",
@@ -26,7 +26,7 @@ export const TABLE_10 = [
2626
"References to practice terminology-link": "",
2727
},
2828
{
29-
"id": "Me3",
29+
"id": "ME3",
3030
"Phase Category": "Validation",
3131
"Subcategory": "Aspects to consider when validating a model",
3232
"Description": "If a superset vocabulary was built for a natural language processing {NLP} task, then the model bias should be checked.",
@@ -38,7 +38,7 @@ export const TABLE_10 = [
3838
"References to practice terminology-link": "",
3939
},
4040
{
41-
"id": "Me4",
41+
"id": "ME4",
4242
"Phase Category": "Validation",
4343
"Subcategory": "Aspects to consider when validating a model",
4444
"Description": "The comparison between models to identify the best model should be made in the test set.",
@@ -50,7 +50,7 @@ export const TABLE_10 = [
5050
"References to practice terminology-link": "",
5151
},
5252
{
53-
"id": "Me5",
53+
"id": "ME5",
5454
"Phase Category": "Validation",
5555
"Subcategory": "Unit testing",
5656
"Description": "Previously annotated data should be used when testing machine learning models with unit tests.",
@@ -62,7 +62,7 @@ export const TABLE_10 = [
6262
"References to practice terminology-link": "",
6363
},
6464
{
65-
"id": "Me6",
65+
"id": "ME6",
6666
"Phase Category": "Validation",
6767
"Subcategory": "Aspects to consider when validating a model",
6868
"Description": "The performance and the time required to train a model should be taken into account when comparing models",
@@ -75,7 +75,7 @@ export const TABLE_10 = [
7575
"References to practice terminology-link": "",
7676
},
7777
{
78-
"id": "Me7",
78+
"id": "ME7",
7979
"Phase Category": "Validation",
8080
"Subcategory": "Aspects to consider when validating a model",
8181
"Description": "The membership of the testing data to the training data should be tested.",
@@ -87,7 +87,7 @@ export const TABLE_10 = [
8787
"References to practice terminology-link": "",
8888
},
8989
{
90-
"id": "Me8",
90+
"id": "ME8",
9191
"Phase Category": "Validation",
9292
"Subcategory": "Aspects to consider when validating a model",
9393
"Description": "When cross-validation is used for testing, then the hold-out set should not be used.",

src/data/table11.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {renderUrlCell} from "../utils/renderUrlCell";
22

33
export const TABLE_11 = [
44
{
5-
"id": "Md1",
5+
"id": "MD1",
66
"Phase Category": "Deployment",
77
"Subcategory": "-",
88
"Description": "A deployed model should be the one that has the best hyper-parameters, but it should be re-trained with the entire dataset.",
@@ -14,7 +14,7 @@ export const TABLE_11 = [
1414
"References to practice terminology": "-",
1515
},
1616
{
17-
"id": "Md2",
17+
"id": "MD2",
1818
"Phase Category": "Deployment",
1919
"Subcategory": "-",
2020
"Description": "When deploying a model, the entire pipeline should be exported instead of exporting only the model.",

src/data/table12.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {renderUrlCell} from "../utils/renderUrlCell";
22

33
export const TABLE_12 = [
44
{
5-
"id": "Mm1",
5+
"id": "MM1",
66
"Phase Category": "Monitoring",
77
"Subcategory": "Be aware of model performance and new data",
88
"Description": "After deployment, the new data that will serve as input for the model should be constantly monitored to detect any deviation from the original data.",
@@ -14,7 +14,7 @@ export const TABLE_12 = [
1414
"References to practice terminology-link": "Model degradation (Mauri & Damiani), Model degradation (Adam et al.)",
1515
},
1616
{
17-
"id": "Mm2",
17+
"id": "MM2",
1818
"Phase Category": "Monitoring",
1919
"Subcategory": "Be aware of model performance and new data",
2020
"Description": "When monitoring the data distribution, if it deviates from the original one used for training and tuning models, then the model should be retrained in order to avoid degradation.",

src/data/table13.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {renderUrlCell} from "../utils/renderUrlCell";
22

33
export const TABLE_13 = [
44
{
5-
"id": "I1",
5+
"id": "CC1",
66
"Phase Category": "Implementation",
77
"Subcategory": "Reproducibility/replicability",
88
"Description": "For the data cleaning procedure stage, routines should be used to avoid implementing everything from scratch.",
@@ -14,7 +14,7 @@ export const TABLE_13 = [
1414
"References to practice terminology-link": "",
1515
},
1616
{
17-
"id": "I2",
17+
"id": "CC2",
1818
"Phase Category": "Implementation",
1919
"Subcategory": "Reproducibility/replicability",
2020
"Description": "Across the ML stages, pipelines should be used to automate processes and save time in complex tasks.",
@@ -27,7 +27,7 @@ export const TABLE_13 = [
2727
"References to practice terminology-link": "",
2828
},
2929
{
30-
"id": "I3",
30+
"id": "CC3",
3131
"Phase Category": "Implementation",
3232
"Subcategory": "Reproducibility/replicability",
3333
"Description": "In order to enable reproducibility in data pre-processing, pipelines should be used and exported.",
@@ -39,7 +39,7 @@ export const TABLE_13 = [
3939
"References to practice terminology-link": "",
4040
},
4141
{
42-
"id": "I4",
42+
"id": "CC4",
4343
"Phase Category": "Implementation",
4444
"Subcategory": "Documentation/traceability",
4545
"Description": "For each trained model, a separate file should be used in order to keep track of all the possible experiments.",
@@ -51,7 +51,7 @@ export const TABLE_13 = [
5151
"References to practice terminology-link": "",
5252
},
5353
{
54-
"id": "I5",
54+
"id": "CC5",
5555
"Phase Category": "Implementation",
5656
"Subcategory": "Documentation/traceability",
5757
"Description": "The (hyper-)parameters used in the training process should be documented.",
@@ -63,7 +63,7 @@ export const TABLE_13 = [
6363
"References to practice terminology-link": "",
6464
},
6565
{
66-
"id": "I6",
66+
"id": "CC6",
6767
"Phase Category": "Implementation",
6868
"Subcategory": "Consistency/Integrity",
6969
"Description": "Once a model is deployed and a petition of data deletion is made, the data should be deleted across the entire pipeline.",
@@ -76,7 +76,7 @@ export const TABLE_13 = [
7676
"References to practice terminology-link": "",
7777
},
7878
{
79-
"id": "I7",
79+
"id": "CC7",
8080
"Phase Category": "Implementation",
8181
"Subcategory": "Resources usage",
8282
"Description": "When dealing with large datasets or large files, aspects such as parallel executions, GPU usage, and input/output efficiency should be taken into account.",
@@ -89,7 +89,7 @@ export const TABLE_13 = [
8989
"References to practice terminology-link": "",
9090
},
9191
{
92-
"id": "I8",
92+
"id": "CC8",
9393
"Phase Category": "Implementation",
9494
"Subcategory": "Resources usage",
9595
"Description": "In order to optimize, parallel execution should be used if it supported by the algorithms.",
@@ -102,7 +102,7 @@ export const TABLE_13 = [
102102
"References to practice terminology-link": "",
103103
},
104104
{
105-
"id": "I9",
105+
"id": "CC9",
106106
"Phase Category": "Implementation",
107107
"Subcategory": "Resources usage",
108108
"Description": "It should be verified if both, model and dataset, fit in memory.",
@@ -115,7 +115,7 @@ export const TABLE_13 = [
115115
"References to practice terminology-link": "",
116116
},
117117
{
118-
"id": "I10",
118+
"id": "CC10",
119119
"Phase Category": "Implementation",
120120
"Subcategory": "Resources usage",
121121
"Description": "When dealing with large corpus, in NLP-related tasks, sparse structures should be used to improve the implementation performance.",
@@ -127,7 +127,7 @@ export const TABLE_13 = [
127127
"References to practice terminology-link": "",
128128
},
129129
{
130-
"id": "I11",
130+
"id": "CC11",
131131
"Phase Category": "Implementation",
132132
"Subcategory": "Resources usage",
133133
"Description": "When dealing with large data, resource-aware implementations should be used.",

src/data/table2.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {renderUrlCell} from "../utils/renderUrlCell";
22

33
export const TABLE_2 = [
44
{
5-
"id": "Dc1",
5+
"id": "DC1",
66
"Phase Category": "Data",
77
"Subcategory": "Dataset construction",
88
"Description": "The distribution of the training data should reflect the real distribution.",
@@ -16,7 +16,7 @@ export const TABLE_2 = [
1616

1717
},
1818
{
19-
"id": "Dc2",
19+
"id": "DC2",
2020
"Phase Category": "Data",
2121
"Subcategory": "Dataset construction",
2222
"Description": "If a model is expected to detect \"something\" in an image, then representative examples of that \"something\" should be present in the training and testing data.",
@@ -29,7 +29,7 @@ export const TABLE_2 = [
2929

3030
},
3131
{
32-
"id": "Dc3",
32+
"id": "DC3",
3333
"Phase Category": "Data",
3434
"Subcategory": "Ensure minimum size and how to measure the size",
3535
"Description": "When measuring a dataset size it should not be done only by referring to storage space but also in terms of rows and columns.",
@@ -42,7 +42,7 @@ export const TABLE_2 = [
4242
"References to practice terminology-link": "Compute models complexity- Surana, Time complexity analysis- Lee & Chen, Complexity NN- Bianchini & Scarselli",
4343
},
4444
{
45-
"id": "Dc4",
45+
"id": "DC4",
4646
"Phase Category": "Data",
4747
"Subcategory": "Dataset construction",
4848
"Description": "If it is required to augment the number of instances in the negative class, preexisting datasets could be used for including more instances in the dataset.",
@@ -55,7 +55,7 @@ export const TABLE_2 = [
5555
"References to practice terminology-link": "For example: RICO dataset",
5656
},
5757
{
58-
"id": "Dc5",
58+
"id": "DC5",
5959
"Phase Category": "Data",
6060
"Subcategory": "Dataset construction",
6161
"Description": "The images that are going to be used as instances of the negative class should have some common characteristics with the positive ones.",
@@ -68,7 +68,7 @@ export const TABLE_2 = [
6868

6969
},
7070
{
71-
"id": "Dc6",
71+
"id": "DC6",
7272
"Phase Category": "Data",
7373
"Subcategory": "Dataset construction",
7474
"Description": "The minimum size of the object that is going to be detected should be present in the data that is going to be used for training the model.",
@@ -80,7 +80,7 @@ export const TABLE_2 = [
8080

8181
},
8282
{
83-
"id": "Dc7",
83+
"id": "DC7",
8484
"Phase Category": "Data",
8585
"Subcategory": "Dataset construction",
8686
"Description": "The object region of interest (ROI) should have a similar aspect ratio in all the positive images.",

src/data/table3.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {renderUrlCell} from "../utils/renderUrlCell";
22

33
export const TABLE_3 = [
44
{
5-
"id": "Dcl1",
5+
"id": "DCL1",
66
"Phase Category": "Exploratory data analysis",
77
"Subcategory": "Define types of features and dependencies between them",
88
"Description": "For each possible attribute/column in a dataset, we should identify its type before applying any pre-processing technique.",
@@ -15,7 +15,7 @@ export const TABLE_3 = [
1515
"References to practice terminology-link": "Pre-processing techniques- Berthold",
1616
},
1717
{
18-
"id": "Dcl2",
18+
"id": "DCL2",
1919
"Phase Category": "Exploratory data analysis",
2020
"Subcategory": "Define types of features and dependencies between them",
2121
"Description": "It should be determined what variables are dependent and what ones are independent.",
@@ -27,7 +27,7 @@ export const TABLE_3 = [
2727
"References to practice terminology": "-",
2828
},
2929
{
30-
"id": "Dcl3",
30+
"id": "DCL3",
3131
"Phase Category": "Exploratory data analysis",
3232
"Subcategory": "Detect trends, errors and relations in data",
3333
"Description": "When analyzing time series/temporal data, possible temporal trends should be identified.",
@@ -38,7 +38,7 @@ export const TABLE_3 = [
3838
"References to practice terminology": "-",
3939
},
4040
{
41-
"id": "Dcl4",
41+
"id": "DCL4",
4242
"Phase Category": "Exploratory data analysis",
4343
"Subcategory": "Detect trends, errors and relations in data",
4444
"Description": "When analyzing time series/temporal data, weird trends should be identified.",
@@ -49,7 +49,7 @@ export const TABLE_3 = [
4949
"References to practice terminology": "-",
5050
},
5151
{
52-
"id": "Dcl5",
52+
"id": "DCL5",
5353
"Phase Category": "Exploratory data analysis",
5454
"Subcategory": "Detect trends, errors and relations in data",
5555
"Description": "It is needed to identify missing values and their nature before doing any pre-processing.",
@@ -60,7 +60,7 @@ export const TABLE_3 = [
6060
"References to practice terminology": "-",
6161
},
6262
{
63-
"id": "Dcl6",
63+
"id": "DCL6",
6464
"Phase Category": "Exploratory data analysis",
6565
"Subcategory": "Detect trends, errors and relations in data",
6666
"Description": "Before applying any pre-processing technique, potential errors in data should be identified.",
@@ -71,7 +71,7 @@ export const TABLE_3 = [
7171
"References to practice terminology": "-",
7272
},
7373
{
74-
"id": "Dcl7",
74+
"id": "DCL7",
7575
"Phase Category": "Exploratory data analysis",
7676
"Subcategory": "Dataset construction",
7777
"Description": "If correlation between existing features and instances of geographical data are needed, then, geographical data should be clustered.",

0 commit comments

Comments
 (0)