Skip to content

Commit 6e3ac2e

Browse files
authored
Merge pull request #226 from RECETOX/fix_eic_splitting
fixed eic splitting and updated tests
2 parents 169b929 + a393e64 commit 6e3ac2e

15 files changed

+182
-134
lines changed

CHANGELOG.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,27 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7-
## [dev] - unreleased
7+
## [0.13.3] - 2024-09-16
8+
### Changed
9+
- Fixed non-working EIC splitting code [#226](https://github.com/RECETOX/recetox-aplcms/pull/226)
10+
11+
## [0.13.2] - 2024-07-30
12+
### Changed
13+
- Moved import order in the hope it will fix the import bugs by @hechth in [#224](https://github.com/RECETOX/recetox-aplcms/pull/224)
14+
15+
16+
## [0.13.0] - 2024-07-29
817
### Added
918
- added tests for mzdata and mzxml formats [#216](https://github.com/RECETOX/recetox-aplcms/pull/216)
19+
1020
### Changed
21+
- feature.alignment function refactored by @KristinaGomoryova in [#221](https://github.com/RECETOX/recetox-aplcms/pull/221)
22+
- remove_noise function updated by @KristinaGomoryova in [#219](https://github.com/RECETOX/recetox-aplcms/pull/219)
23+
- Updated correct time to use precomputed clusters by @zargham-ahmad in [#220](https://github.com/RECETOX/recetox-aplcms/pull/220)
24+
- Fix tests by @hechth in [#222](https://github.com/RECETOX/recetox-aplcms/pull/222)
25+
1126
### Removed
27+
- removed outdated files by @hechth in [#223](https://github.com/RECETOX/recetox-aplcms/pull/223)
1228

1329
## [0.12.0] - 2023-07-10
1430
### Changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: recetox.aplcms
22
Title: Adaptive processing of LC-MS data
3-
Version: 0.13.2
3+
Version: 0.13.3
44
Date: 2024-07-30
55
Authors@R: c(
66
person("Tianwei", "Yu", email = "tianwei.Yu@emory.edu", role = "aut"),

R/remove_noise.R

Lines changed: 18 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -92,24 +92,17 @@ remove_noise <- function(filename,
9292

9393
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_pres & raw.prof$height.rec[, 3] > baseline_correct), 1]
9494

95-
newprof <- newprof[newprof[, 4] %in% run.sel, ]
96-
97-
if (grouping_threshold < Inf) {
98-
sorted_newprof <- newprof[order(newprof[,2]),]
99-
new_grps <- cumsum(c(0, diff(sorted_newprof[,2])) > grouping_threshold)
100-
sorted_newprof <- cbind(sorted_newprof, new_grps, deparse.level = 0)
101-
102-
sorted_newprof_df <- tibble::as_tibble(sorted_newprof)
103-
104-
newprof <- as.matrix(sorted_newprof_df |>
105-
dplyr::group_by(V4, V5) |>
106-
dplyr::mutate(cluster = cur_group_id()) |>
107-
dplyr::ungroup() |>
108-
dplyr::arrange(cluster) |>
109-
dplyr::select(-V4, -V5)
110-
)
111-
colnames(newprof) <- NULL
112-
}
95+
newprof <- as.data.frame(newprof[newprof[, 4] %in% run.sel, ])
96+
colnames(newprof) <- c("mz", "rt", "intensity", "group_number")
97+
98+
newprof <- tibble::tibble(newprof |>
99+
dplyr::group_by(group_number) |>
100+
dplyr::arrange_at("rt") |>
101+
dplyr::mutate(subset_group_number = cumsum(c(0, abs(diff(rt)) > grouping_threshold))) |>
102+
dplyr::group_by(group_number, subset_group_number) |>
103+
dplyr::mutate(grps = cur_group_id()) |>
104+
dplyr::ungroup() |>
105+
dplyr::select(mz, rt, intensity, grps))
113106

114107
new.prof <- run_filter(
115108
newprof,
@@ -128,12 +121,12 @@ remove_noise <- function(filename,
128121
)
129122
}
130123

131-
new_rec_tibble <- tibble::tibble(
132-
mz = new.prof$new_rec[, 1],
133-
rt = new.prof$new_rec[, 2],
134-
intensity = new.prof$new_rec[, 3],
135-
group_number = new.prof$new_rec[, 4]
136-
)
124+
# new_rec_tibble <- tibble::tibble(
125+
# mz = new.prof$new_rec[, 1],
126+
# rt = new.prof$new_rec[, 2],
127+
# intensity = new.prof$new_rec[, 3],
128+
# group_number = new.prof$new_rec[, 4]
129+
# )
137130

138-
return(new_rec_tibble)
131+
return(new.prof)
139132
}

R/run_filter.R

Lines changed: 52 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
#' @return unique_grp.
77
#' @export
88
compute_uniq_grp <- function(profile, min_count_run, min_pres) {
9-
grps <- profile
10-
ttt <- table(grps)
9+
ttt <- table(profile)
1110
ttt <- ttt[ttt >= max(min_count_run * min_pres, 2)]
1211
unique_grp <- as.numeric(names(ttt))
1312
return(unique_grp)
@@ -52,7 +51,7 @@ label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) {
5251

5352
# filtering based on the kernel regression estimate
5453
this_smooth <- predict_smoothed_rt(min_run, this_timeline)
55-
if (max(this_smooth) >= min_pres) {
54+
if (max(this_smooth, na.rm = TRUE) >= min_pres) {
5655
measured_points <- good_points <- timeline
5756
measured_points[this_times] <- 1
5857

@@ -82,63 +81,68 @@ label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) {
8281
run_filter <- function(newprof,
8382
min_pres,
8483
min_run) {
85-
newprof <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensi = newprof[, 3], grps = newprof[, 4])
86-
8784
# ordering retention time values
88-
labels <- newprof$rt
89-
times <- unique(labels)
90-
times <- times[order(times)]
85+
# labels <- newprof$rt
86+
# times <- unique(labels)
87+
# times <- times[order(times)]
9188

92-
for (i in 1:length(times)) {
93-
labels[which(newprof$rt == times[i])] <- i # now labels is the index of time points
94-
}
89+
# for (i in 1:length(times)) {
90+
# labels[which(newprof$rt == times[i])] <- i # now labels is the index of time points
91+
# }
9592

96-
newprof$rt <- labels
93+
# newprof$rt <- labels
94+
newprof <- dplyr::arrange_at(newprof, "rt")
95+
times <- unique(newprof$rt)
9796

9897
# calculates the minimum number of rt points to be considered a peak
99-
min_count_run <- min_run * length(times) / (max(times) - min(times))
100-
min_run <- round(min_count_run)
98+
scan_rate <- 1.0 / abs(median(diff(times)))
99+
min_count_run <- round(min_pres * min_run * scan_rate)
101100

102101
# computes unique groups
103102
uniq_grp <- compute_uniq_grp(newprof$grps, min_count_run, min_pres)
104103

105104
# keep only rows whose grps are in uniq_grp, ordered by grps and mz
106105
newprof <- dplyr::filter(newprof, grps %in% uniq_grp) |> dplyr::arrange(grps, mz)
107106

108-
# computes break points i.e. indices of mass differences greater than min_mz_tol
109-
breaks <- compute_breaks_3(newprof$grps)
110-
111-
# init counters for loop
112-
new_rec <- newprof * 0
113-
rec_pointer <- 1
114-
timeline <- rep(0, length(times))
115-
for (m in 2:length(breaks))
116-
{
117-
this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt")
118-
119-
to_keep <- label_val_to_keep(
120-
min_run,
121-
timeline,
122-
min_pres,
123-
this_prof$rt,
124-
times
125-
)
126-
127-
# operation over selected indices
128-
if (sum(to_keep) > 0) {
129-
this_sel <- which(to_keep == 1)
130-
this_new <- dplyr::slice(this_prof, this_sel)
131-
r_new <- nrow(this_new)
132-
new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new
133-
rec_pointer <- rec_pointer + r_new
134-
}
135-
}
136-
137-
new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1))
138-
new_rec[, 2] <- times[new_rec[, 2]]
139-
140-
results <- new("list")
141-
results$new_rec <- new_rec
107+
results <- dplyr::group_by(newprof, grps) |>
108+
dplyr::filter(n() >= min_count_run && abs(span(rt)) >= min_run) |>
109+
dplyr::ungroup() |>
110+
dplyr::rename(group_number = grps)
111+
112+
# # computes break points i.e. indices of mass differences greater than min_mz_tol
113+
# breaks <- compute_breaks_3(newprof$grps)
114+
115+
# # init counters for loop
116+
# new_rec <- newprof * 0
117+
# rec_pointer <- 1
118+
# timeline <- rep(0, length(times))
119+
# for (m in 2:length(breaks))
120+
# {
121+
# this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt")
122+
123+
# to_keep <- label_val_to_keep(
124+
# min_run,
125+
# timeline,
126+
# min_pres,
127+
# this_prof$rt,
128+
# times
129+
# )
130+
# browser()
131+
# # operation over selected indices
132+
# if (sum(to_keep) > 0) {
133+
# this_sel <- which(to_keep == 1)
134+
# this_new <- dplyr::slice(this_prof, this_sel)
135+
# r_new <- nrow(this_new)
136+
# new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new
137+
# rec_pointer <- rec_pointer + r_new
138+
# }
139+
# }
140+
141+
# new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1))
142+
# new_rec[, 2] <- times[new_rec[, 2]]
143+
144+
# results <- new("list")
145+
# results$new_rec <- new_rec
142146

143147
return(results)
144148
}

R/utils.R

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,18 +110,15 @@ concatenate_feature_tables <- function(features, sample_names) {
110110
}
111111

112112
#' @export
113-
load_aligned_features <- function(metadata_file, intensities_file, rt_file, tol_file) {
113+
load_aligned_features <- function(metadata_file, intensities_file, rt_file) {
114114
metadata <- arrow::read_parquet(metadata_file)
115115
intensities <- arrow::read_parquet(intensities_file)
116116
rt <- arrow::read_parquet(rt_file)
117-
tolerances <- arrow::read_parquet(tol_file)
118117

119118
result <- list()
120119
result$metadata <- as_tibble(metadata)
121120
result$intensity <- as_tibble(intensities)
122121
result$rt <- as_tibble(rt)
123-
result$mz_tol_relative <- tolerances$mz_tolerance
124-
result$rt_tol_relative <- tolerances$rt_tolerance
125122
return(result)
126123
}
127124

tests/testthat/test-adjust-time.R

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@ patrick::with_parameters_test_that(
44
testdata <- file.path("..", "testdata")
55

66
extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")
7-
template_features <- compute_template(extracted)
7+
actual <- compute_template(extracted)
88

9-
expected <- file.path(testdata, "template", "RCX_shortened.parquet")
10-
expected <- arrow::read_parquet(expected)
11-
12-
expect_equal(template_features, expected)
9+
expected_path <- file.path(testdata, "template", "RCX_shortened.parquet")
10+
expected <- arrow::read_parquet(expected_path)
11+
12+
expect_equal(actual, expected)
1313
},
1414
patrick::cases(
1515
RCX_shortened = list(
@@ -32,6 +32,10 @@ patrick::with_parameters_test_that(
3232
correct_time(x, template_features)
3333
})
3434

35+
# for(i in 1:3) {
36+
# arrow::write_parquet(corrected[[i]], file.path(testdata, "adjusted", paste0(files[i], ".parquet")))
37+
# }
38+
3539
expected <- read_parquet_files(files, "adjusted", ".parquet")
3640
expect_equal(corrected, expected, tolerance = 0.01)
3741
},
@@ -56,5 +60,8 @@ test_that("correct_time_v2 is close to correct time", {
5660
expected <- correct_time(clustered_table, template_features)
5761
actual <- correct_time_v2(clustered_table, template_features)
5862

63+
actual <- dplyr::arrange_at(actual, c("cluster", "mz", "area", "rt"))
64+
expected <- dplyr::arrange_at(expected, c("cluster", "mz", "area", "rt"))
65+
5966
expect_equal(actual, expected, tolerance = 0.02)
6067
})

tests/testthat/test-compute_clusters.R

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,18 @@ patrick::with_parameters_test_that(
1515
sample_names = files
1616
)
1717

18+
# for(i in 1:3) {
19+
# arrow::write_parquet(actual$feature_tables[[i]], file.path(testdata, "clusters", paste0(files[i], "_", input ,"_clusters.parquet")))
20+
# }
1821
expected <- read_parquet_files(files, "clusters", paste0("_", input, "_clusters.parquet"))
1922

2023

2124
for(i in seq_along(files)) {
2225
expect_equal(actual$feature_tables[[i]], expected[[i]])
2326
}
2427

25-
expect_equal(actual$mz_tol_relative, expected_mz_tol_relative)
26-
expect_equal(actual$rt_tol_relative, expected_rt_tol_relative)
28+
expect_equal(actual$mz_tol_relative, expected_mz_tol_relative, tolerance=0.1)
29+
expect_equal(actual$rt_tol_relative, expected_rt_tol_relative, tolerance=0.1)
2730

2831
},
2932
patrick::cases(
@@ -32,16 +35,16 @@ patrick::with_parameters_test_that(
3235
input = "extracted",
3336
mz_max_diff = 10 * 1e-05,
3437
mz_tol_absolute = 0.01,
35-
expected_mz_tol_relative = 6.85676325338646e-06,
36-
expected_rt_tol_relative = 3.61858118506494
38+
expected_mz_tol_relative = 6.849039e-06,
39+
expected_rt_tol_relative = 2.894385
3740
),
3841
RCX_shortened_adjusted = list(
3942
files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened"),
4043
input = "adjusted",
4144
mz_max_diff = 10 * 1e-05,
4245
mz_tol_absolute = 0.01,
43-
expected_mz_tol_relative = 6.85676325338646e-06,
44-
expected_rt_tol_relative = 2.17918873407775
46+
expected_mz_tol_relative = 6.856763e-06,
47+
expected_rt_tol_relative = 1.93185408267324
4548
)
4649
)
4750
)

tests/testthat/test-extract_features.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ patrick::with_parameters_test_that(
5353
})
5454

5555
expected <- read_parquet_files(expected_files, "extracted", ".parquet")
56-
5756
expect_equal(actual, expected, tolerance = 0.02)
5857
},
5958
patrick::cases(

tests/testthat/test-feature-align.R

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,11 @@ patrick::with_parameters_test_that(
2929
res$mz_tol_relative,
3030
get_num_workers()
3131
)
32-
33-
aligned_actual$mz_tol_relative <- res$mz_tol_relative
34-
aligned_actual$rt_tol_relative <- res$rt_tol_relative
35-
32+
3633
aligned_expected <- load_aligned_features(
3734
file.path(testdata, "aligned", "metadata_table.parquet"),
3835
file.path(testdata, "aligned", "intensity_table.parquet"),
39-
file.path(testdata, "aligned", "rt_table.parquet"),
40-
file.path(testdata, "aligned", "tolerances.parquet")
36+
file.path(testdata, "aligned", "rt_table.parquet")
4137
)
4238

4339
expect_equal(aligned_actual, aligned_expected)
@@ -75,21 +71,17 @@ patrick::with_parameters_test_that(
7571
aligned_expected <- load_aligned_features(
7672
file.path(testdata, "aligned", "metadata_table.parquet"),
7773
file.path(testdata, "aligned", "intensity_table.parquet"),
78-
file.path(testdata, "aligned", "rt_table.parquet"),
79-
file.path(testdata, "aligned", "tolerances.parquet")
74+
file.path(testdata, "aligned", "rt_table.parquet")
8075
)
8176

82-
aligned_expected["mz_tol_relative"] <- NULL
83-
aligned_expected["rt_tol_relative"] <- NULL
84-
8577
expect_equal(aligned_actual, aligned_expected)
8678
},
8779
patrick::cases(
8880
RCX_shortened = list(
8981
files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened"),
9082
min_occurrence = 2,
91-
mz_tol_relative = 6.85676325338646e-06,
92-
rt_tol_relative = 2.17918873407775
83+
mz_tol_relative = 6.84903911826453e-06,
84+
rt_tol_relative = 1.93185408267324
9385
)
9486
)
9587
)

tests/testthat/test-find_mz_tolerance.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@ test_that("mz tolerance is found", {
1111
do.plot = FALSE
1212
)
1313

14-
expect_equal(mz_tol_relative, 0.01664097, tolerance=0.001)
14+
expect_equal(mz_tol_relative, 0.0135994, tolerance=0.001)
1515
})

0 commit comments

Comments (0)