Skip to content

Commit fa57361

Browse files
authored
DEV: allow to set rustboard file read buffer size via ENV var (#6251)
* Motivation for features / changes #6248 * Technical description of changes As discussed in the original issue, running `rustboard` on a large number of TF event files can result in an OOM because of a large, fixed-size read buffer for each file. Allow configuring the buffer size via the `TB_GCS_BUFFER_SIZE_KB` environment variable (buffer size in KiB). * Detailed steps to verify changes work correctly (as executed by you) ```sh > cargo build > RUST_LOG=debug TB_GCS_BUFFER_SIZE_KB=1024 cargo run -- --logdir=gs://PATH/TO/MANY/FILE --reload-once ``` * Alternate designs / implementations considered Could have been a CLI flag, see #6248 (comment)
1 parent 9aa78bb commit fa57361

File tree

1 file changed

+26
-4
lines changed

1 file changed

+26
-4
lines changed

tensorboard/data/server/gcs/logdir.rs

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
use log::warn;
1919
use reqwest::StatusCode;
2020
use std::collections::HashMap;
21+
use std::env;
2122
use std::io::{self, BufReader, Read};
2223
use std::path::{Path, PathBuf};
2324

@@ -79,24 +80,45 @@ pub struct Logdir {
7980
/// Invariant: `prefix` either is empty or ends with `/`, and thus an event file name should be
8081
/// joined onto `prefix` to form its full object name.
8182
prefix: String,
83+
/// Size (in bytes) of the read buffer used for each file opened from GCS.
84+
/// `gcs::Logdir::new` attempts to read the `TB_GCS_BUFFER_SIZE_KB` environment
85+
/// variable, which represents the read buffer size (in KiB) for each TF events file.
86+
/// Note: if reading a large number of TF events files, set an appropriate value for
87+
/// `buffer_capacity` to prevent running out of memory. This determines the total size of the
88+
/// allocated memory.
89+
/// The default value is defined by the `DEFAULT_BUFFER_CAPACITY_KB` constant.
90+
buffer_capacity: usize,
8291
}
8392

93+
/// Default size of the GCS file read buffer (in KiB).
94+
/// Read large chunks from GCS to reduce network roundtrips.
95+
const DEFAULT_BUFFER_CAPACITY_KB: usize = 1024 * 16;
96+
8497
impl Logdir {
8598
pub fn new(gcs: Client, bucket: String, mut prefix: String) -> Self {
8699
if !prefix.is_empty() && !prefix.ends_with('/') {
87100
prefix.push('/');
88101
}
102+
// convert the Kb buffer size to bytes
103+
let buffer_capacity = match env::var("TB_GCS_BUFFER_SIZE_KB") {
104+
Ok(val) => {
105+
val.parse::<usize>()
106+
.ok()
107+
.unwrap_or(DEFAULT_BUFFER_CAPACITY_KB)
108+
* 1024
109+
}
110+
Err(_) => DEFAULT_BUFFER_CAPACITY_KB * 1024,
111+
};
112+
89113
Self {
90114
gcs,
91115
bucket,
92116
prefix,
117+
buffer_capacity,
93118
}
94119
}
95120
}
96121

97-
/// Read large chunks from GCS to reduce network roundtrips.
98-
const BUFFER_CAPACITY: usize = 1024 * 1024 * 16;
99-
100122
impl crate::logdir::Logdir for Logdir {
101123
type File = BufReader<File>;
102124

@@ -140,6 +162,6 @@ impl crate::logdir::Logdir for Logdir {
140162
let mut object = self.prefix.clone();
141163
object.push_str(path.0.to_string_lossy().as_ref());
142164
let file = File::new(self.gcs.clone(), self.bucket.clone(), object);
143-
Ok(BufReader::with_capacity(BUFFER_CAPACITY, file))
165+
Ok(BufReader::with_capacity(self.buffer_capacity, file))
144166
}
145167
}

0 commit comments

Comments
 (0)