Skip to content

Commit 8576047

Browse files
authored
Merge pull request #116 from IBM/schema-updates
Add environment variables to Chart for Timeout and No Compression
2 parents 30daae8 + 924aea3 commit 8576047

File tree

12 files changed

+87
-33
lines changed

12 files changed

+87
-33
lines changed

Cargo.lock

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

FAQ.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
- [Why is my core dump truncated?](#why-is-my-core-dump-truncated)
66

7+
- [Why is my zip file corrupted?](#why-is-my-zip-file-corrupted)
8+
79
- [Why is my log file exactly half of my configured line count?](#why-is-my-log-file-exactly-half-of-my-configured-line-count)
810

911
- [Can I force an upload?](#can-i-force-an-upload)
@@ -12,6 +14,8 @@
1214

1315
- [How do I use the custom endpoint?](#how-do-i-use-the-custom-endpoint)
1416

17+
- [Why am I getting the wrong container info?](#why-am-i-getting-the-wrong-container-info)
18+
1519
## How should I integrate my own uploader?
1620

1721
The core dump handler is designed to quickly move the cores *"off-box"* to an object storage environment with as much additional runtime information as possible.
@@ -73,6 +77,14 @@ terminationGracePeriodSeconds: 120
7377
```
7478
Also see [Kubernetes best practices: terminating with grace](https://cloud.google.com/blog/products/containers-kubernetes/kubernetes-best-practices-terminating-with-grace)
7579

80+
## Why is my zip file corrupted?
81+
82+
As of v8.7.0 there is now a timer on the core dump to prevent repeated hanging core dumps from taking down the system.
83+
For very large core dumps this means the process can be truncated and the zipfile incomplete.
84+
85+
In v8.8.0 we have added the nocompression option to the zip process to improve performance, and you can increase the timeout, which currently defaults to 10 minutes.
86+
87+
7688
## Why is my log file exactly half of my configured line count?
7789

7890
This appears to be a bug in some kubernetes services.
@@ -134,3 +146,9 @@ extraEnvVars: |
134146
- name: S3_ENDPOINT
135147
value: https://the-endpoint
136148
```
149+
150+
## Why am I getting the wrong container info?
151+
152+
Core dump handler tries to find the container information for the crashing process based on the hostname of the pod. This works fine in most scenarios, but it can return the wrong container information when pods are created directly in multiple namespaces or the same Statefulsets are created in the same namespaces.
153+
154+
The current recommendation is to create a unique name in both of those scenarios. [See issue 115](https://github.com/IBM/core-dump-handler/issues/115)

charts/core-dump-handler/README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,14 @@ The agent pod has the following environment variables and these are all set by t
182182
"img" (Default): This is the value most crictls expect.
183183
"images": Digital Ocean, Newer OpenShift require this value
184184
185+
* COMP_TIMEOUT - The timeout for the composer in seconds. Defaults to 600.
186+
187+
In testing, processing took roughly 3 mins per 512Mb, so we have set the default to 10 mins.
188+
189+
* COMP_COMPRESSION - Enable compression Default: true
190+
191+
Given the amount of time compression takes, there is an option to disable it.
192+
185193
* CRIO_ENDPOINT - The CRIO endpoint to use.
186194
187195
"unix:///run/containerd/containerd.sock" (Default): This is the default for most containerd nodes
@@ -252,7 +260,9 @@ Composer
252260
* logLevel: The log level for the composer (Default "Warn")
253261
* ignoreCrio: Maps to the COMP_IGNORE_CRIO environment variable (Default false)
254262
* crioImageCmd: Maps to the COMP_CRIO_IMAGE_CMD environment variable (Default "img")
255-
* filenameTemplate: Maps to COMP_FILENAME_TEMPLATE environment variable
263+
* timeout: Maps to the COMP_TIMEOUT environment variable (Default 600)
264+
* compression: Maps to the COMP_COMPRESSION environment variable (Default "true")
265+
* filenameTemplate: Maps to COMP_FILENAME_TEMPLATE environment variable
256266
(Default {{uuid}}-dump-{{timestamp}}-{{hostname}}-{{exe_name}}-{{pid}}-{{signal}})
257267
258268
Possible Values:

charts/core-dump-handler/templates/daemonset.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ spec:
4747
value: {{ .Values.composer.crioImageCmd }}
4848
- name: COMP_POD_SELECTOR_LABEL
4949
value: {{ .Values.composer.podSelectorLabel }}
50+
- name: COMP_TIMEOUT
51+
value: {{ .Values.composer.timeout | quote }}
52+
- name: COMP_COMPRESSION
53+
value: {{ .Values.composer.compression | quote }}
5054
- name: DEPLOY_CRIO_CONFIG
5155
value: {{ .Values.daemonset.deployCrioConfig | quote }}
5256
- name: CRIO_ENDPOINT

charts/core-dump-handler/values.schema.json

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,14 +115,23 @@
115115
},
116116
"podSelectorLabel": {
117117
"type": "string"
118+
},
119+
"timeout": {
120+
"type": "integer",
121+
"minimum": 120
122+
},
123+
"compression": {
124+
"type": "boolean"
118125
}
119126
},
120127
"required": [
121128
"crioImageCmd",
122129
"ignoreCrio",
123130
"logLevel",
124131
"logLength",
125-
"filenameTemplate"
132+
"filenameTemplate",
133+
"timeout",
134+
"compression"
126135
],
127136
"title": "Composer"
128137
},
@@ -183,7 +192,7 @@
183192
"hostContainerRuntimeEndpoint"
184193
]
185194
}
186-
}
195+
}
187196
],
188197
"properties": {
189198
"name": {
@@ -316,4 +325,4 @@
316325
"title": "ServiceAccount"
317326
}
318327
}
319-
}
328+
}

charts/core-dump-handler/values.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ replicaCount: 1
33
image:
44
registry: quay.io
55
repository: icdh/core-dump-handler
6-
tag: v8.7.0
6+
tag: schema-updates
77
pullPolicy: Always
88
pullSecrets: []
99
request_mem: "64Mi"
@@ -27,6 +27,8 @@ composer:
2727
filenameTemplate: "{uuid}-dump-{timestamp}-{hostname}-{exe_name}-{pid}-{signal}"
2828
logLength: 500
2929
podSelectorLabel: ""
30+
timeout: 600
31+
compression: true
3032

3133
daemonset:
3234
name: "core-dump-handler"

core-dump-agent/src/main.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ async fn main() -> Result<(), anyhow::Error> {
288288
async fn process_file(zip_path: &Path, bucket: &Bucket) {
289289
info!("Uploading: {}", zip_path.display());
290290

291-
let f = File::open(&zip_path).expect("no file found");
291+
let f = File::open(zip_path).expect("no file found");
292292

293293
match f.try_lock(FileLockMode::Shared) {
294294
Ok(_) => { /* If we can lock then we are ok */ }
@@ -305,7 +305,7 @@ async fn process_file(zip_path: &Path, bucket: &Bucket) {
305305
}
306306
}
307307

308-
let metadata = fs::metadata(&zip_path).expect("unable to read metadata");
308+
let metadata = fs::metadata(zip_path).expect("unable to read metadata");
309309
info!("zip size is {}", metadata.len());
310310
let path_str = match zip_path.to_str() {
311311
Some(v) => v,
@@ -473,11 +473,15 @@ fn create_env_file(host_location: &str) -> Result<(), std::io::Error> {
473473
});
474474
let log_length = env::var("LOG_LENGTH").unwrap_or_else(|_| "500".to_string());
475475
let pod_selector_label = env::var("COMP_POD_SELECTOR_LABEL").unwrap_or_default();
476+
let timeout = env::var("COMP_TIMEOUT").unwrap_or_else(|_| "600".to_string());
477+
let compression = env::var("COMP_COMPRESSION")
478+
.unwrap_or_else(|_| "true".to_string())
479+
.to_lowercase();
476480
info!("Creating {} file with LOG_LEVEL={}", destination, loglevel);
477481
let mut env_file = File::create(destination)?;
478482
let text = format!(
479-
"LOG_LEVEL={}\nIGNORE_CRIO={}\nCRIO_IMAGE_CMD={}\nUSE_CRIO_CONF={}\nFILENAME_TEMPLATE={}\nLOG_LENGTH={}\nPOD_SELECTOR_LABEL={}\n",
480-
loglevel, ignore_crio, crio_image, use_crio_config, filename_template, log_length, pod_selector_label
483+
"LOG_LEVEL={}\nIGNORE_CRIO={}\nCRIO_IMAGE_CMD={}\nUSE_CRIO_CONF={}\nFILENAME_TEMPLATE={}\nLOG_LENGTH={}\nPOD_SELECTOR_LABEL={}\nTIMEOUT={}\nCOMPRESSION={}\n",
484+
loglevel, ignore_crio, crio_image, use_crio_config, filename_template, log_length, pod_selector_label, timeout, compression
481485
);
482486
info!("Writing composer .env \n{}", text);
483487
env_file.write_all(text.as_bytes())?;
@@ -496,7 +500,7 @@ fn get_sysctl(name: &str) -> Result<String, anyhow::Error> {
496500
info!("Getting sysctl for {}", name);
497501
let output = Command::new("sysctl")
498502
.env("PATH", get_path())
499-
.args(&["-n", name])
503+
.args(["-n", name])
500504
.output()?;
501505
let lines = String::from_utf8(output.stdout)?;
502506
let line = lines.lines().take(1).next().unwrap_or("");
@@ -522,7 +526,7 @@ fn overwrite_sysctl(name: &str, value: &str) -> Result<(), anyhow::Error> {
522526
let s = format!("{}={}", name, value);
523527
let output = Command::new("sysctl")
524528
.env("PATH", get_path())
525-
.args(&["-w", s.as_str()])
529+
.args(["-w", s.as_str()])
526530
.status()?;
527531
if !output.success() {
528532
let e = Error::InvalidOverWrite {

core-dump-agent/tests/basic.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ fn basic() -> Result<(), std::io::Error> {
9595
"FILENAME_TEMPLATE={uuid}-dump-{timestamp}-{hostname}-{exe_name}-{pid}-{signal}"
9696
));
9797
assert!(env_content.contains("LOG_LENGTH=500"));
98-
assert_eq!(env_content.lines().count(), 7);
98+
assert_eq!(env_content.lines().count(), 9);
9999
//TODO: [No9] Test uploading of a corefile
100100
//TODO: [No9] Test remove option
101101
//TODO: [No9] Test sweep option

core-dump-composer/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ uuid = { version = "1.1.0", features = ["serde", "v4"] }
1313
zip = "0.6.2"
1414
dotenv = "0.15.0"
1515
log = "0.4.14"
16-
log4rs = { git = "https://github.com/No9/log4rs/", branch = "typemap-ors-fix" }
16+
log4rs = "1.2.0"
1717
anyhow = "1.0.53"
1818
serde_json = "1.0.76"
1919
serde = { version = "1.0.134", features = ["derive"] }

core-dump-composer/src/config.rs

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,13 @@ pub struct CoreConfig {
2121
pub pod_selector_label: String,
2222
pub use_crio_config: bool,
2323
pub ignore_crio: bool,
24+
pub timeout: u32,
25+
pub compression: bool,
2426
pub image_command: ImageCommand,
2527
pub bin_path: String,
2628
pub os_hostname: String,
2729
pub filename_template: String,
2830
pub params: CoreParams,
29-
pub disable_compression: bool,
3031
}
3132

3233
#[derive(Serialize)]
@@ -39,7 +40,6 @@ pub struct CoreParams {
3940
pub directory: String,
4041
pub hostname: String,
4142
pub pathname: String,
42-
pub timeout: u64,
4343
pub namespace: Option<String>,
4444
pub podname: Option<String>,
4545
pub uuid: Uuid,
@@ -58,12 +58,12 @@ impl CoreConfig {
5858
let directory = matches.value_of("directory").unwrap_or("").to_string();
5959
let hostname = matches.value_of("hostname").unwrap_or("").to_string();
6060
let pathname = matches.value_of("pathname").unwrap_or("").to_string();
61-
let timeout = matches
62-
.value_of("timeout")
63-
.unwrap_or("600")
64-
.parse::<u64>()
65-
.unwrap();
66-
let disable_compression = matches.contains_id("disable-compression");
61+
// let timeout = matches
62+
// .value_of("timeout")
63+
// .unwrap_or("600")
64+
// .parse::<u64>()
65+
// .unwrap();
66+
// let disable_compression = matches.contains_id("disable-compression");
6767

6868
let uuid = Uuid::new_v4();
6969

@@ -76,7 +76,6 @@ impl CoreConfig {
7676
directory,
7777
hostname,
7878
pathname,
79-
timeout,
8079
namespace: None,
8180
podname: None,
8281
uuid,
@@ -112,6 +111,14 @@ impl CoreConfig {
112111
.unwrap_or_else(|_| "false".to_string().to_lowercase())
113112
.parse::<bool>()
114113
.unwrap();
114+
let compression = env::var("COMPRESSION")
115+
.unwrap_or_else(|_| "true".to_string().to_lowercase())
116+
.parse::<bool>()
117+
.unwrap();
118+
let timeout = env::var("TIMEOUT")
119+
.unwrap_or_else(|_| "600".to_string())
120+
.parse::<u32>()
121+
.unwrap();
115122
let os_hostname = hostname::get()
116123
.unwrap_or_else(|_| OsString::from_str("unknown").unwrap_or_default())
117124
.into_string()
@@ -146,7 +153,8 @@ impl CoreConfig {
146153
filename_template,
147154
log_length,
148155
params,
149-
disable_compression,
156+
compression,
157+
timeout,
150158
})
151159
}
152160

0 commit comments

Comments
 (0)