|
| 1 | +--- |
| 2 | +description: 'Advanced dashboard in ClickHouse Cloud' |
| 3 | +keywords: ['monitoring', 'observability', 'advanced dashboard', 'dashboard', 'observability |
| 4 | + dashboard'] |
| 5 | +sidebar_label: 'Advanced dashboard' |
| 6 | +sidebar_position: 45 |
| 7 | +slug: /cloud/manage/monitor/advanced-dashboard |
| 8 | +title: 'Advanced dashboard in ClickHouse Cloud' |
| 9 | +--- |
| 10 | + |
| 11 | +import AdvancedDashboard from '@site/static/images/cloud/manage/monitoring/advanced_dashboard.png'; |
| 12 | +import NativeAdvancedDashboard from '@site/static/images/cloud/manage/monitoring/native_advanced_dashboard.png'; |
| 13 | +import EditVisualization from '@site/static/images/cloud/manage/monitoring/edit_visualization.png'; |
| 14 | +import InsertedRowsSec from '@site/static/images/cloud/manage/monitoring/inserted_rows_max_parts_for_partition.png'; |
| 15 | +import ResourceIntensiveQuery from '@site/static/images/cloud/manage/monitoring/resource_intensive_query.png'; |
| 16 | +import SelectedRowsPerSecond from '@site/static/images/cloud/manage/monitoring/selected_rows_sec.png'; |
| 17 | + |
| 18 | +import Image from '@theme/IdealImage'; |
| 19 | + |
| 20 | +Monitoring your database system in a production environment is vital to |
| 21 | +understanding your deployment health so that you can prevent or solve outages. |
| 22 | + |
| 23 | +The advanced dashboard is a lightweight tool designed to give you deep insights |
| 24 | +into your ClickHouse system and its environment, helping you stay ahead of |
| 25 | +performance bottlenecks, system failures, and inefficiencies. |
| 26 | + |
| 27 | +The advanced dashboard is available in both ClickHouse OSS (Open Source Software) |
| 28 | +and Cloud. In this article we will show you how to use the advanced dashboard in |
| 29 | +Cloud. |
| 30 | + |
| 31 | +## Accessing the advanced dashboard {#accessing-the-advanced-dashboard} |
| 32 | + |
| 33 | +The advanced dashboard can be accessed by navigating to: |
| 34 | + |
| 35 | +* Left side panel |
| 36 | + * `Monitoring` → `Advanced dashboard` |
| 37 | + |
| 38 | +<Image img={AdvancedDashboard} size="lg" alt="Advanced dashboard"/> |
| 39 | + |
| 40 | +## Accessing the native advanced dashboard {#accessing-the-native-advanced-dashboard} |
| 41 | + |
| 42 | +The native advanced dashboard can be accessed by navigating to: |
| 43 | + |
| 44 | +* Left side panel |
| 45 | + * `Monitoring` → `Advanced dashboard` |
| 46 | + * Clicking `You can still access the native advanced dashboard.` |
| 47 | + |
| 48 | +This will open the native advanced dashboard in a new tab. You will need to |
| 49 | +authenticate to access the dashboard. |
| 50 | + |
| 51 | +<Image img={NativeAdvancedDashboard} size="lg" alt="Native advanced dashboard"/> |
| 52 | + |
| 53 | +Each visualization has a SQL query associated with it that populates it. You can |
| 54 | +edit this query by clicking on the pen icon. |
| 55 | + |
| 56 | +<Image img={EditVisualization} size="lg" alt="Edit visualization"/> |
| 57 | + |
| 58 | +## Out-of-box visualizations {#out-of-box-visualizations} |
| 59 | + |
| 60 | +The default charts in the Advanced Dashboard are designed to provide real-time |
| 61 | +visibility into your ClickHouse system. Below is a list with descriptions for |
| 62 | +each chart. They are grouped into three categories to help you navigate them. |
| 63 | + |
| 64 | +### ClickHouse specific {#clickhouse-specific} |
| 65 | + |
| 66 | +These metrics are tailored to monitor the health and performance of your |
| 67 | +ClickHouse instance. |
| 68 | + |
| 69 | +| Metric | Description | |
| 70 | +|---------------------------|------------------------------------------------------------------------------------------| |
| 71 | +| Queries Per Second | Tracks the rate of queries being processed | |
| 72 | +| Selected Rows/Sec | Indicates the number of rows being read by queries | |
| 73 | +| Inserted Rows/Sec | Measures the data ingestion rate | |
| 74 | +| Total MergeTree Parts | Shows the number of active parts in MergeTree tables, helping identify unbatched inserts | |
| 75 | +| Max Parts for Partition | Highlights the maximum number of parts in any partition | |
| 76 | +| Queries Running | Displays the number of queries currently executing | |
| 77 | +| Selected Bytes Per Second | Indicates the volume of data being read by queries | |
| 78 | + |
| 79 | +### System health specific {#system-health-specific} |
| 80 | + |
| 81 | +Monitoring the underlying system is just as important as watching ClickHouse itself. |
| 82 | + |
| 83 | +| Metric | Description | |
| 84 | +|---------------------------|---------------------------------------------------------------------------| |
| 85 | +| IO Wait | Tracks I/O wait times | |
| 86 | +| CPU Wait | Measures delays caused by CPU resource contention | |
| 87 | +| Read From Disk | Tracks the number of bytes read from disks or block devices | |
| 88 | +| Read From Filesystem | Tracks the number of bytes read from the filesystem, including page cache | |
| 89 | +| Memory (tracked, bytes) | Shows memory usage for processes tracked by ClickHouse | |
| 90 | +| Load Average (15 minutes) | Reports the 15-minute load average of the system | |
| 91 | +| OS CPU Usage (Userspace) | CPU Usage running userspace code | |
| 92 | +| OS CPU Usage (Kernel) | CPU Usage running kernel code | |
| 93 | + |
| 94 | +### ClickHouse Cloud specific {#clickhouse-cloud-specific} |
| 95 | + |
| 96 | +ClickHouse Cloud stores data using object storage (S3 type). Monitoring this |
| 97 | +interface can help detect issues. |
| 98 | + |
| 99 | +| Metric | Description | |
| 100 | +|--------------------------------|-------------------------------------------------------------| |
| 101 | +| S3 Read wait | Measures the latency of read requests to S3 | |
| 102 | +| S3 read errors per second | Tracks the read errors rate | |
| 103 | +| Read From S3 (bytes/sec) | Tracks the rate data is read from S3 storage | |
| 104 | +| Disk S3 write req/sec | Monitors the frequency of write operations to S3 storage | |
| 105 | +| Disk S3 read req/sec | Monitors the frequency of read operations to S3 storage | |
| 106 | +| Page cache hit rate | The hit rate of the page cache | |
| 107 | +| Filesystem cache hit rate | Hit rate of the filesystem cache | |
| 108 | +| Filesystem cache size | The current size of the filesystem cache | |
| 109 | +| Network send bytes/sec         | Tracks the current speed of outbound network traffic        | |
| 110 | +| Network receive bytes/sec      | Tracks the current speed of incoming network traffic        | |
| 111 | +| Concurrent network connections | Tracks the number of current concurrent network connections | |
| 112 | + |
| 113 | +## Identifying issues with the Advanced dashboard {#identifying-issues-with-the-advanced-dashboard} |
| 114 | + |
| 115 | +Having this real-time view of the health of your ClickHouse service greatly helps |
| 116 | +you mitigate issues before they impact your business, or solve them faster. Below are a |
| 117 | +few issues you can spot using the advanced dashboard. |
| 118 | + |
| 119 | +### Unbatched inserts {#unbatched-inserts} |
| 120 | + |
| 121 | +As described in the [best practices documentation](/best-practices/selecting-an-insert-strategy#batch-inserts-if-synchronous), it is recommended to always |
| 122 | +bulk insert data into ClickHouse if able to do so synchronously. |
| 123 | + |
| 124 | +A bulk insert with a reasonable batch size reduces the number of parts created |
| 125 | +during ingestion, resulting in more efficient writes to disk and fewer merge |
| 126 | +operations. |
| 127 | + |
| 128 | +The key metrics for spotting sub-optimal inserts are **Inserted Rows/sec** and |
| 129 | +**Max Parts for Partition**. |
| 130 | + |
| 131 | +<Image img={InsertedRowsSec} size="lg" alt="Unbatched inserts"/> |
| 132 | + |
| 133 | +The example above shows two spikes in **Inserted Rows/sec** and **Max Parts for Partition** |
| 134 | +between 13h and 14h. This indicates that we ingest data at a reasonable speed. |
| 135 | + |
| 136 | +Then we see another big spike on **Max Parts for Partition** after 16h but a |
| 137 | +very low **Inserted Rows/sec** rate. A lot of parts are being created with |
| 138 | +very little data generated, which indicates that the size of the parts is |
| 139 | +sub-optimal. |
| 140 | + |
| 141 | +### Resource intensive query {#resource-intensive-query} |
| 142 | + |
| 143 | +It is common to run SQL queries that consume a large amount of resources, such as |
| 144 | +CPU or memory. However, it is important to monitor these queries and understand |
| 145 | +their impact on your deployment's overall performance. |
| 146 | + |
| 147 | +A sudden change in resource consumption without a change in query throughput can |
| 148 | +indicate more expensive queries being executed. Depending on the type of queries |
| 149 | +you are running, this can be expected, but it is useful to be able to spot them |
| 150 | +from the advanced dashboard. |
| 151 | + |
| 152 | +Below is an example of CPU usage peaking without significantly changing the |
| 153 | +number of queries per second executed. |
| 154 | + |
| 155 | +<Image img={ResourceIntensiveQuery} size="lg" alt="Resource intensive query"/> |
| 156 | + |
| 157 | +### Bad primary key design {#bad-primary-key-design} |
| 158 | + |
| 159 | +Another issue you can spot using the advanced dashboard is a bad primary key design. |
| 160 | +As described in ["A practical introduction to primary indexes in ClickHouse"](/guides/best-practices/sparse-primary-indexes#a-table-with-a-primary-key), |
| 161 | +choosing a primary key that best fits your use case will greatly improve performance |
| 162 | +by reducing the number of rows ClickHouse needs to read to execute your query. |
| 163 | + |
| 164 | +One of the metrics you can follow to spot potential improvements in primary keys |
| 165 | +is **Selected Rows per second**. A sudden peak in the number of selected rows can |
| 166 | +indicate either a general increase in overall query throughput, or queries that |
| 167 | +read a large number of rows to produce their results. |
| 168 | + |
| 169 | +<Image img={SelectedRowsPerSecond} size="lg" alt="Selected rows per second"/> |
| 170 | + |
| 171 | +Using the timestamp as a filter, you can find the queries executed at the time |
| 172 | +of the peak in the table `system.query_log`. |
| 173 | + |
| 174 | +For example, running a query that shows all the queries executed between 11:20 am |
| 175 | +and 11:30 am on a certain day to understand what queries are reading too many rows: |
| 176 | + |
| 177 | +```sql title="Query" |
| 178 | +SELECT |
| 179 | + type, |
| 180 | + event_time, |
| 181 | + query_duration_ms, |
| 182 | + query, |
| 183 | + read_rows, |
| 184 | + tables |
| 185 | +FROM system.query_log |
| 186 | +WHERE has(databases, 'default') AND (event_time >= '2024-12-23 11:20:00') AND (event_time <= '2024-12-23 11:30:00') AND (type = 'QueryFinish') |
| 187 | +ORDER BY query_duration_ms DESC |
| 188 | +LIMIT 5 |
| 189 | +FORMAT VERTICAL |
| 190 | +``` |
| 191 | + |
| 192 | +```response title="Response" |
| 193 | +Row 1: |
| 194 | +────── |
| 195 | +type: QueryFinish |
| 196 | +event_time: 2024-12-23 11:22:55 |
| 197 | +query_duration_ms: 37407 |
| 198 | +query: SELECT |
| 199 | + toStartOfMonth(review_date) AS month, |
| 200 | + any(product_title), |
| 201 | + avg(star_rating) AS avg_stars |
| 202 | +FROM amazon_reviews_no_pk |
| 203 | +WHERE |
| 204 | + product_category = 'Home' |
| 205 | +GROUP BY |
| 206 | + month, |
| 207 | + product_id |
| 208 | +ORDER BY |
| 209 | + month DESC, |
| 210 | + product_id ASC |
| 211 | +LIMIT 20 |
| 212 | +read_rows: 150957260 |
| 213 | +tables: ['default.amazon_reviews_no_pk'] |
| 214 | +
|
| 215 | +Row 2: |
| 216 | +────── |
| 217 | +type: QueryFinish |
| 218 | +event_time: 2024-12-23 11:26:50 |
| 219 | +query_duration_ms: 7325 |
| 220 | +query: SELECT |
| 221 | + toStartOfMonth(review_date) AS month, |
| 222 | + any(product_title), |
| 223 | + avg(star_rating) AS avg_stars |
| 224 | +FROM amazon_reviews_no_pk |
| 225 | +WHERE |
| 226 | + product_category = 'Home' |
| 227 | +GROUP BY |
| 228 | + month, |
| 229 | + product_id |
| 230 | +ORDER BY |
| 231 | + month DESC, |
| 232 | + product_id ASC |
| 233 | +LIMIT 20 |
| 234 | +read_rows: 150957260 |
| 235 | +tables: ['default.amazon_reviews_no_pk'] |
| 236 | +
|
| 237 | +Row 3: |
| 238 | +────── |
| 239 | +type: QueryFinish |
| 240 | +event_time: 2024-12-23 11:24:10 |
| 241 | +query_duration_ms: 3270 |
| 242 | +query: SELECT |
| 243 | + toStartOfMonth(review_date) AS month, |
| 244 | + any(product_title), |
| 245 | + avg(star_rating) AS avg_stars |
| 246 | +FROM amazon_reviews_pk |
| 247 | +WHERE |
| 248 | + product_category = 'Home' |
| 249 | +GROUP BY |
| 250 | + month, |
| 251 | + product_id |
| 252 | +ORDER BY |
| 253 | + month DESC, |
| 254 | + product_id ASC |
| 255 | +LIMIT 20 |
| 256 | +read_rows: 6242304 |
| 257 | +tables: ['default.amazon_reviews_pk'] |
| 258 | +
|
| 259 | +Row 4: |
| 260 | +────── |
| 261 | +type: QueryFinish |
| 262 | +event_time: 2024-12-23 11:28:10 |
| 263 | +query_duration_ms: 2786 |
| 264 | +query: SELECT |
| 265 | + toStartOfMonth(review_date) AS month, |
| 266 | + any(product_title), |
| 267 | + avg(star_rating) AS avg_stars |
| 268 | +FROM amazon_reviews_pk |
| 269 | +WHERE |
| 270 | + product_category = 'Home' |
| 271 | +GROUP BY |
| 272 | + month, |
| 273 | + product_id |
| 274 | +ORDER BY |
| 275 | + month DESC, |
| 276 | + product_id ASC |
| 277 | +LIMIT 20 |
| 278 | +read_rows: 6242304 |
| 279 | +tables: ['default.amazon_reviews_pk'] |
| 280 | +``` |
| 281 | + |
| 282 | +In this example, we can see the same query being executed against two |
| 283 | +tables `amazon_reviews_no_pk` and `amazon_reviews_pk`. It can be concluded that |
| 284 | +someone was testing a primary key option for the table `amazon_reviews`. |
0 commit comments