diff --git a/README.md b/README.md index f4075c4..1ec5adf 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ A resource-friendly, highly efficient, and minimal Prometheus exporter to track see a sample of the metrics page in [here](./extra/metrics.txt). ## 🎥 DEMO - + ## 📋 Step-by-Step Guide diff --git a/container_exporter.py b/container_exporter.py index dc201c1..4337306 100755 --- a/container_exporter.py +++ b/container_exporter.py @@ -10,72 +10,61 @@ app = Flask(__name__) -# TO-DO : handle init containers with better storage methods -# TO-DO : modulization - -# Create Prometheus gauge metrics for status and stats +# Create Prometheus gauge metrics container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name']) container_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container cpu usage', ['container_name']) container_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name']) container_memory_bytes_total = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name']) -# Create Prometheus Counter metric for Disk I/O disk_io_read_counter = Counter("cxp_disk_io_read_bytes_total", "Total number of bytes read from disk", ['container_name']) disk_io_write_counter = Counter("cxp_disk_io_write_bytes_total", "Total number of bytes written to disk", ['container_name']) -# Create Prometheus Counter metric for Network I/O network_rx_counter = Counter("cxp_network_rx_bytes_total", "Total number of bytes received over the network", ['container_name']) network_tx_counter = Counter("cxp_network_tx_bytes_total", "Total number of bytes transmitted over the network", ['container_name']) -# get the data that relates to running containers at the first startup -def get_init_container(): - client = docker_env() - return client.containers.list() - -init_containers_names = [c.name for c in get_init_container()] - -# get the data for all containers (killed exited stopped and running) -def get_all_container(): +# get the data for running or not running(unhealthy) containers +def get_containers(all=False): client = docker_env() - return client.containers.list(all=True) + return client.containers.list(all) +init_containers_names = [c.name for c in get_containers()] -def update_container_status(): - # update the running container_names that is offline with the status of all containers - all_containers = get_all_container() - for container in all_containers: +# update container status whether they are running. +def update_container_status(containers): + for container in containers: if container.name in init_containers_names: container_status.labels(container_name=container.name).set(1 if container.status == "running" else 0) elif container.status == "running": container_status.labels(container_name=container.name).set(1) init_containers_names.append(container.name) - for removed_container_name in init_containers_names: - if removed_container_name not in [c.name for c in all_containers]: - container_status.labels(container_name=removed_container_name).set(0) + for container_name in init_containers_names: + if container_name not in [c.name for c in containers]: + container_status.labels(container_name=container_name).set(0) + -# get containers' stats and update their metrics in async mode async def container_stats(): docker = Docker() - containers = await docker.containers.list() - tasks = [stat.get_container_stats(container) for container in containers] - all_stats = await gather(*tasks) - for stats in all_stats: - container_cpu_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_cpu_percentage(stats[0])) - container_memory_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_percentage(stats[0])) - container_memory_bytes_total.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_bytes(stats[0])) - disk_io_read_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[0]) - disk_io_write_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[1]) - network_rx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[0]) - network_tx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[1]) - + try: + containers = await docker.containers.list() + tasks = [stat.get_container_stats(container) for container in containers] + all_stats = await gather(*tasks) + for stats in all_stats: + container_cpu_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_cpu_percentage(stats[0])) + container_memory_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_percentage(stats[0])) + container_memory_bytes_total.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_bytes(stats[0])) + disk_io_read_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[0]) + disk_io_write_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[1]) + network_rx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[0]) + network_tx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[1]) + finally: + await docker.close() metrics_names = [container_cpu_percentage, container_memory_percentage , container_memory_bytes_total , disk_io_read_counter , disk_io_write_counter , network_rx_counter , network_tx_counter ] -def flush_metric_labels(): - all_containers = get_all_container() - for container in all_containers: +def flush_metric_labels(c): + for container in c: if container.status != "running": for m in metrics_names: m.clear() @@ -87,15 +76,15 @@ def index(): @app.route('/metrics') def metrics(): try: - update_container_status() - flush_metric_labels() + all_containers = get_containers(all=True) + update_container_status(all_containers) + flush_metric_labels(all_containers) loop = new_event_loop() t = [loop.create_task(container_stats())] loop.run_until_complete(wait(t)) except Exception as e: return f"Error running script: {str(e)}" - # generate the latest value of metrics return Response(generate_latest(), mimetype='text/plain') def create_app():