Skip to content

Commit 00c164f

Browse files
committed
Add feature to display datasource breakdown
This can specifically be used to find unused data sources.
1 parent 54828b6 commit 00c164f

File tree

8 files changed

+239
-7
lines changed

8 files changed

+239
-7
lines changed

CHANGES.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,14 @@ grafana-wtf changelog
55

66
in progress
77
===========
8-
- Upgrade to ``colored==1.4.3``
8+
9+
2021-12-10 0.11.0
10+
=================
11+
- Upgrade to ``colored==1.4.3``. Thanks, @dslackw!
912
- Tests: Use ``.env`` file for propagating environment variables to Docker Compose
1013
- CI/GHA test matrix: Use Grafana 7.5.11 and 8.3.1 and add Python 3.10
14+
- Add feature to display datasource breakdown, specifically for finding unused
15+
data sources. Thanks, @chenlujjj!
1116

1217
2021-10-01 0.10.0
1318
=================

grafana_wtf/commands.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from grafana_wtf.core import GrafanaSearch
1515
from grafana_wtf.report import WtfReport
1616
from grafana_wtf.tabular_report import TabularReport
17-
from grafana_wtf.util import normalize_options, setup_logging, configure_http_logging, read_list
17+
from grafana_wtf.util import normalize_options, setup_logging, configure_http_logging, read_list, yaml_dump
1818

1919
log = logging.getLogger(__name__)
2020

@@ -25,6 +25,7 @@ def run():
2525
grafana-wtf [options] find [<search-expression>]
2626
grafana-wtf [options] replace <search-expression> <replacement>
2727
grafana-wtf [options] log [<dashboard_uid>] [--number=<count>]
28+
grafana-wtf [options] datasource-breakdown
2829
grafana-wtf --version
2930
grafana-wtf (-h | --help)
3031
@@ -92,6 +93,13 @@ def run():
9293
# Output full history table in Grid format
9394
grafana-wtf log --format=tabular:grid
9495
96+
Breakdown examples:
97+
98+
# Display all data sources and the dashboards using them, as well as unused data sources.
99+
grafana-wtf datasource-breakdown --format=yaml
100+
101+
# Display names of unused datasources as a flat list.
102+
grafana-wtf datasource-breakdown --format=json | jq -r '.unused[].datasource.name'
95103
96104
"""
97105

@@ -186,6 +194,24 @@ def run():
186194

187195
print(output)
188196

197+
if options.datasource_breakdown:
198+
results = engine.datasource_breakdown()
199+
200+
unused_count = len(results["unused"])
201+
if unused_count:
202+
log.warning(f"Found {unused_count} unused data source(s)")
203+
204+
if output_format == "json":
205+
output = json.dumps(results, indent=4)
206+
207+
elif output_format == "yaml":
208+
output = yaml_dump(results)
209+
210+
else:
211+
raise ValueError(f"Unknown output format \"{output_format}\"")
212+
213+
print(output)
214+
189215

190216
def get_table_format(output_format):
191217
tablefmt = None

grafana_wtf/core.py

Lines changed: 108 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# (c) 2019 Andreas Motl <[email protected]>
33
# License: GNU Affero General Public License, Version 3
44
import json
5+
from pprint import pprint
6+
57
import colored
68
import logging
79
import asyncio
@@ -13,6 +15,7 @@
1315
from urllib.parse import urlparse, urljoin
1416
from concurrent.futures.thread import ThreadPoolExecutor
1517

18+
from grafana_wtf.model import DatasourceBreakdownItem
1619
from grafana_wtf.monkey import monkeypatch_grafana_api
1720
# Apply monkeypatch to grafana-api
1821
# https://github.com/m0nhawk/grafana_api/pull/85/files
@@ -61,23 +64,30 @@ def clear_cache(self):
6164
def enable_concurrency(self, concurrency):
6265
self.concurrency = concurrency
6366

64-
def setup(self):
65-
url = urlparse(self.grafana_url)
67+
@staticmethod
def grafana_client_factory(grafana_url, grafana_token=None):
    """
    Create a ``GrafanaFace`` API client for the given Grafana URL.

    When ``grafana_token`` is given, authenticate with the API key.
    Otherwise, fall back to HTTP basic authentication, using credentials
    embedded into the URL, or the ``admin``/``admin`` defaults.
    """
    url = urlparse(grafana_url)

    if grafana_token:
        # Grafana API key authentication.
        auth = grafana_token
    else:
        # HTTP basic authentication.
        username = url.username or 'admin'
        password = url.password or 'admin'
        auth = (username, password)

    return GrafanaFace(
        auth, protocol=url.scheme,
        host=url.hostname, port=url.port, url_path_prefix=url.path.lstrip('/'))
86+
87+
def setup(self):
88+
89+
self.grafana = self.grafana_client_factory(self.grafana_url, grafana_token=self.grafana_token)
90+
8191
# Configure a larger HTTP request pool.
8292
# Todo: Review the pool settings and eventually adjust according to concurrency level or other parameters.
8393
# https://urllib3.readthedocs.io/en/latest/advanced-usage.html#customizing-pool-behavior
@@ -175,6 +185,7 @@ def scan_datasources(self):
175185
try:
176186
self.data.datasources = munchify(self.grafana.datasource.list_datasources())
177187
log.info('Found {} data sources'.format(len(self.data.datasources)))
188+
return self.data.datasources
178189
except GrafanaClientError as ex:
179190
message = '{name}: {ex}'.format(name=ex.__class__.__name__, ex=ex)
180191
log.error(self.get_red_message(message))
@@ -218,6 +229,8 @@ def scan_dashboards(self, dashboard_uids=None):
218229
if self.progressbar:
219230
self.taqadum.close()
220231

232+
return self.data.dashboards
233+
221234
def handle_grafana_error(self, ex):
222235
message = '{name}: {ex}'.format(name=ex.__class__.__name__, ex=ex)
223236
message = colored.stylize(message, colored.fg("red") + colored.attr("bold"))
@@ -272,3 +285,93 @@ def get_dashboard_versions(self, dashboard_id):
272285
get_dashboard_versions_path = '/dashboards/id/%s/versions' % dashboard_id
273286
r = self.grafana.dashboard.api.GET(get_dashboard_versions_path)
274287
return r
288+
289+
def datasource_breakdown(self):
    """
    Associate data sources with the dashboards using them.

    Returns an ordered mapping with two entries: ``used`` holds compact
    breakdown items for data sources referenced by at least one dashboard,
    ``unused`` holds items for data sources referenced by none.
    """

    # Build lookup tables, mapping dashboards by uid, datasources by name,
    # as well as the cross-references between dashboards and datasources.
    indexer = Indexer(engine=self)

    # Associate each datasource with the dashboards that use it.
    used = []
    unused = []
    for datasource_name in sorted(indexer.datasource_by_name):
        datasource = indexer.datasource_by_name[datasource_name]
        dashboard_uids = indexer.datasource_dashboard_index.get(datasource_name, [])
        dashboards = [indexer.dashboard_by_uid.get(uid) for uid in dashboard_uids]
        breakdown_item = DatasourceBreakdownItem(
            datasource=datasource, used_in=dashboards, grafana_url=self.grafana_url)

        # Condense the item into a compact form, using only a subset of all the attributes.
        entry = breakdown_item.format_compact()

        # File the entry under "used" vs. "unused", depending on whether
        # any dashboard references the datasource.
        target = used if dashboard_uids else unused
        target.append(entry)

    return OrderedDict(used=used, unused=unused)
318+
319+
320+
class Indexer:
    """
    Build lookup tables over all Grafana dashboards and data sources.

    Indexes dashboards by uid and data sources by name, and computes the
    bidirectional mapping between dashboards and the data sources they
    reference (panels, annotations, and templating variables).
    """

    def __init__(self, engine: "GrafanaSearch"):
        self.engine = engine

        # Prepare index data structures.
        self.dashboard_by_uid = {}
        self.datasource_by_name = {}
        self.dashboard_datasource_index = {}
        self.datasource_dashboard_index = {}

        # Gather all data from the Grafana instance.
        self.dashboards = self.engine.scan_dashboards()
        self.datasources = self.engine.scan_datasources()

        # Invoke indexer.
        self.index()

    def index(self):
        """Populate all index data structures."""
        self.index_dashboards()
        self.index_datasources()

    @staticmethod
    def collect_datasource_names(root):
        """Return the distinct, truthy ``datasource`` attributes of the given items."""
        # NOTE(review): assumes each item's `datasource` attribute is a plain
        # (hashable) name; confirm against the Grafana payloads in use.
        return list({item.datasource for item in root if item.datasource})

    def index_dashboards(self):
        """
        Index dashboards by uid, and map each dashboard uid to the sorted
        list of data source names it references.
        """

        self.dashboard_by_uid = {}
        self.dashboard_datasource_index = {}

        for dashboard in self.dashboards:

            # Folders appear in scan results, but carry no panels; skip them.
            if dashboard.meta.isFolder:
                continue

            # Index by uid.
            uid = dashboard.dashboard.uid
            self.dashboard_by_uid[uid] = dashboard

            # Collect data source names from panels, annotations and templating.
            ds_panels = self.collect_datasource_names(dashboard.dashboard.panels)
            ds_annotations = self.collect_datasource_names(dashboard.dashboard.annotations.list)
            ds_templating = self.collect_datasource_names(dashboard.dashboard.templating.list)
            self.dashboard_datasource_index[uid] = sorted(set(ds_panels + ds_annotations + ds_templating))

    def index_datasources(self):
        """
        Index data sources by name, and invert the dashboard→datasource
        mapping into a datasource→dashboard-uids mapping.
        """

        self.datasource_by_name = {}
        self.datasource_dashboard_index = {}

        for datasource in self.datasources:
            self.datasource_by_name[datasource.name] = datasource

        for dashboard_uid, datasource_names in self.dashboard_datasource_index.items():
            for datasource_name in datasource_names:
                self.datasource_dashboard_index.setdefault(datasource_name, []).append(dashboard_uid)

grafana_wtf/model.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import dataclasses
2+
from typing import List
3+
4+
from munch import Munch
5+
from collections import OrderedDict
6+
from urllib.parse import urljoin
7+
8+
9+
@dataclasses.dataclass
class DatasourceBreakdownItem:
    """
    Associate a single data source with the dashboards that use it.

    ``format_compact`` condenses the item into a plain mapping with a
    subset of all the attributes, suitable for JSON/YAML serialization.
    """

    # Full data source record, as obtained from the Grafana API.
    datasource: Munch
    # Dashboard records referencing this data source.
    used_in: List[Munch]
    # Base URL of the Grafana instance, used to derive absolute dashboard URLs.
    grafana_url: str

    def format_compact(self):
        """Render the item in a more compact form, using only a subset of all the attributes."""
        compact = OrderedDict(
            datasource=OrderedDict(
                name=self.datasource.name,
                type=self.datasource.type,
                url=self.datasource.url,
            )
        )
        # The "dashboards" key is only present when the data source is used at all.
        for dashboard in self.used_in:
            entry = OrderedDict(
                title=dashboard.dashboard.title,
                uid=dashboard.dashboard.uid,
                path=dashboard.meta.url,
                url=urljoin(self.grafana_url, dashboard.meta.url),
            )
            compact.setdefault("dashboards", []).append(entry)
        return compact

grafana_wtf/util.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import sys
55
import json
66
import logging
7+
from collections import OrderedDict
8+
9+
import yaml
710
from munch import munchify
811
from jsonpath_rw import parse
912
from pygments import highlight
@@ -106,3 +109,22 @@ def find(self, needle, haystack):
106109
def prettify_json(data):
107110
json_str = json.dumps(data, indent=4)
108111
return highlight(json_str, JsonLexer(), TerminalFormatter())
112+
113+
114+
def yaml_dump(data, stream=None, Dumper=yaml.SafeDumper, **kwds):
115+
"""
116+
https://stackoverflow.com/questions/5121931/in-python-how-can-you-load-yaml-mappings-as-ordereddicts
117+
"""
118+
119+
kwds["default_flow_style"] = False
120+
121+
class OrderedDumper(Dumper):
122+
pass
123+
124+
def _dict_representer(dumper, data):
125+
return dumper.represent_mapping(
126+
yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
127+
data.items())
128+
129+
OrderedDumper.add_representer(OrderedDict, _dict_representer)
130+
return yaml.dump(data, stream, OrderedDumper, **kwds)

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
# Core
1111
'six',
12+
'dataclasses; python_version<"3.7"',
1213
'docopt>=0.6.2,<0.7',
1314
'munch>=2.5.0,<3',
1415
'tqdm>=4.37.0,<5',
@@ -25,6 +26,7 @@
2526
'tabulate>=0.8.5,<0.9',
2627
'colored>=1.4.3,<2',
2728
'Pygments>=2.7.4,<3',
29+
'PyYAML>=5,<6',
2830

2931
]
3032

tests/conftest.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
from pathlib import Path
44

55
import pytest
6+
from grafana_api.grafana_api import GrafanaClientError
7+
8+
from grafana_wtf.core import GrafanaSearch
69

710

811
def clean_environment():
@@ -37,4 +40,19 @@ def docker_grafana(docker_services):
3740
return url
3841

3942

43+
@pytest.fixture
def create_datasource(docker_grafana):
    """
    Factory fixture for creating a data source on the test Grafana instance.

    https://docs.pytest.org/en/4.6.x/fixture.html#factories-as-fixtures
    """
    def _create_datasource(name: str, type: str, access: str):
        client = GrafanaSearch.grafana_client_factory(docker_grafana)
        # TODO: Add fixture which completely resets everything in Grafana before running the test harness.
        #       Move to a different port than 3000 then!
        payload = dict(name=name, type=type, access=access)
        try:
            client.datasource.create_datasource(payload)
        except GrafanaClientError as ex:
            # A data source of the same name surviving from an earlier run is acceptable.
            if "Client Error 409: data source with the same name already exists" not in str(ex):
                raise
    return _create_datasource
56+
57+
4058
clean_environment()

tests/test_commands.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import docopt
88
import pytest
9+
import yaml
910

1011
import grafana_wtf.commands
1112

@@ -173,3 +174,27 @@ def test_log_tabular_success(docker_grafana, capsys, caplog):
173174
first_item_raw = str.splitlines(captured.out)[-1]
174175
first_item_normalized = re.sub("(.*)Date: .+|(.*)", r"\1Date: xxxx-xx-xxTxx:xx:xxZ |\2", first_item_raw, 1)
175176
assert first_item_normalized == reference
177+
178+
179+
def test_datasource_breakdown(docker_grafana, create_datasource, capsys, caplog):
    """
    Proof that the ``datasource-breakdown`` subcommand reports unused data sources.
    """

    # Create two data sources which are not used by any dashboard.
    create_datasource(name="foo", type="foo", access="foo")
    create_datasource(name="bar", type="bar", access="bar")

    # Compute breakdown.
    set_command("datasource-breakdown", "--format=yaml")

    # Proof the output is correct.
    with caplog.at_level(logging.DEBUG):
        grafana_wtf.commands.run()
        assert "Found 2 unused data source(s)" in caplog.messages

    captured = capsys.readouterr()
    # Use `safe_load`: calling `yaml.load` without an explicit `Loader` is
    # deprecated since PyYAML 5.1 and raises a TypeError on PyYAML >= 6.
    data = yaml.safe_load(captured.out)

    assert len(data["used"]) >= 1
    assert len(data["unused"]) >= 2

    # Unused data sources are reported sorted alphabetically by name.
    assert data["unused"][0]["datasource"]["name"] == "bar"
    assert data["unused"][1]["datasource"]["name"] == "foo"

0 commit comments

Comments
 (0)