Skip to content

Commit 13c4ff1

Browse files
authored
refactor: remove direct logging without a logger (#4253)
* remove direct logging without a logger
* add custom pylint checker
* add test
* pylint
* improve checker message
* mypy
* remove test
* add checker for basicConfig
* more logging missed
* ignore basicConfig
* move out logger
* move out statement
* remove logging configuration
1 parent 4b189c0 commit 13c4ff1

File tree

19 files changed

+134
-41
lines changed

19 files changed

+134
-41
lines changed

haystack/document_stores/deepsetcloud.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ def get_all_documents(
196196
:param batch_size: Number of documents that are passed to bulk function at a time.
197197
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
198198
"""
199-
logging.warning(
199+
logger.warning(
200200
"`get_all_documents()` can get very slow and resource-heavy since all documents must be loaded from deepset Cloud. "
201201
"Consider using `get_all_documents_generator()` instead."
202202
)

haystack/document_stores/weaviate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ def get_document_by_id(
337337
try:
338338
result = self.weaviate_client.data_object.get_by_id(id, class_name=index, with_vector=True)
339339
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
340-
logging.debug("Weaviate could not get the document requested: %s", usce)
340+
logger.debug("Weaviate could not get the document requested: %s", usce)
341341
if result:
342342
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
343343
return document
@@ -364,7 +364,7 @@ def get_documents_by_id(
364364
try:
365365
result = self.weaviate_client.data_object.get_by_id(id, class_name=index, with_vector=True)
366366
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
367-
logging.debug("Weaviate could not get the document requested: %s", usce)
367+
logger.debug("Weaviate could not get the document requested: %s", usce)
368368
if result:
369369
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
370370
documents.append(document)

haystack/modeling/training/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,9 +390,9 @@ def create_or_load_checkpoint(
390390
trainer = cls._load_checkpoint(
391391
path=checkpoint_to_load, data_silo=data_silo, model=model, optimizer=optimizer, local_rank=local_rank
392392
)
393-
logging.info("Resuming training from the train checkpoint at %s ...", checkpoint_to_load)
393+
logger.info("Resuming training from the train checkpoint at %s ...", checkpoint_to_load)
394394
else:
395-
logging.info("No train checkpoints found. Starting a new training ...")
395+
logger.info("No train checkpoints found. Starting a new training ...")
396396
trainer = cls(
397397
data_silo=data_silo,
398398
model=model,

haystack/nodes/_json_schema.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,7 @@ def load_schema():
414414
"""
415415
schema_file_path = JSON_SCHEMAS_PATH / "haystack-pipeline-main.schema.json"
416416
if not os.path.exists(schema_file_path):
417-
logging.info("Json schema not found, generating one at: %s", schema_file_path)
417+
logger.info("Json schema not found, generating one at: %s", schema_file_path)
418418
try:
419419
update_json_schema(main_only=True)
420420
except Exception as e:

haystack/nodes/audio/_text_to_speech.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,24 @@
77

88
import numpy as np
99
import torch
10+
from pydub import AudioSegment
11+
12+
from haystack.errors import AudioNodeError
13+
from haystack.modeling.utils import initialize_device_settings
14+
15+
16+
logger = logging.getLogger(__name__)
17+
1018

1119
try:
1220
import soundfile as sf
1321
from espnet2.bin.tts_inference import Text2Speech as _Text2SpeechModel
1422

1523
except OSError as ose:
16-
logging.exception(
24+
logger.exception(
1725
"`libsndfile` not found, it's probably not installed. The node will most likely crash. "
1826
"Please install soundfile's dependencies (https://python-soundfile.readthedocs.io/en/latest/)"
1927
)
20-
from pydub import AudioSegment
21-
22-
from haystack.errors import AudioNodeError
23-
from haystack.modeling.utils import initialize_device_settings
24-
25-
logger = logging.getLogger(__name__)
2628

2729

2830
class TextToSpeech:

haystack/nodes/connector/crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,11 +365,11 @@ def _write_file(
365365
with open(file_path, "w", encoding="utf-8") as f:
366366
json.dump(document.to_dict(), f)
367367
else:
368-
logging.debug(
368+
logger.debug(
369369
"File '%s' already exists. Set 'overwrite_existing_files=True' to overwrite it.", file_path
370370
)
371371
except Exception:
372-
logging.exception(
372+
logger.exception(
373373
"Crawler can't save the content of '%s' under '%s'. "
374374
"This webpage will be skipped, but links from this page will still be crawled. "
375375
"Make sure the path above is accessible and the file name is valid. "

haystack/nodes/file_classifier/file_type.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,21 @@
44
import logging
55
from pathlib import Path
66

7+
from haystack.nodes.base import BaseComponent
8+
9+
10+
logger = logging.getLogger(__name__)
11+
12+
713
try:
814
import magic
915
except ImportError as ie:
10-
logging.debug(
16+
logger.debug(
1117
"Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). "
1218
"FileTypeClassifier will not perform mimetype detection on extensionless files. "
1319
"Please make sure the necessary OS libraries are installed if you need this functionality."
1420
)
1521

16-
from haystack.nodes.base import BaseComponent
17-
18-
19-
logger = logging.getLogger(__name__)
20-
2122

2223
DEFAULT_TYPES = ["txt", "pdf", "md", "docx", "html"]
2324

haystack/pipelines/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,7 @@ def eval_beir(
854854
qrels_new[query_id] = {_id: qrels[query_id][_id] for _id in document_rel_ids_intersection}
855855
qrels = qrels_new
856856
elif num_documents is not None and (num_documents < 1 or num_documents > len(corpus)):
857-
logging.warning(
857+
logger.warning(
858858
"'num_documents' variable should be lower than corpus length and have a positive value, but it's %s."
859859
" Dataset size remains unchanged.",
860860
num_documents,

haystack/pipelines/config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def validate_yaml(
175175
extras=extras,
176176
overwrite_with_env_variables=overwrite_with_env_variables,
177177
)
178-
logging.debug("'%s' contains valid Haystack pipelines.", path)
178+
logger.debug("'%s' contains valid Haystack pipelines.", path)
179179

180180

181181
def validate_config(
@@ -260,7 +260,7 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
260260
)
261261
ok_to_ignore_version = pipeline_version == "ignore" and "rc" in __version__
262262
if not ok_to_ignore_version:
263-
logging.warning(
263+
logger.warning(
264264
"This pipeline is version '%s', but you're using Haystack %s\n"
265265
"This might cause bugs and unexpected behaviors."
266266
"Please check out the release notes (https://github.com/deepset-ai/haystack/releases/latest), "
@@ -318,7 +318,7 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
318318
f"Validation failed. {validation.message}. {error_location} " "See the stacktrace for more information."
319319
) from validation
320320

321-
logging.debug("The given configuration is valid according to the JSON schema.")
321+
logger.debug("The given configuration is valid according to the JSON schema.")
322322

323323

324324
def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_definitions: Dict[str, Any]):
@@ -332,7 +332,7 @@ def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_defin
332332
graph = _init_pipeline_graph(root_node_name=root_node_name)
333333
for node in pipeline_definition["nodes"]:
334334
graph = _add_node_to_pipeline_graph(graph=graph, node=node, components=component_definitions)
335-
logging.debug("The graph for pipeline '%s' is valid.", pipeline_definition["name"])
335+
logger.debug("The graph for pipeline '%s' is valid.", pipeline_definition["name"])
336336

337337

338338
def _find_root_in_pipeline_definition(pipeline_definition: Dict[str, Any]):

haystack/utils/docker.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
from haystack.nodes._json_schema import load_schema
44

55

6+
logger = logging.getLogger(__name__)
7+
8+
69
def cache_nltk_model(model: str = "punkt"):
7-
logging.info("Caching %s model...", model)
10+
logger.info("Caching %s model...", model)
811
import nltk
912

1013
nltk.download(model)
@@ -30,7 +33,7 @@ def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Un
3033
import transformers
3134

3235
for model_to_cache in models:
33-
logging.info("Caching %s", model_to_cache)
36+
logger.info("Caching %s", model_to_cache)
3437
transformers.AutoTokenizer.from_pretrained(model_to_cache, use_auth_token=use_auth_token)
3538
transformers.AutoModel.from_pretrained(model_to_cache, use_auth_token=use_auth_token)
3639

0 commit comments

Comments (0)