2121import logging
2222import os
2323import random
24+ import re
2425import sys
2526import unittest
2627from typing import Any
3334
3435import pytest
3536import yaml
37+ from jinja2 import DictLoader
38+ from jinja2 import Environment
39+ from jinja2 import StrictUndefined
3640
3741import apache_beam as beam
3842from apache_beam import PCollection
@@ -339,8 +343,21 @@ def test_yaml_example(self):
339343 for i , line in enumerate (expected ):
340344 expected [i ] = line .replace ('# ' , '' ).replace ('\n ' , '' )
341345 expected = [line for line in expected if line ]
346+
347+ raw_spec_string = '' .join (lines )
348+ # Filter for any jinja preprocessor - this has to be done before other
349+ # preprocessors.
350+ jinja_preprocessor = [
351+ preprocessor for preprocessor in custom_preprocessors
352+ if 'jinja_preprocessor' in preprocessor .__name__
353+ ]
354+ if jinja_preprocessor :
355+ jinja_preprocessor = jinja_preprocessor [0 ]
356+ raw_spec_string = jinja_preprocessor (raw_spec_string )
357+ custom_preprocessors .remove (jinja_preprocessor )
358+
342359 pipeline_spec = yaml .load (
343- '' . join ( lines ) , Loader = yaml_transform .SafeLineLoader )
360+ raw_spec_string , Loader = yaml_transform .SafeLineLoader )
344361
345362 with TestEnvironment () as env :
346363 for fn in custom_preprocessors :
@@ -513,8 +530,9 @@ def apply(preprocessor):
513530 return apply
514531
515532
516- @YamlExamplesTestSuite .register_test_preprocessor ('test_wordcount_minimal_yaml' )
517- def _wordcount_test_preprocessor (
533+ @YamlExamplesTestSuite .register_test_preprocessor (
534+ ['test_wordcount_minimal_yaml' ])
535+ def _wordcount_minimal_test_preprocessor (
518536 test_spec : dict , expected : List [str ], env : TestEnvironment ):
519537 """
520538 Preprocessor for the wordcount_minimal.yaml test.
@@ -523,6 +541,8 @@ def _wordcount_test_preprocessor(
523541 of the wordcount example. This allows the test to verify the pipeline's
524542 correctness without relying on a fixed input file.
525543
544+ Based on this expected output: # Row(word='king', count=311)
545+
526546 Args:
527547 test_spec: The dictionary representation of the YAML pipeline specification.
528548 expected: A list of strings representing the expected output of the
@@ -538,8 +558,64 @@ def _wordcount_test_preprocessor(
538558 word = element .split ('=' )[1 ].split (',' )[0 ].replace ("'" , '' )
539559 count = int (element .split ('=' )[2 ].replace (')' , '' ))
540560 all_words += [word ] * count
541- random .shuffle (all_words )
542561
562+ return _wordcount_random_shuffler (test_spec , all_words , env )
563+
564+
@YamlExamplesTestSuite.register_test_preprocessor(
    ['test_wordCountInclude_yaml'])
def _wordcount_jinja_test_preprocessor(
    test_spec: dict, expected: List[str], env: TestEnvironment):
  """
  Preprocessor for the Jinja flavour of the wordcount example test.

  Rebuilds the multiset of input words from the expected output rows and
  hands it to `_wordcount_random_shuffler`, so the test input is derived
  from the expected output rather than from a fixed input file.

  Each expected row is searched for the shape
  `Row(output='king - 311')`: the text before the final ` - ` is the word
  and the trailing digits are its count. Rows that do not contain that
  shape contribute nothing.

  Args:
    test_spec: The dictionary representation of the YAML pipeline
      specification.
    expected: A list of strings representing the expected output of the
      pipeline.
    env: The TestEnvironment object, passed through unchanged to
      `_wordcount_random_shuffler`.

  Returns:
    Whatever `_wordcount_random_shuffler` returns for the reconstructed
    word list.
  """
  words = []
  for row in expected:
    found = re.search(r"output='(.*) - (\d+)'", row)
    if found is None:
      # Not a "word - count" row; nothing to add for it.
      continue
    token = found.group(1)
    occurrences = int(found.group(2))
    # Repeat the word once per counted occurrence.
    words.extend([token] * occurrences)
  return _wordcount_random_shuffler(test_spec, words, env)
595+
596+
597+ def _wordcount_random_shuffler (
598+ test_spec : dict , all_words : List [str ], env : TestEnvironment ):
599+ """
600+ Helper function to create a randomized input file for wordcount-style tests.
601+
602+ This function takes a list of words, shuffles them, and arranges them into
603+ randomly sized lines. It then creates a temporary input file with this
604+ content and updates the provided test specification to use this file as
605+ the input for a 'ReadFromText' transform.
606+
607+ Args:
608+ test_spec: The dictionary representation of the YAML pipeline specification.
609+ all_words: A list of strings, where each string is a word to be included
610+ in the generated input file.
611+ env: The TestEnvironment object providing utilities for creating temporary
612+ files.
613+
614+ Returns:
615+ The modified test_spec dictionary with the input file path for
616+ 'ReadFromText' replaced with the path to the newly generated file.
617+ """
618+ random .shuffle (all_words )
543619 lines = []
544620 while all_words :
545621 line_length = random .randint (1 , min (10 , len (all_words )))
@@ -599,7 +675,8 @@ def _kafka_test_preprocessor(
599675 'test_streaming_sentiment_analysis_yaml' ,
600676 'test_iceberg_migration_yaml' ,
601677 'test_ml_preprocessing_yaml' ,
602- 'test_anomaly_scoring_yaml'
678+ 'test_anomaly_scoring_yaml' ,
679+ 'test_wordCountInclude_yaml'
603680])
604681def _io_write_test_preprocessor (
605682 test_spec : dict , expected : List [str ], env : TestEnvironment ):
@@ -1175,6 +1252,50 @@ def _batch_log_analysis_test_preprocessor(
11751252 return test_spec
11761253
11771254
@YamlExamplesTestSuite.register_test_preprocessor(
    ['test_wordCountInclude_yaml'])
def _jinja_preprocessor(raw_spec_string: str):
  """
  Render a raw YAML example as a Jinja2 template.

  The raw text is registered under the name 'main_template' in an in-memory
  template map. Every path returned by
  `input_data.word_count_jinja_template_data()` is read from disk (resolved
  against the directory four levels above this test file) and registered
  under that same path string, so `{% include ... %}` directives in the
  main template can be resolved by name.

  Template variables come from the JSON document returned by
  `input_data.word_count_jinja_parameter_data()`. Rendering uses
  `StrictUndefined`, so a variable missing from that document raises
  instead of rendering as empty text.

  Args:
    raw_spec_string: The raw YAML content, treated as a Jinja2 template.

  Returns:
    The rendered YAML pipeline specification as a string.
  """
  render_context = json.loads(input_data.word_count_jinja_parameter_data())

  # Included files are listed relative to a root four directories above
  # the directory holding this test module.
  sdk_root = os.path.abspath(
      os.path.join(os.path.dirname(__file__), '../../../..'))

  templates = {'main_template': raw_spec_string}
  for relative_path in input_data.word_count_jinja_template_data():
    absolute_path = os.path.join(sdk_root, relative_path)
    with open(absolute_path, 'r', encoding='utf-8') as template_file:
      # Key by the path as listed so `{% include %}` lookups match it.
      templates[relative_path] = template_file.read()

  # The standard expand_jinja helper cannot be used here because it does
  # not support `{% include %}` templating.
  # TODO(#35936): Maybe update expand_jinja to handle this case.
  environment = Environment(
      loader=DictLoader(templates), undefined=StrictUndefined)
  return environment.get_template('main_template').render(render_context)
1297+
1298+
11781299INPUT_FILES = {
11791300 'products.csv' : input_data .products_csv (),
11801301 'kinglear.txt' : input_data .text_data (),
@@ -1216,6 +1337,9 @@ def _batch_log_analysis_test_preprocessor(
12161337 os .path .join (YAML_DOCS_DIR , '../transforms/elementwise/*.yaml' )).run ()
12171338ExamplesTest = YamlExamplesTestSuite (
12181339 'ExamplesTest' , os .path .join (YAML_DOCS_DIR , '../*.yaml' )).run ()
1340+ JinjaTest = YamlExamplesTestSuite (
1341+ 'JinjaExamplesTest' ,
1342+ os .path .join (YAML_DOCS_DIR , '../transforms/jinja/**/*.yaml' )).run ()
12191343IOTest = YamlExamplesTestSuite (
12201344 'IOExamplesTest' , os .path .join (YAML_DOCS_DIR ,
12211345 '../transforms/io/*.yaml' )).run ()
0 commit comments