Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions sdks/python/apache_beam/yaml/examples/regex_matches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
# limitations under the License.
#

# pytype: skip-file

# This pipeline creates a series of {plant: description} key pairs, matches all
# elements to a valid regex, filters out non-matching entries, then logs the
# output.
Expand Down Expand Up @@ -46,14 +44,14 @@ pipeline:
def regex_filter(row):
match = re.match("(?P<icon>[^\s,]+), *(\w+), *(\w+)", row.plant)
return match.group(0) if match else match
# Filters out None values produced by values that don't match regex
- type: Filter
config:
language: python
keep: plant
- type: LogForTesting

# Expected:
# Row(plant='🍓, Strawberry, perennial')
# Row(plant='🥕, Carrot, biennial')
Expand Down
3 changes: 1 addition & 2 deletions sdks/python/apache_beam/yaml/examples/simple_filter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
#`ReadFromText` to a local file.
#
# To set up Application Default Credentials,
# see https://cloud.google.com/docs/authentication/external/set-up-adc for more
# information
# see https://cloud.google.com/docs/authentication/external/set-up-adc.
#
# The following example reads mock transaction data from resources/products.csv
# then performs a simple filter for "Electronics".
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
#`ReadFromText` to a local file.
#
# To set up Application Default Credentials,
# see https://cloud.google.com/docs/authentication/external/set-up-adc for more
# information
# see https://cloud.google.com/docs/authentication/external/set-up-adc.
#
# The following example reads mock transaction data from resources/products.csv,
# performs a simple filter for "Electronics", then calculates the revenue and
Expand Down
7 changes: 3 additions & 4 deletions sdks/python/apache_beam/yaml/examples/wordcount_minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@
#`ReadFromText` to a local file.
#
# To set up Application Default Credentials,
# see https://cloud.google.com/docs/authentication/external/set-up-adc for more
# information
# see https://cloud.google.com/docs/authentication/external/set-up-adc.
#
# This pipeline reads in a text file, counts distinct words found in the text,
# This pipeline reads in a text file, counts distinct words found in the text,
# then logs a row containing each word and its count.
pipeline:
type: chain
Expand Down Expand Up @@ -70,7 +69,7 @@ pipeline:

# Log out results
- type: LogForTesting

# Expected:
# Row(word='king', count=311)
# Row(word='lear', count=253)
Expand Down
169 changes: 127 additions & 42 deletions sdks/python/apache_beam/yaml/generate_yaml_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
#

import argparse
import glob
import io
import itertools
import os
import re

import docstring_parser
Expand Down Expand Up @@ -258,19 +260,7 @@ def transform_docs(transform_base, transforms, providers, extra_docs=''):
])


def main():
parser = argparse.ArgumentParser()
parser.add_argument('--markdown_file')
parser.add_argument('--html_file')
parser.add_argument('--schema_file')
parser.add_argument('--include', default='.*')
parser.add_argument('--exclude', default='')
options = parser.parse_args()
include = re.compile(options.include).match
exclude = (
re.compile(options.exclude).match
if options.exclude else lambda x: x in SKIP)

def create_index(include, exclude, options):
with subprocess_server.SubprocessServer.cache_subprocesses():
json_config_schemas = []
markdown_out = io.StringIO()
Expand Down Expand Up @@ -322,29 +312,78 @@ def main():
}
}
})

if options.schema_file:
with open(options.schema_file, 'w') as fout:
yaml.dump(json_config_schemas, fout, sort_keys=False)

if options.markdown_file:
with open(options.markdown_file, 'w') as fout:
fout.write(markdown_out.getvalue())

if options.html_file:
import markdown
import markdown.extensions.toc
import pygments.formatters

title = 'Beam YAML Transform Index'
md = markdown.Markdown(
extensions=[
markdown.extensions.toc.TocExtension(toc_depth=2),
'codehilite',
])
pygments_style = pygments.formatters.HtmlFormatter().get_style_defs(
'.codehilite')
extra_style = '''
return json_config_schemas, markdown_out.getvalue()


def _example_title(path):
  """Turns a relative example path into a human-readable heading.

  The extension is dropped, underscores and path separators become spaces,
  and the result is title-cased.
  """
  stem, _ = os.path.splitext(path)
  nice = stem.replace('_', ' ').replace(os.sep, ' ').title()
  # These acronyms should be upper, not title.
  nice = re.sub(r'\bMl\b', 'ML', nice)
  nice = re.sub(r'\bIo\b', 'IO', nice)
  return nice


def _strip_yaml_boilerplate(content):
  """Removes the Apache license header and any `# coding=` line."""
  content = re.sub(
      '# Licensed to the Apache Software Foundation.*'
      '# limitations under the License.',
      '',
      content,
      flags=re.MULTILINE | re.DOTALL)
  return re.sub('# coding=.*', '', content)


def _split_yaml_header(text):
  """Splits leading comment lines from the rest of a YAML document.

  Returns:
    A `(header, body)` tuple. `header` is the leading run of comment and
    blank lines with the first character of each line dropped and the rest
    stripped; `body` is everything from the first other line onwards.
  """
  lines = text.split('\n')
  for ix, line in enumerate(lines):
    if line.strip() and not line.startswith('#'):
      break
  else:
    # No YAML content at all: every line belongs to the header. (Leaving ix
    # at the last index would push the final comment line into the body.)
    ix = len(lines)
  return (
      '\n'.join(line[1:].strip() for line in lines[:ix]),
      '\n'.join(lines[ix:]))


def create_examples_markdown():
  """Renders every YAML example next to this module as one Markdown document.

  All `*.yaml` files under the `examples` directory beside this file are
  visited, shallowest paths first and alphabetically within a depth. The
  first directory component of an example (ignoring `transforms`) becomes a
  top-level `#` section; each example gets a `##` heading, its leading
  comment block as prose, and its YAML body as an indented `:::yaml` code
  block.

  Returns:
    The generated Markdown as a string (empty if no examples are found).
  """
  base = os.path.join(os.path.dirname(__file__), 'examples')
  paths = sorted(
      glob.glob(os.path.join(base, '**', '*.yaml'), recursive=True),
      key=lambda path: (path.count(os.sep), path))

  markdown_out = io.StringIO()
  section = last_section = ''
  for path in paths:
    short_path = path[len(base):].replace('transforms', '').strip(os.sep)
    title = _example_title(short_path)
    if os.sep in short_path:
      section = _example_title(short_path.split(os.sep)[0])
      if section != last_section:
        markdown_out.write(f'# {section}\n\n')
        last_section = section
      # Only drop the section prefix from paths that actually carry one; a
      # stale section from an earlier file must not truncate this title.
      title = title[len(section):].strip()
    markdown_out.write(f'## {title}\n\n')
    # The examples contain non-ASCII characters (e.g. emoji), so do not rely
    # on the platform's default encoding.
    with open(path, encoding='utf-8') as fin:
      content = fin.read()
    header, body = _split_yaml_header(_strip_yaml_boilerplate(content))
    markdown_out.write(header)
    # Markdown only treats text indented by four spaces as a code block,
    # which is what the `:::yaml` language marker needs to sit in.
    markdown_out.write('\n\n    :::yaml\n\n')
    markdown_out.write('    ' + body.replace('\n', '\n    '))
    markdown_out.write('\n')
  return markdown_out.getvalue()


def markdown_to_html(title, markdown_content, header=''):
import markdown
import markdown.extensions.toc
import pygments.formatters

md = markdown.Markdown(
extensions=[
markdown.extensions.toc.TocExtension(toc_depth=2),
'codehilite',
])
pygments_style = pygments.formatters.HtmlFormatter().get_style_defs(
'.codehilite')
extra_style = '''
* {
box-sizing: border-box;
}
Expand Down Expand Up @@ -476,10 +515,8 @@ def main():
}
'''

html = md.convert(markdown_out.getvalue())
with open(options.html_file, 'w') as html_out:
html_out.write(
f'''
html = md.convert(markdown_content)
return f'''
<html>
<head>
<title>{title}</title>
Expand All @@ -492,7 +529,7 @@ def main():
<div class="grid-for-nav">
<nav class="nav-side">
<div class="nav-header">
<a href=#>Beam YAML Transform Index</a>
<a href=#>{title}</a>
<div class="version">
{beam_version}
</div>
Expand All @@ -502,13 +539,61 @@ def main():
<section class="transform-content-wrap">
<div class="transform-content">
<h1>{title}</h1>
{header}
{html.replace('<h2', '<hr><h2')}
</div>
</section>
</div>
</body>
</html>
''')
'''


def main():
  """Command-line entry point: writes the requested documentation files.

  Flags:
    --examples_file: path for the HTML page built from the YAML examples.
    --markdown_file: path for the transform index as Markdown.
    --html_file: path for the transform index as HTML.
    --schema_file: path for the transform config schemas, dumped as YAML.
    --include: regex matched against names to include (default `.*`).
    --exclude: regex matched against names to exclude; when empty, names
      in `SKIP` are excluded instead.

  Only the outputs whose flag is given are produced.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--examples_file')
  parser.add_argument('--markdown_file')
  parser.add_argument('--html_file')
  parser.add_argument('--schema_file')
  parser.add_argument('--include', default='.*')
  parser.add_argument('--exclude', default='')
  options = parser.parse_args()
  include = re.compile(options.include).match
  exclude = (
      re.compile(options.exclude).match
      if options.exclude else lambda x: x in SKIP)

  # Building the transform index is only needed for the index outputs; skip
  # it when just the examples page was requested.
  if options.schema_file or options.markdown_file or options.html_file:
    json_config_schemas, markdown_content = create_index(
        include, exclude, options)

    if options.schema_file:
      with open(options.schema_file, 'w', encoding='utf-8') as fout:
        yaml.dump(json_config_schemas, fout, sort_keys=False)

    if options.markdown_file:
      with open(options.markdown_file, 'w', encoding='utf-8') as fout:
        fout.write(markdown_content)

    if options.html_file:
      with open(options.html_file, 'w', encoding='utf-8') as html_out:
        html_out.write(
            markdown_to_html('Beam YAML Transform Index', markdown_content))

  if options.examples_file:
    # The examples contain non-ASCII characters (e.g. emoji), so the output
    # encoding is pinned rather than left to the platform default.
    with open(options.examples_file, 'w', encoding='utf-8') as html_out:
      html_out.write(
          markdown_to_html(
              'Beam YAML Examples',
              create_examples_markdown(),
              header='''
<p>Example pipelines using the
<a href="https://beam.apache.org/documentation/sdks/yaml/">
Beam YAML API</a>.
These examples can also be found on
<a href="https://github.com/apache/beam/tree/master/sdks/'''
              '''python/apache_beam/yaml/examples">github</a>.
</p>
'''))


if __name__ == '__main__':
Expand Down
Loading