
Commit 3ac7693 (parent 8e3a804)

Add support for pyspark streaming applications

PNDA-2729

5 files changed: +39 -6
CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -2,9 +2,12 @@
 All notable changes to this project will be documented in this file.
 
 ## Unreleased
+### Added
+- PNDA-2729: Added support for spark streaming jobs written in python (pyspark). Use `main_py` instead of `main_jar` in properties.json and specify additional files using `py_files`.
 ### Changed
 - PNDA-2700: Spark streaming jobs no longer require upstart.conf or yarn-kill.py files, default ones are supplied by the deployment manager.
 
+
 ## [0.3.0] 2017-01-20
 ### Fixed
 - PNDA-2498: Application package data is now stored in HDFS with a reference to the path only held in the HBase record. This prevents HBase being overloaded with large packages (100MB+).

api/src/main/resources/plugins/sparkStreaming.py

Lines changed: 13 additions & 5 deletions
@@ -80,26 +80,34 @@ def create_component(self, staged_component_path, application_name, component, p
 
         if 'component_spark_submit_args' not in properties:
             properties['component_spark_submit_args'] = ''
+        if 'component_py_files' not in properties:
+            properties['component_py_files'] = ''
 
         if 'upstart.conf' in component['component_detail']:
             # old style applications for backward compatibility
             service_script = 'upstart.conf'
             service_script_install_path = '/etc/init/%s.conf' % service_name
         else:
             # new style applications that don't need to provide upstart.conf or yarn-kill.py
-            if 'component_main_class' not in properties:
+            if 'component_main_jar' in properties and 'component_main_class' not in properties:
                 raise Exception('properties.json must contain "main_class" for %s sparkStreaming %s' % (application_name, component['component_name']))
-            if 'component_main_jar' not in properties:
-                raise Exception('properties.json must contain "main_jar" for %s sparkStreaming %s' % (application_name, component['component_name']))
+
+            java_app = None
+            if 'component_main_jar' in properties:
+                java_app = True
+            elif 'component_main_py' in properties:
+                java_app = False
+            else:
+                raise Exception('properties.json must contain "main_jar or main_py" for %s sparkStreaming %s' % (application_name, component['component_name']))
 
             this_dir = os.path.dirname(os.path.realpath(__file__))
             copy(os.path.join(this_dir, 'yarn-kill.py'), staged_component_path)
             distro = platform.dist()
             if redhat:
-                service_script = 'systemd.service.tpl'
+                service_script = 'systemd.service.tpl' if java_app else 'systemd.service.py.tpl'
                 service_script_install_path = '/usr/lib/systemd/system/%s.service' % service_name
             else:
-                service_script = 'upstart.conf.tpl'
+                service_script = 'upstart.conf.tpl' if java_app else 'upstart.conf.py.tpl'
                 service_script_install_path = '/etc/init/%s.conf' % service_name
             copy(os.path.join(this_dir, service_script), staged_component_path)
 
api/src/main/resources/plugins/systemd.service.py.tpl (new file)

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+[Unit]
+Description=PNDA Application: ${component_application}-${component_name}
+
+[Service]
+Type=simple
+User=hdfs
+WorkingDirectory=/opt/${environment_namespace}/${component_application}/${component_name}/
+ExecStartPre=/opt/${environment_namespace}/${component_application}/${component_name}/yarn-kill.py
+ExecStopPost=/opt/${environment_namespace}/${component_application}/${component_name}/yarn-kill.py
+ExecStart=/usr/bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:////opt/${environment_namespace}/${component_application}/${component_name}/log4j.properties" --conf 'spark.executor.extraJavaOptions=-Dlog4j.configuration=file:////opt/${environment_namespace}/${component_application}/${component_name}/log4j.properties' --name '${component_job_name}' --master yarn-cluster --py-files application.properties,${component_py_files} ${component_spark_submit_args} ${component_main_py}
+Restart=always
+RestartSec=2
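Substituting assumed values (namespace platform_app, application myapp, component stream, job name myapp-stream, and the illustrative properties.json above; none of these come from the commit), ExecStart would render roughly as:

    /usr/bin/spark-submit \
        --driver-java-options "-Dlog4j.configuration=file:////opt/platform_app/myapp/stream/log4j.properties" \
        --conf 'spark.executor.extraJavaOptions=-Dlog4j.configuration=file:////opt/platform_app/myapp/stream/log4j.properties' \
        --name 'myapp-stream' \
        --master yarn-cluster \
        --py-files application.properties,deps.py,utils.py \
        --num-executors 2 app.py

Note that application.properties is always placed first in the --py-files list, ahead of whatever the component supplies in py_files.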
api/src/main/resources/plugins/upstart.conf.py.tpl (new file)

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+start on runlevel [2345]
+stop on runlevel [016]
+respawn
+respawn limit unlimited
+pre-start exec /opt/${environment_namespace}/${component_application}/${component_name}/yarn-kill.py
+pre-stop exec /opt/${environment_namespace}/${component_application}/${component_name}/yarn-kill.py
+env programDir=/opt/${environment_namespace}/${component_application}/${component_name}/
+chdir /opt/${environment_namespace}/${component_application}/${component_name}/
+exec sudo -u hdfs spark-submit --driver-java-options "-Dlog4j.configuration=file:///${programDir}log4j.properties" --conf 'spark.executor.extraJavaOptions=-Dlog4j.configuration=file:///${programDir}log4j.properties' --name '${component_job_name}' --master yarn-cluster --py-files application.properties,${component_py_files} ${component_spark_submit_args} ${component_main_py}

api/src/main/resources/plugins/upstart.conf.tpl

Lines changed: 2 additions & 1 deletion
@@ -5,4 +5,5 @@ respawn limit unlimited
 pre-start exec /opt/${environment_namespace}/${component_application}/${component_name}/yarn-kill.py
 pre-stop exec /opt/${environment_namespace}/${component_application}/${component_name}/yarn-kill.py
 env programDir=/opt/${environment_namespace}/${component_application}/${component_name}/
-exec sudo -u hdfs spark-submit --driver-java-options "-Dlog4j.configuration=file:///${programDir}log4j.properties" --class ${component_main_class} --name '${component_job_name}' --master yarn-cluster --files ${programDir}log4j.properties ${component_spark_submit_args} ${programDir}${component_main_jar}
+chdir /opt/${environment_namespace}/${component_application}/${component_name}/
+exec sudo -u hdfs spark-submit --driver-java-options "-Dlog4j.configuration=file:///${programDir}log4j.properties" --class ${component_main_class} --name '${component_job_name}' --master yarn-cluster --files log4j.properties ${component_spark_submit_args} ${component_main_jar}
