Skip to content

ValueError("can not infer schema from empty dataset") #6

@placerda

Description

@placerda

When running the Synapse pipeline with a query that returns no data from the News API, the Ingest_Process_News notebook fails with the error below.

Operation on target Process_News_Twitter_Data failed: ---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-d54605cc056a> in <module>
1 mssparkutils.notebook.run("/Ingest_Process_News", 1800, \
----> 2 {"data_lake_account_name": data_lake_account_name, "file_system_name": file_system_name, "keyvault_name": keyvault_name, "query": query, "topic": topic})
~/cluster-env/env/lib/python3.6/site-packages/notebookutils/mssparkutils/notebook.py in run(path, timeout_seconds, arguments)
11
12 def run(path, timeout_seconds=90, arguments={}):
---> 13 exit_val = nb.run(path, timeout_seconds, arguments)
14
15 if exit_val == constants.STOP_SESSION_REQUEST_EXIT_VAL:
~/cluster-env/env/lib/python3.6/site-packages/notebookutils/mssparkutils/handlers/notebookHandler.py in run(self, path, timeout_seconds, arguments)
50 run_result_snapshot = j_notebook_run_result.snapshotMetaStr()
51 print(run_result_snapshot)
---> 52 j_notebook_run_result.throwExceptionIfHave()
53 return exit_val
54
/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py in deco(*a, **kw)
67 def deco(*a, **kw):
68 try:
---> 69 return f(*a, **kw)
70 except py4j.protocol.Py4JJavaError as e:
71 s = e.java_exception.toString()
/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o592.throwExceptionIfHave.
: com.microsoft.spark.notebook.msutils.NotebookExecutionException: can not infer schema from empty dataset
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-ff046479c156> in <module>
----> 1 df_articles = spark.createDataFrame(Row(**x) for x in all_articles)
2 file_path = base_path + 'NewsRawData/' + config["q"] + '_' + str(topic) + '_' + foldername_suffix
3 df_articles.write.format('json').save(file_path)
/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
746 rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
747 else:
--> 748 rdd, schema = self._createFromLocal(map(prepare, data), schema)
749 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
750 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py in _createFromLocal(self, data, schema)
414
415 if schema is None or isinstance(schema, (list, tuple)):
--> 416 struct = self._inferSchemaFromList(data, names=schema)
417 converter = _create_converter(struct)
418 data = map(converter, data)
/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py in _inferSchemaFromList(self, data, names)
341 """
342 if not data:
--> 343 raise ValueError("can not infer schema from empty dataset")

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions