|
21 | 21 | import dataclasses
|
22 | 22 | import datetime
|
23 | 23 | import functools
|
| 24 | +import hashlib |
24 | 25 | import json
|
25 | 26 | import os
|
26 | 27 | import pathlib
|
27 | 28 | import subprocess
|
28 | 29 | import tempfile
|
29 |
| -from typing import Any, Iterator, Mapping |
| 30 | +from typing import Any, Iterator, Mapping, Sequence |
30 | 31 | from unittest import mock
|
31 | 32 |
|
32 | 33 | from etils import epath
|
@@ -711,74 +712,94 @@ def now(cls, tz=None) -> datetime.datetime:
|
711 | 712 |
|
712 | 713 |
|
@contextlib.contextmanager
def dummy_croissant_file(
    dataset_name: str = 'DummyDataset',
    entries: Sequence[dict[str, Any]] | None = None,
    raw_data_filename: epath.PathLike = 'raw_data.jsonl',
    croissant_filename: epath.PathLike = 'croissant.json',
) -> Iterator[epath.Path]:
  """Yields temporary path to a dummy Croissant file.

  The function creates a temporary directory that stores raw data files and the
  Croissant JSON-LD. The raw data is written as JSON lines to
  `<tempdir>/data/<raw_data_filename>` and the Croissant metadata to
  `<tempdir>/<croissant_filename>`. The temporary directory (and everything in
  it) is deleted when the context manager exits, so the yielded path is only
  valid inside the `with` block.

  Args:
    dataset_name: The name of the dataset.
    entries: A sequence of dictionaries representing the dataset's entries.
      Each dictionary should contain an 'index' and a 'text' key. If None or
      empty, the function will create two entries with indices 0 and 1 and
      dummy text.
    raw_data_filename: Filename of the raw data file.
    croissant_filename: Filename of the Croissant JSON-LD file.

  Yields:
    Path to the Croissant JSON-LD file inside the temporary directory.
  """
  if not entries:
    entries = [{'index': i, 'text': f'Dummy example {i}'} for i in range(2)]

  # Both fields are extracted from the columns of the single 'raw_data' file
  # object declared in the distribution below.
  fields = [
      mlc.Field(
          name='index',
          description='The sample index.',
          data_types=mlc.DataType.INTEGER,
          source=mlc.Source(
              file_object='raw_data',
              extract=mlc.Extract(column='index'),
          ),
      ),
      mlc.Field(
          name='text',
          description='The dummy sample text.',
          data_types=mlc.DataType.TEXT,
          source=mlc.Source(
              file_object='raw_data',
              extract=mlc.Extract(column='text'),
          ),
      ),
  ]

  record_sets = [
      mlc.RecordSet(
          id='jsonl',
          description='Dummy record set.',
          fields=fields,
      )
  ]

  with tempfile.TemporaryDirectory() as tempdir:
    tempdir = epath.Path(tempdir)

    # Write raw examples to tempdir/data.
    raw_data_dir = tempdir / 'data'
    raw_data_dir.mkdir()
    raw_data_file = raw_data_dir / raw_data_filename
    raw_data_file.write_text('\n'.join(map(json.dumps, entries)))

    # Hash the bytes that are actually on disk. Hashing the decoded text
    # re-encoded as UTF-8 could differ from the file's content when the
    # platform translates newlines or uses another default encoding.
    sha256 = hashlib.sha256(raw_data_file.read_bytes()).hexdigest()
    distribution = [
        mlc.FileObject(
            id='raw_data',
            description='File with the data.',
            encoding_format='application/jsonlines',
            content_url=f'data/{raw_data_filename}',
            sha256=sha256,
        ),
    ]
    dummy_metadata = mlc.Metadata(
        name=dataset_name,
        description='Dummy description.',
        cite_as=(
            '@article{dummyarticle, title={title}, author={author},'
            ' year={2020}}'
        ),
        url='https://dummy_url',
        distribution=distribution,
        record_sets=record_sets,
        version='1.2.0',
        license='Public',
    )

    # Write Croissant JSON-LD to tempdir.
    croissant_file = tempdir / croissant_filename
    croissant_file.write_text(json.dumps(dummy_metadata.to_json(), indent=2))

    yield croissant_file
|
0 commit comments