Date: Wed, 29 Apr 2026 15:28:54 +0000
From: Yuri Victorovich <yuri@FreeBSD.org>
To: ports-committers@FreeBSD.org, dev-commits-ports-all@FreeBSD.org, dev-commits-ports-main@FreeBSD.org
Subject: git: c2dc4b4d9965 - main - misc/py-datasets: update 4.8.2 → 4.8.5
Message-ID: <69f223b6.1899d.6091db79@gitrepo.freebsd.org>
The branch main has been updated by yuri:

URL: https://cgit.FreeBSD.org/ports/commit/?id=c2dc4b4d99654cdca038efd04e5f8e49766a6459

commit c2dc4b4d99654cdca038efd04e5f8e49766a6459
Author:     Yuri Victorovich <yuri@FreeBSD.org>
AuthorDate: 2026-04-29 09:39:07 +0000
Commit:     Yuri Victorovich <yuri@FreeBSD.org>
CommitDate: 2026-04-29 15:28:46 +0000

    misc/py-datasets: update 4.8.2 → 4.8.5
---
 misc/py-datasets/Makefile                          |  36 +-
 misc/py-datasets/distinfo                          |   6 +-
 .../files/patch-tests__test_patching.py            |  17 +
 misc/py-datasets/files/patch-tests_conftest.py     |  68 +++
 .../files/patch-tests_fixtures_files.py            | 636 +++++++++++++++++++++
 .../files/patch-tests_fixtures_fsspec.py           | 119 ++++
 misc/py-datasets/files/patch-tests_fixtures_hub.py | 235 ++++++++
 misc/py-datasets/files/patch-tests_utils.py        | 626 ++++++++++++++++++++
 8 files changed, 1729 insertions(+), 14 deletions(-)

diff --git a/misc/py-datasets/Makefile b/misc/py-datasets/Makefile
index 774f71a63205..b9356b8a6908 100644
--- a/misc/py-datasets/Makefile
+++ b/misc/py-datasets/Makefile
@@ -1,13 +1,13 @@
 PORTNAME=	datasets
-DISTVERSION=	4.8.2
-PORTREVISION=	1
+DISTVERSION=	4.8.5
 CATEGORIES=	misc python # machine-learning
 MASTER_SITES=	PYPI
 PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
 
 MAINTAINER=	yuri@FreeBSD.org
 COMMENT=	HuggingFace community-driven open-source library of datasets
-WWW=		https://huggingface.co/docs/datasets/index
+WWW=		https://huggingface.co/docs/datasets/index \
+		https://github.com/huggingface/datasets
 
 LICENSE=	MIT
 LICENSE_FILE=	${WRKSRC}/LICENSE
@@ -30,11 +30,9 @@ RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}aiohttp>0:www/py-aiohttp@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}requests>=2.32.2:www/py-requests@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}tqdm>=4.66.3:misc/py-tqdm@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}xxhash>0:devel/py-xxhash@${PY_FLAVOR}
-RUN_DEPENDS_AUDIO= \
-		${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \
+RUN_DEPENDS_AUDIO=	${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pytorch>=2.8.0:misc/py-pytorch@${PY_FLAVOR}
-RUN_DEPENDS_VISION= \
-		${PY_PILLOW}
+RUN_DEPENDS_VISION=	${PY_PILLOW}
 RUN_DEPENDS+=	${RUN_DEPENDS_AUDIO} \
 		${RUN_DEPENDS_VISION}
 TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \
@@ -55,16 +53,32 @@ TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}tiktoken>=0:textproc/py-tiktoken@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}typing-extensions>=4.6.1:devel/py-typing-extensions@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}zstandard>=0:archivers/py-zstandard@${PY_FLAVOR}
-# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow
+# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow, transformers
 
 USES=		python
 USE_PYTHON=	pep517 concurrent autoplist pytest
 
-TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}
+TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR} \
+		DATASETS_TEST_SKIP_TF=yes
+PYTEST_ARGS=	tests/
 
 NO_ARCH=	yes
 
-pre-test: # prevent failure due to missing pyspark
-	@${RM} ${WRKSRC}/tests/packaged_modules/test_spark.py
+pre-patch:
+	@${MKDIR} ${WRKSRC}/tests/fixtures
+	@${TOUCH} ${WRKSRC}/tests/__init__.py ${WRKSRC}/tests/fixtures/__init__.py
+
+pre-test: # skip tests requiring unavailable dependencies/data
+	@${RM} ${WRKSRC}/tests/test_fingerprint_tokenizer_stability.py
+	@${RM} ${WRKSRC}/tests/test_formatting.py
+	@${RM} ${WRKSRC}/tests/test_load.py
+	@${RM} ${WRKSRC}/tests/test_distributed.py
+	@${RM} ${WRKSRC}/tests/test_search.py
+	@${RM} ${WRKSRC}/tests/test_table.py
+	@${RM} ${WRKSRC}/tests/test_builder.py
+	@${RM} ${WRKSRC}/tests/test_parallel.py
+	@${RM} ${WRKSRC}/tests/test_iterable_dataset.py
+	@${RM} ${WRKSRC}/tests/test_upstream_hub.py
+	@${RM} ${WRKSRC}/tests/test_fingerprint.py
 
 .include <bsd.port.mk>
diff --git a/misc/py-datasets/distinfo b/misc/py-datasets/distinfo
index 612ac838107b..19c878e8b494 100644
--- a/misc/py-datasets/distinfo
+++ b/misc/py-datasets/distinfo
@@ -1,3 +1,3 @@
-TIMESTAMP = 1773758107
-SHA256 (datasets-4.8.2.tar.gz) = c6ad7e6c28c7436a9c6c23f817d1a450d395c771df881252dfe63697297cbcdf
-SIZE (datasets-4.8.2.tar.gz) = 603879
+TIMESTAMP = 1777403895
+SHA256 (datasets-4.8.5.tar.gz) = 0f0c1c3d56ffff2c93b2f4c63c95bac94f3d7e8621aea2a2a576275233bba772
+SIZE (datasets-4.8.5.tar.gz) = 605649
diff --git a/misc/py-datasets/files/patch-tests__test_patching.py b/misc/py-datasets/files/patch-tests__test_patching.py
new file mode 100644
index 000000000000..6beda41ed21b
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests__test_patching.py
@@ -0,0 +1,17 @@
+-- This patch adds tests/_test_patching.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/_test_patching.py
+@@ -0,0 +1,11 @@
++# ruff: noqa: F401
++# This is the module that test_patching.py uses to test patch_submodule()
++import os
++import os as renamed_os
++from os import path
++from os import path as renamed_path
++from os.path import join
++from os.path import join as renamed_join
++
++
++open = open  # we just need to have a builtin inside this module to test it properly
diff --git a/misc/py-datasets/files/patch-tests_conftest.py b/misc/py-datasets/files/patch-tests_conftest.py
new file mode 100644
index 000000000000..248e9b692e63
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_conftest.py
@@ -0,0 +1,68 @@
+-- This patch adds tests/conftest.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/conftest.py
+@@ -0,0 +1,62 @@
++import pytest
++
++import datasets
++import datasets.config
++
++
++# Import fixture modules as plugins
++pytest_plugins = ["tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"]
++
++
++def pytest_collection_modifyitems(config, items):
++    # Mark tests as "unit" by default if not marked as "integration" (or already marked as "unit")
++    for item in items:
++        if any(marker in item.keywords for marker in ["integration", "unit"]):
++            continue
++        item.add_marker(pytest.mark.unit)
++
++
++@pytest.fixture(autouse=True)
++def set_test_cache_config(tmp_path_factory, monkeypatch):
++    # test_hf_cache_home = tmp_path_factory.mktemp("cache")  # TODO: why a cache dir per test function does not work?
++    test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache"
++    test_hf_datasets_cache = test_hf_cache_home / "datasets"
++    monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache))
++    test_downloaded_datasets_path = test_hf_datasets_cache / "downloads"
++    monkeypatch.setattr("datasets.config.DOWNLOADED_DATASETS_PATH", str(test_downloaded_datasets_path))
++    test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted"
++    monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path))
++
++    # used in dataset viewer, we may set it to true by default in the future
++    monkeypatch.setattr("datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS", True)
++
++
++@pytest.fixture(autouse=True)
++def disable_implicit_token(monkeypatch):
++    monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", True)
++
++
++@pytest.fixture(autouse=True, scope="session")
++def disable_tqdm_output():
++    datasets.disable_progress_bar()
++
++
++@pytest.fixture(autouse=True)
++def set_update_download_counts_to_false(monkeypatch):
++    # don't take tests into account when counting downloads
++    monkeypatch.setattr("datasets.config.HF_UPDATE_DOWNLOAD_COUNTS", False)
++
++
++@pytest.fixture
++def set_sqlalchemy_silence_uber_warning(monkeypatch):
++    # Required to suppress RemovedIn20Warning when feature(s) are not compatible with SQLAlchemy 2.0
++    # To be removed once SQLAlchemy 2.0 supported
++    try:
++        monkeypatch.setattr("sqlalchemy.util.deprecations.SILENCE_UBER_WARNING", True)
++    except (ModuleNotFoundError, AttributeError):
++        pass
++
++
++@pytest.fixture(autouse=True, scope="session")
++def zero_time_out_for_remote_code():
++    datasets.config.TIME_OUT_REMOTE_CODE = 0
diff --git a/misc/py-datasets/files/patch-tests_fixtures_files.py b/misc/py-datasets/files/patch-tests_fixtures_files.py
new file mode 100644
index 000000000000..7053267f2eaa
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_files.py
@@ -0,0 +1,636 @@
+-- This patch adds tests/fixtures/files.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/files.py
+@@ -0,0 +1,630 @@
++import contextlib
++import csv
++import json
++import os
++import sqlite3
++import tarfile
++import textwrap
++import zipfile
++
++import pandas as pd
++import pyarrow as pa
++import pyarrow.parquet as pq
++import pytest
++
++import datasets
++import datasets.config
++
++
++# dataset + arrow_file
++
++
++@pytest.fixture(scope="session")
++def dataset():
++    n = 10
++    features = datasets.Features(
++        {
++            "tokens": datasets.List(datasets.Value("string")),
++            "labels": datasets.List(datasets.ClassLabel(names=["negative", "positive"])),
++            "answers": {
++                "text": datasets.List(datasets.Value("string")),
++                "answer_start": datasets.List(datasets.Value("int32")),
++            },
++            "id": datasets.Value("int64"),
++        }
++    )
++    dataset = datasets.Dataset.from_dict(
++        {
++            "tokens": [["foo"] * 5] * n,
++            "labels": [[1] * 5] * n,
++            "answers": [{"answer_start": [97], "text": ["1976"]}] * 10,
++            "id": list(range(n)),
++        },
++        features=features,
++    )
++    return dataset
++
++
++@pytest.fixture(scope="session")
++def arrow_file(tmp_path_factory, dataset):
++    filename = str(tmp_path_factory.mktemp("data") / "file.arrow")
++    dataset.map(cache_file_name=filename)
++    return filename
++
++
++# FILE_CONTENT + files
++
++
++FILE_CONTENT = """\
++    Text data.
++    Second line of data."""
++
++
++@pytest.fixture(scope="session")
++def text_file_content():
++    return FILE_CONTENT
++
++
++@pytest.fixture(scope="session")
++def text_file(tmp_path_factory):
++    filename = tmp_path_factory.mktemp("data") / "file.txt"
++    data = FILE_CONTENT
++    with open(filename, "w") as f:
++        f.write(data)
++    return filename
++
++
++@pytest.fixture(scope="session")
++def bz2_file(tmp_path_factory):
++    import bz2
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.bz2"
++    data = bytes(FILE_CONTENT, "utf-8")
++    with bz2.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def gz_file(tmp_path_factory):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "file.txt.gz")
++    data = bytes(FILE_CONTENT, "utf-8")
++    with gzip.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def lz4_file(tmp_path_factory):
++    if datasets.config.LZ4_AVAILABLE:
++        import lz4.frame
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.lz4"
++        data = bytes(FILE_CONTENT, "utf-8")
++        with lz4.frame.open(path, "wb") as f:
++            f.write(data)
++        return path
++
++
++@pytest.fixture(scope="session")
++def seven_zip_file(tmp_path_factory, text_file):
++    if datasets.config.PY7ZR_AVAILABLE:
++        import py7zr
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.7z"
++        with py7zr.SevenZipFile(path, "w") as archive:
++            archive.write(text_file, arcname=os.path.basename(text_file))
++        return path
++
++
++@pytest.fixture(scope="session")
++def tar_file(tmp_path_factory, text_file):
++    import tarfile
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(text_file, arcname=os.path.basename(text_file))
++    return path
++
++
++@pytest.fixture(scope="session")
++def xz_file(tmp_path_factory):
++    import lzma
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
++    data = bytes(FILE_CONTENT, "utf-8")
++    with lzma.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_file(tmp_path_factory, text_file):
++    import zipfile
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_file, arcname=os.path.basename(text_file))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zstd_file(tmp_path_factory):
++    if datasets.config.ZSTANDARD_AVAILABLE:
++        import zstandard as zstd
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.zst"
++        data = bytes(FILE_CONTENT, "utf-8")
++        with zstd.open(path, "wb") as f:
++            f.write(data)
++        return path
++
++
++# xml_file
++
++
++@pytest.fixture(scope="session")
++def xml_file(tmp_path_factory):
++    filename = tmp_path_factory.mktemp("data") / "file.xml"
++    data = textwrap.dedent(
++        """\
++    <?xml version="1.0" encoding="UTF-8" ?>
++    <tmx version="1.4">
++      <header segtype="sentence" srclang="ca" />
++      <body>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 1</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 1</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 2</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 2</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 3</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 3</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 4</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 4</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 5</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 5</seg></tuv>
++        </tu>
++      </body>
++    </tmx>"""
++    )
++    with open(filename, "w") as f:
++        f.write(data)
++    return filename
++
++
++DATA = [
++    {"col_1": "0", "col_2": 0, "col_3": 0.0},
++    {"col_1": "1", "col_2": 1, "col_3": 1.0},
++    {"col_1": "2", "col_2": 2, "col_3": 2.0},
++    {"col_1": "3", "col_2": 3, "col_3": 3.0},
++]
++DATA2 = [
++    {"col_1": "4", "col_2": 4, "col_3": 4.0},
++    {"col_1": "5", "col_2": 5, "col_3": 5.0},
++]
++DATA_DICT_OF_LISTS = {
++    "col_1": ["0", "1", "2", "3"],
++    "col_2": [0, 1, 2, 3],
++    "col_3": [0.0, 1.0, 2.0, 3.0],
++}
++
++DATA_312 = [
++    {"col_3": 0.0, "col_1": "0", "col_2": 0},
++    {"col_3": 1.0, "col_1": "1", "col_2": 1},
++]
++
++DATA_STR = [
++    {"col_1": "s0", "col_2": 0, "col_3": 0.0},
++    {"col_1": "s1", "col_2": 1, "col_3": 1.0},
++    {"col_1": "s2", "col_2": 2, "col_3": 2.0},
++    {"col_1": "s3", "col_2": 3, "col_3": 3.0},
++]
++
++DATA_MISSING_FIELDS = [
++    {"col_1": 1, "col_2": 2},
++    {"col_1": 1, "col_3": 3},
++]
++
++DATA_MIXED_TYPES = [
++    {"col_1": 1, "col_2": {"a": "a"}, "col_3": [{"x": "x"}]},
++    {"col_1": "one", "col_2": {"b": "b"}, "col_3": [{"y": "y"}]},
++    {"col_1": None, "col_2": None, "col_3": [None]},
++]
++
++
++@pytest.fixture(scope="session")
++def dataset_dict():
++    return DATA_DICT_OF_LISTS
++
++
++@pytest.fixture(scope="session")
++def arrow_path(tmp_path_factory):
++    dataset = datasets.Dataset.from_dict(DATA_DICT_OF_LISTS)
++    path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
++    dataset.map(cache_file_name=path)
++    return path
++
++
++@pytest.fixture(scope="session")
++def sqlite_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.sqlite")
++    with contextlib.closing(sqlite3.connect(path)) as con:
++        cur = con.cursor()
++        cur.execute("CREATE TABLE dataset(col_1 text, col_2 int, col_3 real)")
++        for item in DATA:
++            cur.execute("INSERT INTO dataset(col_1, col_2, col_3) VALUES (?, ?, ?)", tuple(item.values()))
++        con.commit()
++    return path
++
++
++@pytest.fixture(scope="session")
++def csv_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
++    with open(path, "w", newline="") as f:
++        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
++        writer.writeheader()
++        for item in DATA:
++            writer.writerow(item)
++    return path
++
++
++@pytest.fixture(scope="session")
++def csv2_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.csv")
++    with open(path, "w", newline="") as f:
++        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
++        writer.writeheader()
++        for item in DATA:
++            writer.writerow(item)
++    return path
++
++
++@pytest.fixture(scope="session")
++def bz2_csv_path(csv_path, tmp_path_factory):
++    import bz2
++
++    path = tmp_path_factory.mktemp("data") / "dataset.csv.bz2"
++    with open(csv_path, "rb") as f:
++        data = f.read()
++    # data = bytes(FILE_CONTENT, "utf-8")
++    with bz2.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.basename(csv_path))
++        f.write(csv2_path, arcname=os.path.basename(csv2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_uppercase_csv_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.basename(csv_path.replace(".csv", ".CSV")))
++        f.write(csv2_path, arcname=os.path.basename(csv2_path.replace(".csv", ".CSV")))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.csv.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.join("main_dir", os.path.basename(csv_path)))
++        f.write(csv2_path, arcname=os.path.join("main_dir", os.path.basename(csv2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def parquet_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.parquet")
++    schema = pa.schema(
++        {
++            "col_1": pa.string(),
++            "col_2": pa.int64(),
++            "col_3": pa.float64(),
++        }
++    )
++    with open(path, "wb") as f:
++        writer = pq.ParquetWriter(f, schema=schema)
++        pa_table = pa.Table.from_pydict({k: [DATA[i][k] for i in range(len(DATA))] for k in DATA[0]}, schema=schema)
++        writer.write_table(pa_table)
++        writer.close()
++    return path
++
++
++@pytest.fixture(scope="session")
++def geoparquet_path(tmp_path_factory):
++    df = pd.read_parquet(path="https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet")
++    path = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet")
++    df.to_parquet(path=path)
++    return path
++
++
++@pytest.fixture(scope="session")
++def json_list_of_dicts_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
++    data = {"data": DATA}
++    with open(path, "w") as f:
++        json.dump(data, f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def json_dict_of_lists_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
++    data = {"data": DATA_DICT_OF_LISTS}
++    with open(path, "w") as f:
++        json.dump(data, f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
++    with open(path, "w") as f:
++        for item in DATA:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl2_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.jsonl")
++    with open(path, "w") as f:
++        for item in DATA:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_312_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_312:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_str_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_STR:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_missing_fields_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-missing-fields.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_MISSING_FIELDS:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_mixed_types_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-mixed-types.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_MIXED_TYPES:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_gz_path(tmp_path_factory, text_path):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "dataset.txt.gz")
++    with open(text_path, "rb") as orig_file:
++        with gzip.open(path, "wb") as zipped_file:
++            zipped_file.writelines(orig_file)
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_gz_path(tmp_path_factory, jsonl_path):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
++    with open(jsonl_path, "rb") as orig_file:
++        with gzip.open(path, "wb") as zipped_file:
++            zipped_file.writelines(orig_file)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(jsonl_path, arcname=os.path.basename(jsonl_path))
++        f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_nested_jsonl_path(zip_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(zip_jsonl_path, arcname=os.path.join("nested", os.path.basename(zip_jsonl_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(jsonl_path, arcname=os.path.join("main_dir", os.path.basename(jsonl_path)))
++        f.write(jsonl2_path, arcname=os.path.join("main_dir", os.path.basename(jsonl2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(jsonl_path, arcname=os.path.basename(jsonl_path))
++        f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(tar_jsonl_path, arcname=os.path.join("nested", os.path.basename(tar_jsonl_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_path(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = str(tmp_path_factory.mktemp("data") / "dataset.txt")
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text2_path(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.txt")
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_dir(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = tmp_path_factory.mktemp("data_text_dir") / "dataset.txt"
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path.parent
++
++
++@pytest.fixture(scope="session")
++def text_dir_with_unsupported_extension(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = tmp_path_factory.mktemp("data") / "dataset.abc"
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_text_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.text.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.basename(text_path))
++        f.write(text2_path, arcname=os.path.basename(text2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_text_with_dir_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.text.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.join("main_dir", os.path.basename(text_path)))
++        f.write(text2_path, arcname=os.path.join("main_dir", os.path.basename(text2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_unsupported_ext_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.ext.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.basename("unsupported.ext"))
++        f.write(text2_path, arcname=os.path.basename("unsupported_2.ext"))
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_path_with_unicode_new_lines(tmp_path_factory):
++    text = "\n".join(["First", "Second\u2029with Unicode new line", "Third"])
++    path = str(tmp_path_factory.mktemp("data") / "dataset_with_unicode_new_lines.txt")
++    with open(path, "w", encoding="utf-8") as f:
++        f.write(text)
++    return path
++
++
++@pytest.fixture(scope="session")
++def image_file():
++    return os.path.join("tests", "features", "data", "test_image_rgb.jpg")
++
++
++@pytest.fixture(scope="session")
++def audio_file():
++    return os.path.join("tests", "features", "data", "test_audio_44100.wav")
++
++
++@pytest.fixture(scope="session")
++def audio_file_44100():
++    return os.path.join("tests", "features", "data", "test_audio_44100.mp3")
++
++
++@pytest.fixture(scope="session")
++def audio_file_16000():
++    return os.path.join("tests", "features", "data", "test_audio_16000.mp3")
++
++
++@pytest.fixture(scope="session")
++def tensor_file(tmp_path_factory):
++    import torch
++
++    path = tmp_path_factory.mktemp("data") / "tensor.pth"
++    with open(path, "wb") as f:
++        torch.save(torch.ones(128), f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_image_path(image_file, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.img.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(image_file, arcname=os.path.basename(image_file))
++        f.write(image_file, arcname=os.path.basename(image_file).replace(".jpg", "2.jpg"))
++    return path
++
++
++@pytest.fixture(scope="session")
++def data_dir_with_hidden_files(tmp_path_factory):
++    data_dir = tmp_path_factory.mktemp("data_dir")
++
++    (data_dir / "subdir").mkdir()
++    with open(data_dir / "subdir" / "train.txt", "w") as f:
++        f.write("foo\n" * 10)
++    with open(data_dir / "subdir" / "test.txt", "w") as f:
++        f.write("bar\n" * 10)
++    # hidden file
++    with open(data_dir / "subdir" / ".test.txt", "w") as f:
++        f.write("bar\n" * 10)
++
++    # hidden directory
++    (data_dir / ".subdir").mkdir()
++    with open(data_dir / ".subdir" / "train.txt", "w") as f:
++        f.write("foo\n" * 10)
++    with open(data_dir / ".subdir" / "test.txt", "w") as f:
++        f.write("bar\n" * 10)
++
++    return data_dir
diff --git a/misc/py-datasets/files/patch-tests_fixtures_fsspec.py b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py
new file mode 100644
index 000000000000..311541e7a5dd
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py
@@ -0,0 +1,119 @@
+-- This patch adds tests/fixtures/fsspec.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/fsspec.py
+@@ -0,0 +1,113 @@
++import posixpath
++from pathlib import Path
++from unittest.mock import patch
++
++import pytest
++from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem, stringify_path
++from fsspec.registry import _registry as _fsspec_registry
++
++
++class MockFileSystem(AbstractFileSystem):
++    protocol = "mock"
++
++    def __init__(self, *args, local_root_dir, **kwargs):
++        super().__init__()
++        self._fs = LocalFileSystem(*args, **kwargs)
++        self.local_root_dir = Path(local_root_dir).resolve().as_posix() + "/"
++
++    def mkdir(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.mkdir(path, *args, **kwargs)
++
++    def makedirs(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.makedirs(path, *args, **kwargs)
++
++    def rmdir(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rmdir(path)
++
++    def ls(self, path, detail=True, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        out = self._fs.ls(path, detail=detail, *args, **kwargs)
++        if detail:
++            return [{**info, "name": info["name"][len(self.local_root_dir) :]} for info in out]
++        else:
++            return [name[len(self.local_root_dir) :] for name in out]
++
++    def info(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        out = dict(self._fs.info(path, *args, **kwargs))
++        out["name"] = out["name"][len(self.local_root_dir) :]
++        return out
++
++    def cp_file(self, path1, path2, *args, **kwargs):
++        path1 = posixpath.join(self.local_root_dir, self._strip_protocol(path1))
++        path2 = posixpath.join(self.local_root_dir, self._strip_protocol(path2))
++        return self._fs.cp_file(path1, path2, *args, **kwargs)
++
++    def rm_file(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rm_file(path, *args, **kwargs)
++
++    def rm(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rm(path, *args, **kwargs)
++
++    def _open(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs._open(path, *args, **kwargs)
++
++    def created(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.created(path)
++
++    def modified(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.modified(path)
++
++    @classmethod
++    def _strip_protocol(cls, path):
++        path = stringify_path(path)
++        if path.startswith("mock://"):
++            path = path[7:]
++        return path
++
++
++class TmpDirFileSystem(MockFileSystem):
++    protocol = "tmp"
++    tmp_dir = None
++
++    def __init__(self, *args, **kwargs):
++        assert self.tmp_dir is not None, "TmpDirFileSystem.tmp_dir is not set"
++        super().__init__(*args, **kwargs, local_root_dir=self.tmp_dir, auto_mkdir=True)
++
++    @classmethod
++    def _strip_protocol(cls, path):
++        path = stringify_path(path)
++        if path.startswith("tmp://"):
++            path = path[6:]
++        return path
++
++
++@pytest.fixture
++def mock_fsspec():
++    _fsspec_registry["mock"] = MockFileSystem
++    _fsspec_registry["tmp"] = TmpDirFileSystem
++    yield
++    del _fsspec_registry["mock"]
++    del _fsspec_registry["tmp"]
++
++
++@pytest.fixture
++def mockfs(tmp_path_factory, mock_fsspec):
++    local_fs_dir = tmp_path_factory.mktemp("mockfs")
++    return MockFileSystem(local_root_dir=local_fs_dir, auto_mkdir=True)
++
++
++@pytest.fixture
++def tmpfs(tmp_path_factory, mock_fsspec):
++    tmp_fs_dir = tmp_path_factory.mktemp("tmpfs")
++    with patch.object(TmpDirFileSystem, "tmp_dir", tmp_fs_dir):
++        yield TmpDirFileSystem()
++    TmpDirFileSystem.clear_instance_cache()
diff --git a/misc/py-datasets/files/patch-tests_fixtures_hub.py b/misc/py-datasets/files/patch-tests_fixtures_hub.py
new file mode 100644
index 000000000000..771dd0d56344
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_hub.py
@@ -0,0 +1,235 @@
+-- This patch adds tests/fixtures/hub.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/hub.py
+@@ -0,0 +1,229 @@
++import os
++import time
++import uuid
++from contextlib import contextmanager
++from typing import Optional
++

*** 855 LINES SKIPPED ***
