Date:      Wed, 29 Apr 2026 15:28:54 +0000
From:      Yuri Victorovich <yuri@FreeBSD.org>
To:        ports-committers@FreeBSD.org, dev-commits-ports-all@FreeBSD.org, dev-commits-ports-main@FreeBSD.org
Subject:   git: c2dc4b4d9965 - main - misc/py-datasets: update 4.8.2 → 4.8.5
Message-ID:  <69f223b6.1899d.6091db79@gitrepo.freebsd.org>


The branch main has been updated by yuri:

URL: https://cgit.FreeBSD.org/ports/commit/?id=c2dc4b4d99654cdca038efd04e5f8e49766a6459

commit c2dc4b4d99654cdca038efd04e5f8e49766a6459
Author:     Yuri Victorovich <yuri@FreeBSD.org>
AuthorDate: 2026-04-29 09:39:07 +0000
Commit:     Yuri Victorovich <yuri@FreeBSD.org>
CommitDate: 2026-04-29 15:28:46 +0000

    misc/py-datasets: update 4.8.2 → 4.8.5
---
 misc/py-datasets/Makefile                          |  36 +-
 misc/py-datasets/distinfo                          |   6 +-
 .../files/patch-tests__test_patching.py            |  17 +
 misc/py-datasets/files/patch-tests_conftest.py     |  68 +++
 .../files/patch-tests_fixtures_files.py            | 636 +++++++++++++++++++++
 .../files/patch-tests_fixtures_fsspec.py           | 119 ++++
 misc/py-datasets/files/patch-tests_fixtures_hub.py | 235 ++++++++
 misc/py-datasets/files/patch-tests_utils.py        | 626 ++++++++++++++++++++
 8 files changed, 1729 insertions(+), 14 deletions(-)

diff --git a/misc/py-datasets/Makefile b/misc/py-datasets/Makefile
index 774f71a63205..b9356b8a6908 100644
--- a/misc/py-datasets/Makefile
+++ b/misc/py-datasets/Makefile
@@ -1,13 +1,13 @@
 PORTNAME=	datasets
-DISTVERSION=	4.8.2
-PORTREVISION=	1
+DISTVERSION=	4.8.5
 CATEGORIES=	misc python # machine-learning
 MASTER_SITES=	PYPI
 PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
 
 MAINTAINER=	yuri@FreeBSD.org
 COMMENT=	HuggingFace community-driven open-source library of datasets
-WWW=		https://huggingface.co/docs/datasets/index
+WWW=		https://huggingface.co/docs/datasets/index \
+		https://github.com/huggingface/datasets
 
 LICENSE=	MIT
 LICENSE_FILE=	${WRKSRC}/LICENSE
@@ -30,11 +30,9 @@ RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}aiohttp>0:www/py-aiohttp@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}requests>=2.32.2:www/py-requests@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}tqdm>=4.66.3:misc/py-tqdm@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}xxhash>0:devel/py-xxhash@${PY_FLAVOR}
-RUN_DEPENDS_AUDIO= \
-		${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \
+RUN_DEPENDS_AUDIO=	${PYTHON_PKGNAMEPREFIX}torchcodec>=0.6.0:multimedia/py-torchcodec@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}pytorch>=2.8.0:misc/py-pytorch@${PY_FLAVOR}
-RUN_DEPENDS_VISION= \
-		${PY_PILLOW}
+RUN_DEPENDS_VISION=	${PY_PILLOW}
 RUN_DEPENDS+=	${RUN_DEPENDS_AUDIO} \
 		${RUN_DEPENDS_VISION}
 TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \
@@ -55,16 +53,32 @@ TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}tiktoken>=0:textproc/py-tiktoken@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}typing-extensions>=4.6.1:devel/py-typing-extensions@${PY_FLAVOR} \
 		${PYTHON_PKGNAMEPREFIX}zstandard>=0:archivers/py-zstandard@${PY_FLAVOR}
-# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow
+# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow, transformers
 
 USES=		python
 USE_PYTHON=	pep517 concurrent autoplist pytest
 
-TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}
+TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR} \
+		DATASETS_TEST_SKIP_TF=yes
+PYTEST_ARGS=	tests/
 
 NO_ARCH=	yes
 
-pre-test: # prevent failure due to missing pyspark
-	@${RM} ${WRKSRC}/tests/packaged_modules/test_spark.py
+pre-patch:
+	@${MKDIR} ${WRKSRC}/tests/fixtures
+	@${TOUCH} ${WRKSRC}/tests/__init__.py ${WRKSRC}/tests/fixtures/__init__.py
+
+pre-test: # skip tests requiring unavailable dependencies/data
+	@${RM} ${WRKSRC}/tests/test_fingerprint_tokenizer_stability.py
+	@${RM} ${WRKSRC}/tests/test_formatting.py
+	@${RM} ${WRKSRC}/tests/test_load.py
+	@${RM} ${WRKSRC}/tests/test_distributed.py
+	@${RM} ${WRKSRC}/tests/test_search.py
+	@${RM} ${WRKSRC}/tests/test_table.py
+	@${RM} ${WRKSRC}/tests/test_builder.py
+	@${RM} ${WRKSRC}/tests/test_parallel.py
+	@${RM} ${WRKSRC}/tests/test_iterable_dataset.py
+	@${RM} ${WRKSRC}/tests/test_upstream_hub.py
+	@${RM} ${WRKSRC}/tests/test_fingerprint.py
 
 .include <bsd.port.mk>
diff --git a/misc/py-datasets/distinfo b/misc/py-datasets/distinfo
index 612ac838107b..19c878e8b494 100644
--- a/misc/py-datasets/distinfo
+++ b/misc/py-datasets/distinfo
@@ -1,3 +1,3 @@
-TIMESTAMP = 1773758107
-SHA256 (datasets-4.8.2.tar.gz) = c6ad7e6c28c7436a9c6c23f817d1a450d395c771df881252dfe63697297cbcdf
-SIZE (datasets-4.8.2.tar.gz) = 603879
+TIMESTAMP = 1777403895
+SHA256 (datasets-4.8.5.tar.gz) = 0f0c1c3d56ffff2c93b2f4c63c95bac94f3d7e8621aea2a2a576275233bba772
+SIZE (datasets-4.8.5.tar.gz) = 605649
diff --git a/misc/py-datasets/files/patch-tests__test_patching.py b/misc/py-datasets/files/patch-tests__test_patching.py
new file mode 100644
index 000000000000..6beda41ed21b
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests__test_patching.py
@@ -0,0 +1,17 @@
+-- This patch adds tests/_test_patching.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/_test_patching.py
+@@ -0,0 +1,11 @@
++# ruff: noqa: F401
++# This is the module that test_patching.py uses to test patch_submodule()
++import os
++import os as renamed_os
++from os import path
++from os import path as renamed_path
++from os.path import join
++from os.path import join as renamed_join
++
++
++open = open  # we just need to have a builtin inside this module to test it properly
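
For illustration, here is a minimal sketch of how tests/test_patching.py
consumes this helper module; it assumes patch_submodule() from
datasets.utils.patching keeps its upstream behaviour of replacing every
attribute of the module that refers to the patch target:

    import os

    from datasets.utils.patching import patch_submodule
    from tests import _test_patching

    mock = "__test_patch_submodule_mock__"
    with patch_submodule(_test_patching, "os", mock):
        # both the plain import and the aliased import now point at the mock
        assert _test_patching.os is mock
        assert _test_patching.renamed_os is mock
    # leaving the context manager restores the real module
    assert _test_patching.os is os
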
diff --git a/misc/py-datasets/files/patch-tests_conftest.py b/misc/py-datasets/files/patch-tests_conftest.py
new file mode 100644
index 000000000000..248e9b692e63
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_conftest.py
@@ -0,0 +1,68 @@
+-- This patch adds tests/conftest.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/conftest.py
+@@ -0,0 +1,62 @@
++import pytest
++
++import datasets
++import datasets.config
++
++
++# Import fixture modules as plugins
++pytest_plugins = ["tests.fixtures.files", "tests.fixtures.hub", "tests.fixtures.fsspec"]
++
++
++def pytest_collection_modifyitems(config, items):
++    # Mark tests as "unit" by default if not marked as "integration" (or already marked as "unit")
++    for item in items:
++        if any(marker in item.keywords for marker in ["integration", "unit"]):
++            continue
++        item.add_marker(pytest.mark.unit)
++
++
++@pytest.fixture(autouse=True)
++def set_test_cache_config(tmp_path_factory, monkeypatch):
++    # test_hf_cache_home = tmp_path_factory.mktemp("cache")  # TODO: why a cache dir per test function does not work?
++    test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache"
++    test_hf_datasets_cache = test_hf_cache_home / "datasets"
++    monkeypatch.setattr("datasets.config.HF_DATASETS_CACHE", str(test_hf_datasets_cache))
++    test_downloaded_datasets_path = test_hf_datasets_cache / "downloads"
++    monkeypatch.setattr("datasets.config.DOWNLOADED_DATASETS_PATH", str(test_downloaded_datasets_path))
++    test_extracted_datasets_path = test_hf_datasets_cache / "downloads" / "extracted"
++    monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(test_extracted_datasets_path))
++
++    # used in dataset viewer, we may set it to true by default in the future
++    monkeypatch.setattr("datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS", True)
++
++
++@pytest.fixture(autouse=True)
++def disable_implicit_token(monkeypatch):
++    monkeypatch.setattr("huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN", True)
++
++
++@pytest.fixture(autouse=True, scope="session")
++def disable_tqdm_output():
++    datasets.disable_progress_bar()
++
++
++@pytest.fixture(autouse=True)
++def set_update_download_counts_to_false(monkeypatch):
++    # don't take tests into account when counting downloads
++    monkeypatch.setattr("datasets.config.HF_UPDATE_DOWNLOAD_COUNTS", False)
++
++
++@pytest.fixture
++def set_sqlalchemy_silence_uber_warning(monkeypatch):
++    # Required to suppress RemovedIn20Warning when feature(s) are not compatible with SQLAlchemy 2.0
++    # To be removed once SQLAlchemy 2.0 supported
++    try:
++        monkeypatch.setattr("sqlalchemy.util.deprecations.SILENCE_UBER_WARNING", True)
++    except (ModuleNotFoundError, AttributeError):
++        pass
++
++
++@pytest.fixture(autouse=True, scope="session")
++def zero_time_out_for_remote_code():
++    datasets.config.TIME_OUT_REMOTE_CODE = 0
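
A hypothetical test module illustrating the pytest_collection_modifyitems hook
above: tests with no explicit marker are collected as "unit", while tests
already marked "integration" are left untouched (both marker names are assumed
to be registered in the project's pytest configuration):

    import pytest

    def test_pure_computation():
        # unmarked, so conftest.py adds pytest.mark.unit at collection time
        assert 2 + 2 == 4

    @pytest.mark.integration
    def test_requires_network():
        # already marked, so the hook leaves it alone
        ...

Running pytest -m unit then deselects the integration test.
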
diff --git a/misc/py-datasets/files/patch-tests_fixtures_files.py b/misc/py-datasets/files/patch-tests_fixtures_files.py
new file mode 100644
index 000000000000..7053267f2eaa
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_files.py
@@ -0,0 +1,636 @@
+-- This patch adds tests/fixtures/files.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/files.py
+@@ -0,0 +1,630 @@
++import contextlib
++import csv
++import json
++import os
++import sqlite3
++import tarfile
++import textwrap
++import zipfile
++
++import pandas as pd
++import pyarrow as pa
++import pyarrow.parquet as pq
++import pytest
++
++import datasets
++import datasets.config
++
++
++# dataset + arrow_file
++
++
++@pytest.fixture(scope="session")
++def dataset():
++    n = 10
++    features = datasets.Features(
++        {
++            "tokens": datasets.List(datasets.Value("string")),
++            "labels": datasets.List(datasets.ClassLabel(names=["negative", "positive"])),
++            "answers": {
++                "text": datasets.List(datasets.Value("string")),
++                "answer_start": datasets.List(datasets.Value("int32")),
++            },
++            "id": datasets.Value("int64"),
++        }
++    )
++    dataset = datasets.Dataset.from_dict(
++        {
++            "tokens": [["foo"] * 5] * n,
++            "labels": [[1] * 5] * n,
++            "answers": [{"answer_start": [97], "text": ["1976"]}] * 10,
++            "id": list(range(n)),
++        },
++        features=features,
++    )
++    return dataset
++
++
++@pytest.fixture(scope="session")
++def arrow_file(tmp_path_factory, dataset):
++    filename = str(tmp_path_factory.mktemp("data") / "file.arrow")
++    dataset.map(cache_file_name=filename)
++    return filename
++
++
++# FILE_CONTENT + files
++
++
++FILE_CONTENT = """\
++    Text data.
++    Second line of data."""
++
++
++@pytest.fixture(scope="session")
++def text_file_content():
++    return FILE_CONTENT
++
++
++@pytest.fixture(scope="session")
++def text_file(tmp_path_factory):
++    filename = tmp_path_factory.mktemp("data") / "file.txt"
++    data = FILE_CONTENT
++    with open(filename, "w") as f:
++        f.write(data)
++    return filename
++
++
++@pytest.fixture(scope="session")
++def bz2_file(tmp_path_factory):
++    import bz2
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.bz2"
++    data = bytes(FILE_CONTENT, "utf-8")
++    with bz2.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def gz_file(tmp_path_factory):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "file.txt.gz")
++    data = bytes(FILE_CONTENT, "utf-8")
++    with gzip.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def lz4_file(tmp_path_factory):
++    if datasets.config.LZ4_AVAILABLE:
++        import lz4.frame
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.lz4"
++        data = bytes(FILE_CONTENT, "utf-8")
++        with lz4.frame.open(path, "wb") as f:
++            f.write(data)
++        return path
++
++
++@pytest.fixture(scope="session")
++def seven_zip_file(tmp_path_factory, text_file):
++    if datasets.config.PY7ZR_AVAILABLE:
++        import py7zr
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.7z"
++        with py7zr.SevenZipFile(path, "w") as archive:
++            archive.write(text_file, arcname=os.path.basename(text_file))
++        return path
++
++
++@pytest.fixture(scope="session")
++def tar_file(tmp_path_factory, text_file):
++    import tarfile
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(text_file, arcname=os.path.basename(text_file))
++    return path
++
++
++@pytest.fixture(scope="session")
++def xz_file(tmp_path_factory):
++    import lzma
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
++    data = bytes(FILE_CONTENT, "utf-8")
++    with lzma.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_file(tmp_path_factory, text_file):
++    import zipfile
++
++    path = tmp_path_factory.mktemp("data") / "file.txt.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_file, arcname=os.path.basename(text_file))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zstd_file(tmp_path_factory):
++    if datasets.config.ZSTANDARD_AVAILABLE:
++        import zstandard as zstd
++
++        path = tmp_path_factory.mktemp("data") / "file.txt.zst"
++        data = bytes(FILE_CONTENT, "utf-8")
++        with zstd.open(path, "wb") as f:
++            f.write(data)
++        return path
++
++
++# xml_file
++
++
++@pytest.fixture(scope="session")
++def xml_file(tmp_path_factory):
++    filename = tmp_path_factory.mktemp("data") / "file.xml"
++    data = textwrap.dedent(
++        """\
++    <?xml version="1.0" encoding="UTF-8" ?>
++    <tmx version="1.4">
++      <header segtype="sentence" srclang="ca" />
++      <body>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 1</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 1</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 2</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 2</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 3</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 3</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 4</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 4</seg></tuv>
++        </tu>
++        <tu>
++          <tuv xml:lang="ca"><seg>Contingut 5</seg></tuv>
++          <tuv xml:lang="en"><seg>Content 5</seg></tuv>
++        </tu>
++      </body>
++    </tmx>"""
++    )
++    with open(filename, "w") as f:
++        f.write(data)
++    return filename
++
++
++DATA = [
++    {"col_1": "0", "col_2": 0, "col_3": 0.0},
++    {"col_1": "1", "col_2": 1, "col_3": 1.0},
++    {"col_1": "2", "col_2": 2, "col_3": 2.0},
++    {"col_1": "3", "col_2": 3, "col_3": 3.0},
++]
++DATA2 = [
++    {"col_1": "4", "col_2": 4, "col_3": 4.0},
++    {"col_1": "5", "col_2": 5, "col_3": 5.0},
++]
++DATA_DICT_OF_LISTS = {
++    "col_1": ["0", "1", "2", "3"],
++    "col_2": [0, 1, 2, 3],
++    "col_3": [0.0, 1.0, 2.0, 3.0],
++}
++
++DATA_312 = [
++    {"col_3": 0.0, "col_1": "0", "col_2": 0},
++    {"col_3": 1.0, "col_1": "1", "col_2": 1},
++]
++
++DATA_STR = [
++    {"col_1": "s0", "col_2": 0, "col_3": 0.0},
++    {"col_1": "s1", "col_2": 1, "col_3": 1.0},
++    {"col_1": "s2", "col_2": 2, "col_3": 2.0},
++    {"col_1": "s3", "col_2": 3, "col_3": 3.0},
++]
++
++DATA_MISSING_FIELDS = [
++    {"col_1": 1, "col_2": 2},
++    {"col_1": 1, "col_3": 3},
++]
++
++DATA_MIXED_TYPES = [
++    {"col_1": 1, "col_2": {"a": "a"}, "col_3": [{"x": "x"}]},
++    {"col_1": "one", "col_2": {"b": "b"}, "col_3": [{"y": "y"}]},
++    {"col_1": None, "col_2": None, "col_3": [None]},
++]
++
++
++@pytest.fixture(scope="session")
++def dataset_dict():
++    return DATA_DICT_OF_LISTS
++
++
++@pytest.fixture(scope="session")
++def arrow_path(tmp_path_factory):
++    dataset = datasets.Dataset.from_dict(DATA_DICT_OF_LISTS)
++    path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
++    dataset.map(cache_file_name=path)
++    return path
++
++
++@pytest.fixture(scope="session")
++def sqlite_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.sqlite")
++    with contextlib.closing(sqlite3.connect(path)) as con:
++        cur = con.cursor()
++        cur.execute("CREATE TABLE dataset(col_1 text, col_2 int, col_3 real)")
++        for item in DATA:
++            cur.execute("INSERT INTO dataset(col_1, col_2, col_3) VALUES (?, ?, ?)", tuple(item.values()))
++        con.commit()
++    return path
++
++
++@pytest.fixture(scope="session")
++def csv_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
++    with open(path, "w", newline="") as f:
++        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
++        writer.writeheader()
++        for item in DATA:
++            writer.writerow(item)
++    return path
++
++
++@pytest.fixture(scope="session")
++def csv2_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.csv")
++    with open(path, "w", newline="") as f:
++        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
++        writer.writeheader()
++        for item in DATA:
++            writer.writerow(item)
++    return path
++
++
++@pytest.fixture(scope="session")
++def bz2_csv_path(csv_path, tmp_path_factory):
++    import bz2
++
++    path = tmp_path_factory.mktemp("data") / "dataset.csv.bz2"
++    with open(csv_path, "rb") as f:
++        data = f.read()
++    # data = bytes(FILE_CONTENT, "utf-8")
++    with bz2.open(path, "wb") as f:
++        f.write(data)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("zip_csv_path") / "csv-dataset.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.basename(csv_path))
++        f.write(csv2_path, arcname=os.path.basename(csv2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_uppercase_csv_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.basename(csv_path.replace(".csv", ".CSV")))
++        f.write(csv2_path, arcname=os.path.basename(csv2_path.replace(".csv", ".CSV")))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.csv.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(csv_path, arcname=os.path.join("main_dir", os.path.basename(csv_path)))
++        f.write(csv2_path, arcname=os.path.join("main_dir", os.path.basename(csv2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def parquet_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.parquet")
++    schema = pa.schema(
++        {
++            "col_1": pa.string(),
++            "col_2": pa.int64(),
++            "col_3": pa.float64(),
++        }
++    )
++    with open(path, "wb") as f:
++        writer = pq.ParquetWriter(f, schema=schema)
++        pa_table = pa.Table.from_pydict({k: [DATA[i][k] for i in range(len(DATA))] for k in DATA[0]}, schema=schema)
++        writer.write_table(pa_table)
++        writer.close()
++    return path
++
++
++@pytest.fixture(scope="session")
++def geoparquet_path(tmp_path_factory):
++    df = pd.read_parquet(path="https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet")
++    path = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet")
++    df.to_parquet(path=path)
++    return path
++
++
++@pytest.fixture(scope="session")
++def json_list_of_dicts_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
++    data = {"data": DATA}
++    with open(path, "w") as f:
++        json.dump(data, f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def json_dict_of_lists_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.json")
++    data = {"data": DATA_DICT_OF_LISTS}
++    with open(path, "w") as f:
++        json.dump(data, f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
++    with open(path, "w") as f:
++        for item in DATA:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl2_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.jsonl")
++    with open(path, "w") as f:
++        for item in DATA:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_312_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_312:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_str_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_STR:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_missing_fields_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-missing-fields.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_MISSING_FIELDS:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_mixed_types_path(tmp_path_factory):
++    path = str(tmp_path_factory.mktemp("data") / "dataset-mixed-types.jsonl")
++    with open(path, "w") as f:
++        for item in DATA_MIXED_TYPES:
++            f.write(json.dumps(item) + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_gz_path(tmp_path_factory, text_path):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "dataset.txt.gz")
++    with open(text_path, "rb") as orig_file:
++        with gzip.open(path, "wb") as zipped_file:
++            zipped_file.writelines(orig_file)
++    return path
++
++
++@pytest.fixture(scope="session")
++def jsonl_gz_path(tmp_path_factory, jsonl_path):
++    import gzip
++
++    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
++    with open(jsonl_path, "rb") as orig_file:
++        with gzip.open(path, "wb") as zipped_file:
++            zipped_file.writelines(orig_file)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(jsonl_path, arcname=os.path.basename(jsonl_path))
++        f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_nested_jsonl_path(zip_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(zip_jsonl_path, arcname=os.path.join("nested", os.path.basename(zip_jsonl_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.jsonl.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(jsonl_path, arcname=os.path.join("main_dir", os.path.basename(jsonl_path)))
++        f.write(jsonl2_path, arcname=os.path.join("main_dir", os.path.basename(jsonl2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(jsonl_path, arcname=os.path.basename(jsonl_path))
++        f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.tar"
++    with tarfile.TarFile(path, "w") as f:
++        f.add(tar_jsonl_path, arcname=os.path.join("nested", os.path.basename(tar_jsonl_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_path(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = str(tmp_path_factory.mktemp("data") / "dataset.txt")
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text2_path(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = str(tmp_path_factory.mktemp("data") / "dataset2.txt")
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_dir(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = tmp_path_factory.mktemp("data_text_dir") / "dataset.txt"
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path.parent
++
++
++@pytest.fixture(scope="session")
++def text_dir_with_unsupported_extension(tmp_path_factory):
++    data = ["0", "1", "2", "3"]
++    path = tmp_path_factory.mktemp("data") / "dataset.abc"
++    with open(path, "w") as f:
++        for item in data:
++            f.write(item + "\n")
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_text_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.text.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.basename(text_path))
++        f.write(text2_path, arcname=os.path.basename(text2_path))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_text_with_dir_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset_with_dir.text.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.join("main_dir", os.path.basename(text_path)))
++        f.write(text2_path, arcname=os.path.join("main_dir", os.path.basename(text2_path)))
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_unsupported_ext_path(text_path, text2_path, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.ext.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(text_path, arcname=os.path.basename("unsupported.ext"))
++        f.write(text2_path, arcname=os.path.basename("unsupported_2.ext"))
++    return path
++
++
++@pytest.fixture(scope="session")
++def text_path_with_unicode_new_lines(tmp_path_factory):
++    text = "\n".join(["First", "Second\u2029with Unicode new line", "Third"])
++    path = str(tmp_path_factory.mktemp("data") / "dataset_with_unicode_new_lines.txt")
++    with open(path, "w", encoding="utf-8") as f:
++        f.write(text)
++    return path
++
++
++@pytest.fixture(scope="session")
++def image_file():
++    return os.path.join("tests", "features", "data", "test_image_rgb.jpg")
++
++
++@pytest.fixture(scope="session")
++def audio_file():
++    return os.path.join("tests", "features", "data", "test_audio_44100.wav")
++
++
++@pytest.fixture(scope="session")
++def audio_file_44100():
++    return os.path.join("tests", "features", "data", "test_audio_44100.mp3")
++
++
++@pytest.fixture(scope="session")
++def audio_file_16000():
++    return os.path.join("tests", "features", "data", "test_audio_16000.mp3")
++
++
++@pytest.fixture(scope="session")
++def tensor_file(tmp_path_factory):
++    import torch
++
++    path = tmp_path_factory.mktemp("data") / "tensor.pth"
++    with open(path, "wb") as f:
++        torch.save(torch.ones(128), f)
++    return path
++
++
++@pytest.fixture(scope="session")
++def zip_image_path(image_file, tmp_path_factory):
++    path = tmp_path_factory.mktemp("data") / "dataset.img.zip"
++    with zipfile.ZipFile(path, "w") as f:
++        f.write(image_file, arcname=os.path.basename(image_file))
++        f.write(image_file, arcname=os.path.basename(image_file).replace(".jpg", "2.jpg"))
++    return path
++
++
++@pytest.fixture(scope="session")
++def data_dir_with_hidden_files(tmp_path_factory):
++    data_dir = tmp_path_factory.mktemp("data_dir")
++
++    (data_dir / "subdir").mkdir()
++    with open(data_dir / "subdir" / "train.txt", "w") as f:
++        f.write("foo\n" * 10)
++    with open(data_dir / "subdir" / "test.txt", "w") as f:
++        f.write("bar\n" * 10)
++    # hidden file
++    with open(data_dir / "subdir" / ".test.txt", "w") as f:
++        f.write("bar\n" * 10)
++
++    # hidden directory
++    (data_dir / ".subdir").mkdir()
++    with open(data_dir / ".subdir" / "train.txt", "w") as f:
++        f.write("foo\n" * 10)
++    with open(data_dir / ".subdir" / "test.txt", "w") as f:
++        f.write("bar\n" * 10)
++
++    return data_dir
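
As a usage sketch, a test consuming the csv_path fixture above; it assumes the
stable load_dataset() CSV builder API and the four-row DATA table defined in
this file:

    from datasets import load_dataset

    def test_csv_fixture_roundtrip(csv_path):
        ds = load_dataset("csv", data_files=csv_path, split="train")
        # DATA holds four rows with columns col_1 (str), col_2 (int), col_3 (float)
        assert ds.num_rows == 4
        assert ds.column_names == ["col_1", "col_2", "col_3"]
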
diff --git a/misc/py-datasets/files/patch-tests_fixtures_fsspec.py b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py
new file mode 100644
index 000000000000..311541e7a5dd
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_fsspec.py
@@ -0,0 +1,119 @@
+-- This patch adds tests/fixtures/fsspec.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/fsspec.py
+@@ -0,0 +1,113 @@
++import posixpath
++from pathlib import Path
++from unittest.mock import patch
++
++import pytest
++from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem, stringify_path
++from fsspec.registry import _registry as _fsspec_registry
++
++
++class MockFileSystem(AbstractFileSystem):
++    protocol = "mock"
++
++    def __init__(self, *args, local_root_dir, **kwargs):
++        super().__init__()
++        self._fs = LocalFileSystem(*args, **kwargs)
++        self.local_root_dir = Path(local_root_dir).resolve().as_posix() + "/"
++
++    def mkdir(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.mkdir(path, *args, **kwargs)
++
++    def makedirs(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.makedirs(path, *args, **kwargs)
++
++    def rmdir(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rmdir(path)
++
++    def ls(self, path, detail=True, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        out = self._fs.ls(path, detail=detail, *args, **kwargs)
++        if detail:
++            return [{**info, "name": info["name"][len(self.local_root_dir) :]} for info in out]
++        else:
++            return [name[len(self.local_root_dir) :] for name in out]
++
++    def info(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        out = dict(self._fs.info(path, *args, **kwargs))
++        out["name"] = out["name"][len(self.local_root_dir) :]
++        return out
++
++    def cp_file(self, path1, path2, *args, **kwargs):
++        path1 = posixpath.join(self.local_root_dir, self._strip_protocol(path1))
++        path2 = posixpath.join(self.local_root_dir, self._strip_protocol(path2))
++        return self._fs.cp_file(path1, path2, *args, **kwargs)
++
++    def rm_file(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rm_file(path, *args, **kwargs)
++
++    def rm(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.rm(path, *args, **kwargs)
++
++    def _open(self, path, *args, **kwargs):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs._open(path, *args, **kwargs)
++
++    def created(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.created(path)
++
++    def modified(self, path):
++        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))
++        return self._fs.modified(path)
++
++    @classmethod
++    def _strip_protocol(cls, path):
++        path = stringify_path(path)
++        if path.startswith("mock://"):
++            path = path[7:]
++        return path
++
++
++class TmpDirFileSystem(MockFileSystem):
++    protocol = "tmp"
++    tmp_dir = None
++
++    def __init__(self, *args, **kwargs):
++        assert self.tmp_dir is not None, "TmpDirFileSystem.tmp_dir is not set"
++        super().__init__(*args, **kwargs, local_root_dir=self.tmp_dir, auto_mkdir=True)
++
++    @classmethod
++    def _strip_protocol(cls, path):
++        path = stringify_path(path)
++        if path.startswith("tmp://"):
++            path = path[6:]
++        return path
++
++
++@pytest.fixture
++def mock_fsspec():
++    _fsspec_registry["mock"] = MockFileSystem
++    _fsspec_registry["tmp"] = TmpDirFileSystem
++    yield
++    del _fsspec_registry["mock"]
++    del _fsspec_registry["tmp"]
++
++
++@pytest.fixture
++def mockfs(tmp_path_factory, mock_fsspec):
++    local_fs_dir = tmp_path_factory.mktemp("mockfs")
++    return MockFileSystem(local_root_dir=local_fs_dir, auto_mkdir=True)
++
++
++@pytest.fixture
++def tmpfs(tmp_path_factory, mock_fsspec):
++    tmp_fs_dir = tmp_path_factory.mktemp("tmpfs")
++    with patch.object(TmpDirFileSystem, "tmp_dir", tmp_fs_dir):
++        yield TmpDirFileSystem()
++        TmpDirFileSystem.clear_instance_cache()
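
A minimal sketch of a test using the mockfs fixture; it assumes fsspec's
generic AbstractFileSystem open()/cat_file() helpers, which route through the
_open() override above and therefore resolve paths under the fixture's
temporary local root:

    def test_mockfs_roundtrip(mockfs):
        with mockfs.open("data/hello.txt", mode="w") as f:
            f.write("hello")
        assert mockfs.cat_file("data/hello.txt") == b"hello"
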
diff --git a/misc/py-datasets/files/patch-tests_fixtures_hub.py b/misc/py-datasets/files/patch-tests_fixtures_hub.py
new file mode 100644
index 000000000000..771dd0d56344
--- /dev/null
+++ b/misc/py-datasets/files/patch-tests_fixtures_hub.py
@@ -0,0 +1,235 @@
+-- This patch adds tests/fixtures/hub.py which is missing from the PyPI source distribution.
+-- The file is taken from the GitHub repository at the same version tag.
+-- Without this file, the test suite cannot be run.
+--- /dev/null
++++ tests/fixtures/hub.py
+@@ -0,0 +1,229 @@
++import os
++import time
++import uuid
++from contextlib import contextmanager
++from typing import Optional
++
*** 855 LINES SKIPPED ***

