Date:      Tue, 6 Aug 2024 09:45:21 GMT
From:      Yuri Victorovich <yuri@FreeBSD.org>
To:        ports-committers@FreeBSD.org, dev-commits-ports-all@FreeBSD.org, dev-commits-ports-main@FreeBSD.org
Subject:   git: a7b24d1d42b6 - main - misc/py-datasets: New port: HuggingFace community-driven open-source library of datasets
Message-ID:  <202408060945.4769jL5A078553@gitrepo.freebsd.org>

The branch main has been updated by yuri:

URL: https://cgit.FreeBSD.org/ports/commit/?id=a7b24d1d42b6adbe2910950f281df23c67525a66

commit a7b24d1d42b6adbe2910950f281df23c67525a66
Author:     Yuri Victorovich <yuri@FreeBSD.org>
AuthorDate: 2024-08-06 09:34:48 +0000
Commit:     Yuri Victorovich <yuri@FreeBSD.org>
CommitDate: 2024-08-06 09:45:18 +0000

    misc/py-datasets: New port: HuggingFace community-driven open-source library of datasets
---
 misc/Makefile                                      |  1 +
 misc/py-datasets/Makefile                          | 64 ++++++++++++++++++++++
 misc/py-datasets/distinfo                          |  3 +
 misc/py-datasets/files/patch-setup.py              | 11 ++++
 .../files/patch-src_datasets_features_features.py  | 10 ++++
 misc/py-datasets/pkg-descr                         |  9 +++
 6 files changed, 98 insertions(+)

diff --git a/misc/Makefile b/misc/Makefile
index 0b72d6dd0af7..10ad9ff1e43c 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -417,6 +417,7 @@
     SUBDIR += py-colorbrewer
     SUBDIR += py-colored
     SUBDIR += py-crudini
+    SUBDIR += py-datasets
     SUBDIR += py-detecta
     SUBDIR += py-dictdiffer
     SUBDIR += py-eemeter
diff --git a/misc/py-datasets/Makefile b/misc/py-datasets/Makefile
new file mode 100644
index 000000000000..7e969039db0f
--- /dev/null
+++ b/misc/py-datasets/Makefile
@@ -0,0 +1,64 @@
+PORTNAME=	datasets
+DISTVERSION=	2.20.0
+CATEGORIES=	misc python # machine-learning
+MASTER_SITES=	PYPI
+PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
+
+MAINTAINER=	yuri@FreeBSD.org
+COMMENT=	HuggingFace community-driven open-source library of datasets
+WWW=		https://huggingface.co/docs/datasets/index
+
+LICENSE=	MIT
+LICENSE_FILE=	${WRKSRC}/LICENSE
+
+BUILD_DEPENDS=	${PYTHON_PKGNAMEPREFIX}pyproject_hooks>0:devel/py-pyproject_hooks@${PY_FLAVOR} \
+		${PY_SETUPTOOLS} \
+		${PYTHON_PKGNAMEPREFIX}wheel>0:devel/py-wheel@${PY_FLAVOR}
+RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}aiohttp>0:www/py-aiohttp@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}dill>0.3.0<0.3.9:devel/py-dill@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}filelock>0:sysutils/py-filelock@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}fsspec>=2023.1.0:devel/py-fsspec@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}huggingface-hub>0.21.2:misc/py-huggingface-hub@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}multiprocess>0:devel/py-multiprocess@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}numpy>=1.17:math/py-numpy@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}packaging>0:devel/py-packaging@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pandas>0:math/py-pandas@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pyarrow>=15.0.0:databases/py-pyarrow@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pyyaml>=5.1:devel/py-pyyaml@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}requests>=2.32.2:www/py-requests@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}tqdm>=4.66.3:misc/py-tqdm@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}xxhash>0:devel/py-xxhash@${PY_FLAVOR}
+RUN_DEPENDS+=	${PYTHON_PKGNAMEPREFIX}librosa>0:audio/py-librosa@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}SoundFile>=0.12.1:audio/py-SoundFile@${PY_FLAVOR}
+RUN_DEPENDS+=	${PY_PILLOW}
+TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}elasticsearch>0:textproc/py-elasticsearch@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}faiss>=1.6.4:math/py-faiss@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}jax>=0.3.14:math/py-jax@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}jiwer>0:misc/py-jiwer@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}joblib>=1.3.0:devel/py-joblib@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}lz4>=0:archivers/py-lz4@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}polars>=0.20.0:misc/py-polars@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}protobuf>=4.0.0:devel/py-protobuf@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytest-datadir>=0:devel/py-pytest-datadir@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytest-xdist>=0:devel/py-pytest-xdist@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytest>=0:devel/py-pytest@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytorch>=2.0.0:misc/py-pytorch@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}rarfile>=4.0:archivers/py-rarfile@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}s3fs>=2021.11.1:devel/py-s3fs@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}tiktoken>=0:textproc/py-tiktoken@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}typing-extensions>=4.6.1:devel/py-typing-extensions@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}zstandard>=0:archivers/py-zstandard@${PY_FLAVOR}
+# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow
+
+USES=		python
+USE_PYTHON=	pep517 concurrent autoplist pytest
+
+TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}
+
+NO_ARCH=	yes
+
+pre-test: # prevent failure due to missing pyspark
+	@${RM} ${WRKSRC}/tests/packaged_modules/test_spark.py
+
+.include <bsd.port.mk>
diff --git a/misc/py-datasets/distinfo b/misc/py-datasets/distinfo
new file mode 100644
index 000000000000..6a0baa6a8083
--- /dev/null
+++ b/misc/py-datasets/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1722803032
+SHA256 (datasets-2.20.0.tar.gz) = 3c4dbcd27e0f642b9d41d20ff2efa721a5e04b32b2ca4009e0fc9139e324553f
+SIZE (datasets-2.20.0.tar.gz) = 2225757
diff --git a/misc/py-datasets/files/patch-setup.py b/misc/py-datasets/files/patch-setup.py
new file mode 100644
index 000000000000..d28e71a481c8
--- /dev/null
+++ b/misc/py-datasets/files/patch-setup.py
@@ -0,0 +1,11 @@
+--- setup.py.orig	2024-08-05 18:50:31 UTC
++++ setup.py
+@@ -115,8 +115,6 @@ REQUIRED_PKGS = [
+     # Backend and serialization.
+     # Minimum 15.0.0 to be able to cast dictionary types to their underlying types
+     "pyarrow>=15.0.0",
+-    # As long as we allow pyarrow < 14.0.1, to fix vulnerability CVE-2023-47248
+-    "pyarrow-hotfix",
+     # For smart caching dataset processing
+     "dill>=0.3.0,<0.3.9",  # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19
+     # For performance gains with apache arrow
diff --git a/misc/py-datasets/files/patch-src_datasets_features_features.py b/misc/py-datasets/files/patch-src_datasets_features_features.py
new file mode 100644
index 000000000000..9a4e24c28ec6
--- /dev/null
+++ b/misc/py-datasets/files/patch-src_datasets_features_features.py
@@ -0,0 +1,10 @@
+--- src/datasets/features/features.py.orig	2024-08-05 18:52:07 UTC
++++ src/datasets/features/features.py
+@@ -32,7 +32,6 @@ import pyarrow.types
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import pyarrow.types
+-import pyarrow_hotfix  # noqa: F401  # to fix vulnerability on pyarrow<14.0.1
+ from pandas.api.extensions import ExtensionArray as PandasExtensionArray
+ from pandas.api.extensions import ExtensionDtype as PandasExtensionDtype
+ 
diff --git a/misc/py-datasets/pkg-descr b/misc/py-datasets/pkg-descr
new file mode 100644
index 000000000000..e8316e0af8f0
--- /dev/null
+++ b/misc/py-datasets/pkg-descr
@@ -0,0 +1,9 @@
+Datasets is a library for easily accessing and sharing datasets for Audio,
+Computer Vision, and Natural Language Processing (NLP) tasks.
+
+Load a dataset in a single line of code, and use our powerful data processing
+methods to quickly get your dataset ready for training a deep learning model.
+Backed by the Apache Arrow format, process large datasets with zero-copy reads
+without any memory constraints for optimal speed and efficiency. We also feature
+a deep integration with the Hugging Face Hub, allowing you to easily load and
+share a dataset with the wider machine learning community.
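
For reference, a minimal sketch of what using the ported library looks like,
assuming network access to the Hugging Face Hub; the dataset name "imdb" and
the word-count map step below are illustrative only and not part of the port:

    # Minimal usage sketch (illustrative; not part of this commit).
    # Assumes network access to the Hugging Face Hub; "imdb" is only an
    # example dataset name, and the derived "n_words" column is made up here.
    from datasets import load_dataset

    ds = load_dataset("imdb", split="train")   # load a dataset in one line
    print(ds[0]["text"][:80])                  # rows behave like Python dicts

    # Batched map: add a derived column using the Arrow-backed processing API.
    ds = ds.map(
        lambda batch: {"n_words": [len(t.split()) for t in batch["text"]]},
        batched=True,
    )
    print(ds.column_names)                     # now includes "n_words"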


