uboot: (firmwareOdroidC2/C4) don't invoke patch tool, use patches = [] instead
https://github.com/NixOS/nixpkgs/blob/master/pkgs/stdenv/generic/setup.sh#L948 this can do it nicely. Signed-off-by: Anton Arapov <anton@deadbeef.mx>
This commit is contained in:
commit
56de2bcd43
30691 changed files with 3076956 additions and 0 deletions
145
pkgs/development/python-modules/tokenizers/default.nix
Normal file
145
pkgs/development/python-modules/tokenizers/default.nix
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
{ lib
|
||||
, stdenv
|
||||
, buildPythonPackage
|
||||
, datasets
|
||||
, fetchFromGitHub
|
||||
, fetchurl
|
||||
, libiconv
|
||||
, numpy
|
||||
, openssl
|
||||
, pkg-config
|
||||
, pytestCheckHook
|
||||
, pythonOlder
|
||||
, requests
|
||||
, rustPlatform
|
||||
, Security
|
||||
, setuptools-rust
|
||||
}:
|
||||
|
||||
let
|
||||
# See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
|
||||
# about URLs and file names
|
||||
robertaVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
|
||||
sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
|
||||
};
|
||||
robertaMerges = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
|
||||
sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
|
||||
};
|
||||
albertVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
|
||||
sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
|
||||
};
|
||||
bertVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
|
||||
sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
|
||||
};
|
||||
norvigBig = fetchurl {
|
||||
url = "https://norvig.com/big.txt";
|
||||
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
|
||||
};
|
||||
docPipelineTokenizer = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
|
||||
hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
|
||||
};
|
||||
docQuicktourTokenizer = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
|
||||
hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
|
||||
};
|
||||
openaiVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
|
||||
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
|
||||
};
|
||||
openaiMerges = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
|
||||
sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
|
||||
};
|
||||
in
|
||||
buildPythonPackage rec {
|
||||
pname = "tokenizers";
|
||||
version = "0.12.1";
|
||||
|
||||
disabled = pythonOlder "3.7";
|
||||
|
||||
src = fetchFromGitHub {
|
||||
owner = "huggingface";
|
||||
repo = pname;
|
||||
rev = "python-v${version}";
|
||||
hash = "sha256-XIXKgcqa6ToAH4OkyaaJALOS9F+sD8d5Z71RttRcIsw=";
|
||||
};
|
||||
|
||||
cargoDeps = rustPlatform.fetchCargoTarball {
|
||||
inherit src sourceRoot;
|
||||
name = "${pname}-${version}";
|
||||
sha256 = "sha256-Euvf0LNMa2Od+6gY1Ldge/7VPrH5mJoZduRRsb+lM/E=";
|
||||
};
|
||||
|
||||
sourceRoot = "source/bindings/python";
|
||||
|
||||
nativeBuildInputs = [
|
||||
pkg-config
|
||||
setuptools-rust
|
||||
] ++ (with rustPlatform; [
|
||||
cargoSetupHook
|
||||
rust.cargo
|
||||
rust.rustc
|
||||
]);
|
||||
|
||||
buildInputs = [
|
||||
openssl
|
||||
] ++ lib.optionals stdenv.isDarwin [
|
||||
libiconv
|
||||
Security
|
||||
];
|
||||
|
||||
propagatedBuildInputs = [
|
||||
numpy
|
||||
];
|
||||
|
||||
checkInputs = [
|
||||
datasets
|
||||
pytestCheckHook
|
||||
requests
|
||||
];
|
||||
|
||||
postUnpack = ''
|
||||
# Add data files for tests, otherwise tests attempt network access
|
||||
mkdir $sourceRoot/tests/data
|
||||
( cd $sourceRoot/tests/data
|
||||
ln -s ${robertaVocab} roberta-base-vocab.json
|
||||
ln -s ${robertaMerges} roberta-base-merges.txt
|
||||
ln -s ${albertVocab} albert-base-v1-tokenizer.json
|
||||
ln -s ${bertVocab} bert-base-uncased-vocab.txt
|
||||
ln -s ${docPipelineTokenizer} bert-wiki.json
|
||||
ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
|
||||
ln -s ${norvigBig} big.txt
|
||||
ln -s ${openaiVocab} openai-gpt-vocab.json
|
||||
ln -s ${openaiMerges} openai-gpt-merges.txt )
|
||||
'';
|
||||
|
||||
preCheck = ''
|
||||
export HOME=$(mktemp -d);
|
||||
'';
|
||||
|
||||
pythonImportsCheck = [
|
||||
"tokenizers"
|
||||
];
|
||||
|
||||
disabledTests = [
|
||||
# Downloads data using the datasets module
|
||||
"TestTrainFromIterators"
|
||||
# Those tests require more data
|
||||
"test_from_pretrained"
|
||||
"test_from_pretrained_revision"
|
||||
"test_continuing_prefix_trainer_mistmatch"
|
||||
];
|
||||
|
||||
meta = with lib; {
|
||||
description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
|
||||
homepage = "https://github.com/huggingface/tokenizers";
|
||||
license = licenses.asl20;
|
||||
maintainers = with maintainers; [ ];
|
||||
platforms = platforms.unix;
|
||||
};
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue