@inproceedings{lhoest-etal-2021-datasets,
title = "Datasets: A Community Library for Natural Language Processing",
author = "Lhoest, Quentin and
Villanova del Moral, Albert and
Jernite, Yacine and
Thakur, Abhishek and
von Platen, Patrick and
Patil, Suraj and
Chaumond, Julien and
Drame, Mariama and
Plu, Julien and
Tunstall, Lewis and
Davison, Joe and
{\v{S}}a{\v{s}}ko, Mario and
Chhablani, Gunjan and
Malik, Bhavitvya and
Brandeis, Simon and
Le Scao, Teven and
Sanh, Victor and
Xu, Canwen and
Patry, Nicolas and
McMillan-Major, Angelina and
Schmid, Philipp and
Gugger, Sylvain and
Delangue, Cl{\'e}ment and
Matussi{\`e}re, Th{\'e}o and
Debut, Lysandre and
Bekman, Stas and
Cistac, Pierric and
Goehringer, Thibault and
Mustar, Victor and
Lagunas, Fran{\c{c}}ois and
Rush, Alexander and
Wolf, Thomas",
editor = "Adel, Heike and
Shi, Shuming",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-demo.21",
doi = "10.18653/v1/2021.emnlp-demo.21",
pages = "175--184",
abstract = "The scale, variety, and quantity of publicly-available NLP datasets has grown rapidly as researchers propose new tasks, larger models, and novel benchmarks. Datasets is a community library for contemporary NLP designed to support this ecosystem. Datasets aims to standardize end-user interfaces, versioning, and documentation, while providing a lightweight front-end that behaves similarly for small datasets as for internet-scale corpora. The design of the library incorporates a distributed, community-driven approach to adding datasets and documenting usage. After a year of development, the library now includes more than 650 unique datasets, has more than 250 contributors, and has helped support a variety of novel cross-dataset research projects and shared tasks. The library is available at \url{https://github.com/huggingface/datasets}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lhoest-etal-2021-datasets">
<titleInfo>
<title>Datasets: A Community Library for Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Quentin</namePart>
<namePart type="family">Lhoest</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Villanova del Moral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yacine</namePart>
<namePart type="family">Jernite</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhishek</namePart>
<namePart type="family">Thakur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">von Platen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suraj</namePart>
<namePart type="family">Patil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julien</namePart>
<namePart type="family">Chaumond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mariama</namePart>
<namePart type="family">Drame</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julien</namePart>
<namePart type="family">Plu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lewis</namePart>
<namePart type="family">Tunstall</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joe</namePart>
<namePart type="family">Davison</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mario</namePart>
<namePart type="family">Šaško</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gunjan</namePart>
<namePart type="family">Chhablani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bhavitvya</namePart>
<namePart type="family">Malik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Brandeis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Teven</namePart>
<namePart type="family">Le Scao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Sanh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Canwen</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolas</namePart>
<namePart type="family">Patry</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angelina</namePart>
<namePart type="family">McMillan-Major</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Schmid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sylvain</namePart>
<namePart type="family">Gugger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clément</namePart>
<namePart type="family">Delangue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Théo</namePart>
<namePart type="family">Matussière</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lysandre</namePart>
<namePart type="family">Debut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stas</namePart>
<namePart type="family">Bekman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierric</namePart>
<namePart type="family">Cistac</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thibault</namePart>
<namePart type="family">Goehringer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Mustar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">François</namePart>
<namePart type="family">Lagunas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Rush</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Wolf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Heike</namePart>
<namePart type="family">Adel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuming</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online and Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The scale, variety, and quantity of publicly-available NLP datasets has grown rapidly as researchers propose new tasks, larger models, and novel benchmarks. Datasets is a community library for contemporary NLP designed to support this ecosystem. Datasets aims to standardize end-user interfaces, versioning, and documentation, while providing a lightweight front-end that behaves similarly for small datasets as for internet-scale corpora. The design of the library incorporates a distributed, community-driven approach to adding datasets and documenting usage. After a year of development, the library now includes more than 650 unique datasets, has more than 250 contributors, and has helped support a variety of novel cross-dataset research projects and shared tasks. The library is available at https://github.com/huggingface/datasets.</abstract>
<identifier type="citekey">lhoest-etal-2021-datasets</identifier>
<identifier type="doi">10.18653/v1/2021.emnlp-demo.21</identifier>
<location>
<url>https://aclanthology.org/2021.emnlp-demo.21</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>175</start>
<end>184</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Datasets: A Community Library for Natural Language Processing
%A Lhoest, Quentin
%A Villanova del Moral, Albert
%A Jernite, Yacine
%A Thakur, Abhishek
%A von Platen, Patrick
%A Patil, Suraj
%A Chaumond, Julien
%A Drame, Mariama
%A Plu, Julien
%A Tunstall, Lewis
%A Davison, Joe
%A Šaško, Mario
%A Chhablani, Gunjan
%A Malik, Bhavitvya
%A Brandeis, Simon
%A Le Scao, Teven
%A Sanh, Victor
%A Xu, Canwen
%A Patry, Nicolas
%A McMillan-Major, Angelina
%A Schmid, Philipp
%A Gugger, Sylvain
%A Delangue, Clément
%A Matussière, Théo
%A Debut, Lysandre
%A Bekman, Stas
%A Cistac, Pierric
%A Goehringer, Thibault
%A Mustar, Victor
%A Lagunas, François
%A Rush, Alexander
%A Wolf, Thomas
%Y Adel, Heike
%Y Shi, Shuming
%S Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online and Punta Cana, Dominican Republic
%F lhoest-etal-2021-datasets
%X The scale, variety, and quantity of publicly-available NLP datasets has grown rapidly as researchers propose new tasks, larger models, and novel benchmarks. Datasets is a community library for contemporary NLP designed to support this ecosystem. Datasets aims to standardize end-user interfaces, versioning, and documentation, while providing a lightweight front-end that behaves similarly for small datasets as for internet-scale corpora. The design of the library incorporates a distributed, community-driven approach to adding datasets and documenting usage. After a year of development, the library now includes more than 650 unique datasets, has more than 250 contributors, and has helped support a variety of novel cross-dataset research projects and shared tasks. The library is available at https://github.com/huggingface/datasets.
%R 10.18653/v1/2021.emnlp-demo.21
%U https://aclanthology.org/2021.emnlp-demo.21
%U https://doi.org/10.18653/v1/2021.emnlp-demo.21
%P 175-184
Markdown (Informal)
[Datasets: A Community Library for Natural Language Processing](https://aclanthology.org/2021.emnlp-demo.21) (Lhoest et al., EMNLP 2021)
ACL
- Quentin Lhoest, Albert Villanova del Moral, Yacine Jernite, Abhishek Thakur, Patrick von Platen, Suraj Patil, Julien Chaumond, Mariama Drame, Julien Plu, Lewis Tunstall, Joe Davison, Mario Šaško, Gunjan Chhablani, Bhavitvya Malik, Simon Brandeis, Teven Le Scao, Victor Sanh, Canwen Xu, Nicolas Patry, et al.. 2021. Datasets: A Community Library for Natural Language Processing. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 175–184, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.