Buckets:

AIencoder
/

TONS_OF_TRANING_DATA

Files

xet

AIencoder/TONS_OF_TRANING_DATA / dolma.py

AIencoder

9 days ago

download

raw

4.84 kB

	# Copyright 2024 Allen Institute for AI
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Lint as: python3
	"""Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research"""

	import gzip
	import json
	import os
	from typing import List

	import datasets

	# Module-level logger obtained through the `datasets` logging helpers.
	logger = datasets.logging.get_logger(__name__)


	# Human-readable summary: used as the dataset description and as the prefix
	# of every builder-config description.
	_DESCRIPTION = """\
	Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research
	"""

	# Config name -> path of the text file that lists the data-file URLs of that
	# release (read one URL per line by the builder).
	_URL_LISTS = {
	"v1": "urls/v1.txt",
	"v1_5": "urls/v1_5.txt",
	"v1_5-sample": "urls/v1_5-sample.txt",
	"v1_6": "urls/v1_6.txt",
	"v1_6-sample": "urls/v1_6-sample.txt",
	"v1_7": "urls/v1_7.txt",
	}
	# Config name -> version string passed to the builder config.
	# Must have the same keys as _URL_LISTS.
	_VERSIONS = {
	"v1": "1.0.0",
	"v1_5": "1.5.0",
	"v1_5-sample": "1.5.0",
	"v1_6": "1.6.0",
	"v1_6-sample": "1.6.0",
	"v1_7": "1.7.0",
	}
	# Config name -> release-date label appended to the config description.
	# Must have the same keys as _URL_LISTS.
	_DATES = {
	"v1": "(Aug 2023)",
	"v1_5": "(Oct 2023)",
	"v1_5-sample": "(Oct 2023)",
	"v1_6": "(Jan 2024)",
	"v1_6-sample": "(Jan 2024)",
	"v1_7": "(Apr 2024)",
	}
	# URL prefix removed from each listed URL when mapping it onto a local path.
	_BASE_URL = "https://olmo-data.org"

	# Optional local directory, taken from the DOLMA_DATA_DIR environment
	# variable. When set, data files are resolved under it instead of being
	# downloaded.
	_DATA_DIR = os.environ.get("DOLMA_DATA_DIR", None)

	# BibTeX entry for the dataset.
	# NOTE(review): not referenced by the builder code visible here — confirm
	# whether it should be passed to DatasetInfo(citation=...).
	_CITATION = """\
	@article{dolma,
	title = {{Dolma: An Open Corpus of Three Trillion Tokens for Language Model Pretraining Research}},
	author = {
	Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and
	Russell Authur and Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and
	Valentin Hofmann and Ananya Harsh Jha and Sachin Kumar and Li Lucy and Xinxi Lyu and Ian Magnusson and
	Jacob Morrison and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters and
	Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and Emma Strubell and Nishant Subramani and
	Oyvind Tafjord and Evan Pete Walsh and Hannaneh Hajishirzi and Noah A. Smith and Luke Zettlemoyer and
	Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo
	},
	year = {2024},
	journal={arXiv preprint},
	}
	"""


	class Dolma(datasets.GeneratorBasedBuilder):
	    """Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research.

	    One builder config is exposed per entry of ``_URL_LISTS``. Each config
	    produces a single ``train`` split whose examples are read from
	    gzip-compressed JSON-lines files.
	    """

	    # One config per release; the release-date label is appended to the description.
	    BUILDER_CONFIGS = [
	        datasets.BuilderConfig(
	            name=name,
	            version=_VERSIONS[name],
	            description=f"{_DESCRIPTION} {_DATES[name]}",
	        )
	        for name in _URL_LISTS
	    ]

	    DEFAULT_CONFIG_NAME = "v1_7"

	    def _info(self):
	        """Return the dataset metadata: description plus an all-string feature schema."""
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(
	                {
	                    "id": datasets.Value("string"),
	                    "text": datasets.Value("string"),
	                    # NOTE: a "metadata" feature is intentionally not declared here.
	                    "added": datasets.Value("string"),
	                    "created": datasets.Value("string"),
	                    "source": datasets.Value("string"),
	                }
	            ),
	            supervised_keys=None,
	        )

	    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
	        """Resolve the data files of the selected config and declare the train split.

	        The URL list of the active config is fetched and read one URL per line.
	        If ``_DATA_DIR`` is set, each URL is mapped to a path under that
	        directory (the ``_BASE_URL`` part is removed); otherwise every URL is
	        downloaded through ``dl_manager``.

	        Args:
	            dl_manager: download manager used to fetch the URL list and,
	                when no local data directory is configured, the data files.

	        Returns:
	            A one-element list holding the ``TRAIN`` split generator, whose
	            ``files`` keyword argument is the list of resolved data files.
	        """
	        path = dl_manager.download(_URL_LISTS[self.config.name])

	        with open(path, mode="rt", encoding="utf-8") as f:  # type: ignore[no-untyped-call]
	            # Ignore blank lines and surrounding whitespace so that an empty
	            # entry is never treated as a URL to download or a path to open.
	            subset_urls = [url.strip() for url in f.read().splitlines() if url.strip()]

	        if _DATA_DIR is not None:
	            # Local mirror: turn "<base>/a/b.json.gz" into "<data dir>/a/b.json.gz".
	            subset_files = [os.path.join(_DATA_DIR, url.replace(_BASE_URL, "").lstrip("/")) for url in subset_urls]
	        else:
	            subset_files = dl_manager.download(subset_urls)

	        return [
	            datasets.SplitGenerator(
	                name=datasets.Split.TRAIN,  # type: ignore[assignment]
	                gen_kwargs={"files": subset_files},
	            )
	        ]

	    def _generate_examples(self, files: List[str]):
	        """Yield ``(key, example)`` pairs in raw text form from the given files.

	        Args:
	            files: paths of gzip-compressed files holding one JSON object per line.

	        Yields:
	            Tuples of the row's ``id`` and a dict with ``id``, ``text``,
	            ``added``, ``created`` and ``source``. The last three default to
	            an empty string when absent from the row.

	        Raises:
	            KeyError: if a row has no ``id`` or ``text`` field.
	            json.JSONDecodeError: if a non-blank line is not valid JSON.
	        """
	        for fn in files:
	            logger.info("generating examples from = %s", fn)

	            with gzip.open(fn, mode="rt", encoding="utf-8") as f:
	                for line in f:
	                    # A blank line (e.g. a stray trailing newline) is not a
	                    # record; skip it instead of failing in json.loads.
	                    if not line.strip():
	                        continue
	                    row = json.loads(line)
	                    yield row["id"], {
	                        "id": row["id"],
	                        "text": row["text"],
	                        "added": row.get("added", ""),
	                        "created": row.get("created", ""),
	                        "source": row.get("source", ""),
	                    }

Xet Storage Details

Size: 4.84 kB
Xet hash: 21b68562245820442153b560f0e676c9401ad13bd6c4f1839d47516dd4078773

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.