Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions connectors/huggingface_connecter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from datasets import load_dataset
import hashlib

# Gets data from a Hugging Face dataset with automatic configuration
def fetch_data_from_huggingface(dataset_identifier):
    """Load every split of a Hugging Face dataset into one list of examples.

    The dataset is first loaded without a configuration name. If the loader
    rejects that with its "Please pick one among the available configs"
    error, the first configuration named in that error is used instead.

    Args:
        dataset_identifier: Dataset name/path handed to ``load_dataset``.

    Returns:
        A list containing every example of every split. Each example gets an
        ``'id'`` key holding the first 25 hex characters of the SHA-1 of
        ``"<identifier>_<split>_<row index>"``; an ``'id'`` value already
        present in the example is overwritten.

    Raises:
        ValueError: Propagated from ``load_dataset`` when the failure is not
            a missing-configuration error, or when no configuration name can
            be read from the error message.
    """
    # SECURITY NOTE(review): trust_remote_code=True is passed for whatever
    # identifier the caller supplies. Per the datasets documentation this
    # permits a repository's own loading script to run locally -- confirm
    # that identifiers only come from trusted users before relying on this.
    try:
        # Try loading the dataset without specifying a configuration
        dataset = load_dataset(dataset_identifier, trust_remote_code=True)
    except ValueError as e:
        # The error may just mean that a configuration has to be chosen
        first_config = _first_listed_config(str(e))
        if first_config is None:
            # Unrelated (or unparseable) failure: surface it unchanged
            raise
        dataset = load_dataset(dataset_identifier, first_config, trust_remote_code=True)

    # NOTE(review): every example of every split is accumulated in memory, so
    # very large datasets may not fit. Raised in code review; the suggested
    # follow-up is batch processing with ``streaming=True`` on load_dataset.
    data = []
    for split, examples in dataset.items():
        for i, example in enumerate(examples):
            # Creates a unique and shortened ID
            unique_str = f"{dataset_identifier}_{split}_{i}"
            short_id = hashlib.sha1(unique_str.encode()).hexdigest()[:25]
            example['id'] = short_id
            data.append(example)
    return data


def _first_listed_config(message):
    """Return the first config name listed in a missing-config error message.

    Returns ``None`` when *message* is not a missing-config error or when no
    list of names can be read from it.
    """
    import ast
    import re

    marker = "Please pick one among the available configs"
    _, found, tail = message.partition(marker)
    if not found:
        return None

    # The names follow the marker as a Python list literal on the same line;
    # the greedy match (no DOTALL) runs to the last "]" of that line.
    bracketed = re.search(r"\[.*\]", tail)
    if bracketed is None:
        return None

    try:
        names = ast.literal_eval(bracketed.group(0))
    except (ValueError, SyntaxError):
        return None

    if isinstance(names, list) and names and isinstance(names[0], str):
        return names[0]
    return None

# Main load function to be used as a connector
def load(dataset_identifier):
    """Connector entry point: return all examples of a Hugging Face dataset.

    Args:
        dataset_identifier: Dataset name/path; surrounding whitespace is
            stripped before use.

    Returns:
        The non-empty list produced by ``fetch_data_from_huggingface``.

    Raises:
        ValueError: If the identifier is blank after stripping, or if the
            dataset yielded no data.
    """
    cleaned_identifier = dataset_identifier.strip()
    if not cleaned_identifier:
        # Fail fast with the connector's own error type instead of handing
        # an empty name to the dataset loader.
        raise ValueError("No dataset identifier was provided.")

    data = fetch_data_from_huggingface(cleaned_identifier)
    if not data:
        raise ValueError("No data was found for the provided dataset.")
    return data
if __name__ == "__main__":
    # Manual entry point: ask for a dataset identifier, load it through the
    # connector, and report how many entries came back.
    dataset_identifier = input("Enter Hugging Face dataset identifier: ").strip()

    try:
        data = load(dataset_identifier)
        print(f"Dataset has been loaded successfully. Number of entries: {len(data)}")
        # You can add more processing logic here if needed
    except ValueError as e:
        # Exit through SystemExit so the message goes to stderr and the
        # process ends with a non-zero status a calling shell can detect.
        raise SystemExit(f"Error loading dataset: {e}")