-
Notifications
You must be signed in to change notification settings - Fork 203
added connector folder and HF file #313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
68df99a
16257b9
4cd48ff
86b230f
497e04b
74be0dc
a5a41f4
152a99e
2f783ca
9f0575b
d30529d
829d7df
8bf9c52
9ae14f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| from datasets import load_dataset | ||
abhisomala marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| import hashlib | ||
|
|
||
| # Gets data from a Hugging Face dataset with automatic configuration | ||
def _parse_available_configs(message):
    """Return the config names listed in a "pick a config" error message.

    The message is expected to contain the phrase
    "Please pick one among the available configs" followed by a Python-style
    list literal such as ``['en', 'fr']``.

    Args:
        message: Text of the exception raised by ``load_dataset``.

    Returns:
        The list of config names, or an empty list when the phrase is absent
        or no list literal can be recovered from the text after it.
    """
    # Imported locally so this block does not depend on the file-level imports.
    import ast
    import re

    marker = "Please pick one among the available configs"
    _, found, tail = message.partition(marker)
    if not found:
        return []

    bracketed = re.search(r"\[[^\]]*\]", tail)
    if bracketed is None:
        return []

    try:
        # literal_eval copes with either quote style, unlike splitting on "', '".
        names = ast.literal_eval(bracketed.group(0))
    except (ValueError, SyntaxError):
        return []
    return [name for name in names if isinstance(name, str)]


def fetch_data_from_huggingface(dataset_identifier, *, trust_remote_code=True):
    """Load every split of a Hugging Face dataset into one flat list.

    The dataset is first loaded without a config name. If that fails with a
    ``ValueError`` whose message lists the available configs, the first listed
    config is used for a second attempt.

    Args:
        dataset_identifier: Name or path passed straight to ``load_dataset``.
        trust_remote_code: Forwarded to ``load_dataset``. Defaults to ``True``
            to keep the previous behaviour.
            NOTE(security): ``True`` allows code shipped with the dataset
            repository to run locally; pass ``False`` for untrusted
            identifiers.

    Returns:
        A list with one entry per example across all splits. Each example has
        its ``'id'`` key set (overwriting any existing value) to the first 25
        hex characters of the SHA-1 of ``"{identifier}_{split}_{index}"``.

    Raises:
        ValueError: Re-raised from ``load_dataset`` when the failure is not a
            missing-config error, or when no config name can be recovered
            from the error message.
    """
    try:
        # Try loading the dataset without specifying a configuration
        dataset = load_dataset(dataset_identifier, trust_remote_code=trust_remote_code)
    except ValueError as err:
        config_names = _parse_available_configs(str(err))
        if not config_names:
            # Not a config-selection problem (or unparseable): keep the
            # original exception and traceback intact.
            raise
        dataset = load_dataset(
            dataset_identifier,
            config_names[0],
            trust_remote_code=trust_remote_code,
        )

    data = []
    for split, rows in dataset.items():
        for index, example in enumerate(rows):
            # Creates a unique and shortened ID
            unique_str = f"{dataset_identifier}_{split}_{index}"
            example["id"] = hashlib.sha1(unique_str.encode()).hexdigest()[:25]
            data.append(example)
    return data
|
|
||
| # Main load function to be used as a connector | ||
def load(dataset_identifier):
    """Connector entry point: fetch a Hugging Face dataset as a list.

    Leading and trailing whitespace is stripped from ``dataset_identifier``
    before it is handed to ``fetch_data_from_huggingface``.

    Returns:
        The non-empty list produced by ``fetch_data_from_huggingface``.

    Raises:
        ValueError: If the fetch produced no data.
    """
    records = fetch_data_from_huggingface(dataset_identifier.strip())

    # Guard clause: an empty result is reported as an error, not returned.
    if not records:
        raise ValueError("No data was found for the provided dataset.")
    return records
if __name__ == "__main__":
    # Interactive use: ask for the dataset identifier on stdin.
    requested_identifier = input("Enter Hugging Face dataset identifier: ").strip()

    try:
        loaded_entries = load(requested_identifier)
        entry_count = len(loaded_entries)
        print(f"Dataset has been loaded successfully. Number of entries: {entry_count}")
        # Further processing of the loaded entries can be added here.
    except ValueError as load_error:
        # Only ValueError is reported here; other exceptions propagate.
        print(f"Error loading dataset: {load_error}")
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.