Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions examples/pytorch/speech-recognition/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface

```bash
python run_speech_recognition_ctc.py \
--dataset_name="common_voice" \
--dataset_name="mozilla-foundation/common_voice_17_0" \
--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
--dataset_config_name="tr" \
--output_dir="./wav2vec2-common_voice-tr-demo" \
Expand Down Expand Up @@ -102,7 +102,7 @@ The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface
```bash
torchrun \
--nproc_per_node 8 run_speech_recognition_ctc.py \
--dataset_name="common_voice" \
--dataset_name="mozilla-foundation/common_voice_17_0" \
--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
--dataset_config_name="tr" \
--output_dir="./wav2vec2-common_voice-tr-demo-dist" \
Expand Down Expand Up @@ -149,7 +149,7 @@ However, the `--shuffle_buffer_size` argument controls how many examples we can
```bash
torchrun \
--nproc_per_node 4 run_speech_recognition_ctc_streaming.py \
--dataset_name="common_voice" \
--dataset_name="mozilla-foundation/common_voice_17_0" \
--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
--tokenizer_name_or_path="anton-l/wav2vec2-tokenizer-turkish" \
--dataset_config_name="tr" \
Expand Down Expand Up @@ -314,7 +314,7 @@ below 27%.
For an example run, you can have a look at [`patrickvonplaten/wav2vec2-common_voice-tr-mms-demo`](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-mms-demo).


If you'd like to train another adapter model with the same base model, you can simply re-use the same `--output_dir`,
If you'd like to train another adapter model with the same base model, you can simply reuse the same `--output_dir`,
but make sure to pass the `--output_dir` folder also to `--tokenizer_name_or_path` so that the vocabulary is not
overwritten but **extended**. Assuming you would like to train adapter weights on Swedish in addition to Turkish and save
the adapter weights in the same model repo, you can run:
Expand Down
38 changes: 26 additions & 12 deletions examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.57.0.dev0")

require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
require_version(
"datasets>=1.18.0",
"To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt",
)


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -91,13 +94,16 @@ class ModelArguments:
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
default=True,
metadata={"help": "Whether to freeze the feature encoder layers of the model."},
)
attention_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
default=0.0,
metadata={"help": "The dropout ratio for the attention probabilities."},
)
activation_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
default=0.0,
metadata={"help": "The dropout ratio for activations inside the fully connected layer."},
)
feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
hidden_dropout: float = field(
Expand Down Expand Up @@ -140,7 +146,8 @@ class ModelArguments:
)
layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
default="mean",
metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."},
)
ctc_zero_infinity: Optional[bool] = field(
default=False,
Expand Down Expand Up @@ -169,10 +176,13 @@ class DataTrainingArguments:
"""

dataset_name: str = field(
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
metadata={"help": "Path or name of the dataset (cf `load_dataset` method of the Datasets library)."}
)
dataset_config_name: str = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
default=None,
metadata={
"help": "The configuration name of the dataset to use (cf `load_dataset` method of the Datasets library)."
},
)
train_split_name: str = field(
default="train+validation",
Expand All @@ -198,7 +208,8 @@ class DataTrainingArguments:
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
default=False,
metadata={"help": "Overwrite the cached preprocessed datasets or not."},
)
preprocessing_num_workers: Optional[int] = field(
default=None,
Expand Down Expand Up @@ -240,7 +251,8 @@ class DataTrainingArguments:
},
)
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
default=0.0,
metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"},
)
preprocessing_only: bool = field(
default=False,
Expand Down Expand Up @@ -383,7 +395,8 @@ def extract_all_chars(batch):

# take union of all unique characters in each dataset
vocab_set = functools.reduce(
lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]),
vocabs.values(),
)

vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))}
Expand Down Expand Up @@ -571,7 +584,7 @@ def remove_special_characters(batch):
# it is defined by `tokenizer_class` if present in config else by `model_type`
tokenizer_kwargs = {
"config": config if config.tokenizer_class is not None else None,
"tokenizer_type": config.model_type if config.tokenizer_class is None else None,
"tokenizer_type": (config.model_type if config.tokenizer_class is None else None),
"unk_token": unk_token,
"pad_token": pad_token,
"word_delimiter_token": word_delimiter_token,
Expand Down Expand Up @@ -639,7 +652,8 @@ def remove_special_characters(batch):
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
if dataset_sampling_rate != feature_extractor.sampling_rate:
raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
data_args.audio_column_name,
datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
)

# derive max & min input length for sample rate & max duration
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.57.0.dev0")

require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
require_version(
"datasets>=1.18.0",
"To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt",
)


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -127,7 +130,8 @@ class ModelArguments:
)
layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
default="mean",
metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."},
)
adapter_attn_dim: int = field(
default=16,
Expand All @@ -148,9 +152,9 @@ class DataTrainingArguments:
"""

dataset_name: str = field(
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
metadata={"help": "Path or name of the dataset (cf `load_dataset` method of the Datasets library)."}
)
target_language: Optional[str] = field(
target_language: str = field(
metadata={
"help": (
"The target language on which the adapter attention layers"
Expand All @@ -162,7 +166,10 @@ class DataTrainingArguments:
},
)
dataset_config_name: str = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
default=None,
metadata={
"help": "The configuration name of the dataset to use (cf `load_dataset` method of the Datasets library)."
},
)
train_split_name: str = field(
default="train+validation",
Expand All @@ -188,7 +195,8 @@ class DataTrainingArguments:
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
default=False,
metadata={"help": "Overwrite the cached preprocessed datasets or not."},
)
preprocessing_num_workers: Optional[int] = field(
default=None,
Expand Down Expand Up @@ -230,7 +238,8 @@ class DataTrainingArguments:
},
)
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
default=0.0,
metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"},
)
preprocessing_only: bool = field(
default=False,
Expand Down Expand Up @@ -363,7 +372,8 @@ def extract_all_chars(batch):

# take union of all unique characters in each dataset
vocab_set = functools.reduce(
lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]),
vocabs.values(),
)

vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))}
Expand Down Expand Up @@ -578,7 +588,7 @@ def remove_special_characters(batch):
# it is defined by `tokenizer_class` if present in config else by `model_type`
tokenizer_kwargs = {
"config": config if config.tokenizer_class is not None else None,
"tokenizer_type": config.model_type if config.tokenizer_class is None else None,
"tokenizer_type": (config.model_type if config.tokenizer_class is None else None),
"unk_token": unk_token,
"pad_token": pad_token,
"word_delimiter_token": word_delimiter_token,
Expand Down Expand Up @@ -650,7 +660,8 @@ def remove_special_characters(batch):
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
if dataset_sampling_rate != feature_extractor.sampling_rate:
raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
data_args.audio_column_name,
datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
)

# derive max & min input length for sample rate & max duration
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.57.0.dev0")

require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
require_version(
"datasets>=1.18.0",
"To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt",
)

logger = logging.getLogger(__name__)

Expand All @@ -77,13 +80,16 @@ class ModelArguments:
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
default=None,
metadata={"help": "Pretrained config name or path if not the same as model_name"},
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
default=None,
metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
)
feature_extractor_name: Optional[str] = field(
default=None, metadata={"help": "feature extractor name or path if not the same as model_name"}
default=None,
metadata={"help": "feature extractor name or path if not the same as model_name"},
)
cache_dir: Optional[str] = field(
default=None,
Expand Down Expand Up @@ -117,10 +123,12 @@ class ModelArguments:
},
)
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
default=True,
metadata={"help": "Whether to freeze the feature encoder layers of the model."},
)
freeze_encoder: bool = field(
default=False, metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."}
default=False,
metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."},
)
forced_decoder_ids: list[list[int]] = field(
default=None,
Expand Down Expand Up @@ -150,13 +158,17 @@ class DataTrainingArguments:
"""

dataset_name: str = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
metadata={"help": "Path or name of the dataset (cf `load_dataset` method of the Datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
dataset_config_name: str = field(
default=None,
metadata={
"help": "The configuration name of the dataset to use (cf `load_dataset` method of the Datasets library)."
},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
default=False,
metadata={"help": "Overwrite the cached training and evaluation sets"},
)
preprocessing_num_workers: Optional[int] = field(
default=None,
Expand Down Expand Up @@ -198,7 +210,8 @@ class DataTrainingArguments:
},
)
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
default=0.0,
metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"},
)
preprocessing_only: bool = field(
default=False,
Expand Down Expand Up @@ -387,7 +400,7 @@ def main():
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
(model_args.config_name if model_args.config_name else model_args.model_name_or_path),
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
Expand All @@ -399,14 +412,14 @@ def main():
config.update({"apply_spec_augment": model_args.apply_spec_augment})

feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
(model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
(model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
Expand Down Expand Up @@ -465,7 +478,8 @@ def main():
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
if dataset_sampling_rate != feature_extractor.sampling_rate:
raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
data_args.audio_column_name,
datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
)

# 7. Preprocessing the datasets.
Expand Down Expand Up @@ -494,7 +508,9 @@ def prepare_dataset(batch):
# process audio
sample = batch[audio_column_name]
inputs = feature_extractor(
sample["array"], sampling_rate=sample["sampling_rate"], return_attention_mask=forward_attention_mask
sample["array"],
sampling_rate=sample["sampling_rate"],
return_attention_mask=forward_attention_mask,
)
# process audio length
batch[model_input_name] = inputs.get(model_input_name)[0]
Expand Down Expand Up @@ -579,7 +595,7 @@ def compute_metrics(pred):
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
processing_class=feature_extractor,
data_collator=data_collator,
compute_metrics=compute_metrics if training_args.predict_with_generate else None,
compute_metrics=(compute_metrics if training_args.predict_with_generate else None),
)

# 12. Training
Expand Down Expand Up @@ -621,7 +637,10 @@ def compute_metrics(pred):
trainer.save_metrics("eval", metrics)

# 14. Write Training Stats
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "automatic-speech-recognition"}
kwargs = {
"finetuned_from": model_args.model_name_or_path,
"tasks": "automatic-speech-recognition",
}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
Expand Down