Commit bc6537e: Merge pull request #7 from gty1829/dev_3

modify docs

2 parents: 0050c91 + bba9548

File tree: 8 files changed (+134, -124 lines)

docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md

Lines changed: 5 additions & 6 deletions
````diff
@@ -19,10 +19,10 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU devices
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.core_audio import (
     SileroVADGenerator,
-    MergeChunksByTimestamps,
+    MergeChunksRowGenerator,
     PromptedAQAGenerator,
-    # CTCForcedAlignFilter, # Import this for filtering instead of evaluation
-    CTCForcedAlignSampleEvaluator,
+    # CTCForcedAlignmentFilter, # Import this for filtering instead of evaluation
+    CTCForcedAlignmentSampleEvaluator,
 )
 from dataflow.serving import LocalModelVLMServing_vllm
 from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
@@ -57,7 +57,7 @@ class Pipeline:
             num_workers=2, # Process count; each process loads one model instance
         )
 
-        self.merger = MergeChunksByTimestamps(num_workers=2)
+        self.merger = MergeChunksRowGenerator(num_workers=2)
 
         self.prompted_generator = PromptedAQAGenerator(
             vlm_serving=self.serving,
@@ -70,7 +70,7 @@ class Pipeline:
             # num_workers=1,
             # )
 
-        self.evaluator = CTCForcedAlignSampleEvaluator(
+        self.evaluator = CTCForcedAlignmentSampleEvaluator(
             model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
             device=["cuda:3"], # GPUs that the model can be loaded on
             num_workers=2, # Process count; each process loads one model instance
@@ -90,7 +90,6 @@ class Pipeline:
             return_seconds=True,
             time_resolution=1,
             neg_threshold=0.35,
-            window_size_samples=512,
             min_silence_at_max_speech=0.098,
             use_max_poss_sil_at_max_speech=True
         )
````
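
Taken together, these hunks rename two operators (MergeChunksByTimestamps to MergeChunksRowGenerator, CTCForcedAlignSampleEvaluator to CTCForcedAlignmentSampleEvaluator) and drop the `window_size_samples` VAD argument. A minimal sketch of the resulting wiring, assembled only from the hunks above; the serving constructor arguments are placeholders, since this diff does not show them:

```python
from dataflow.operators.core_audio import (
    SileroVADGenerator,
    MergeChunksRowGenerator,            # was MergeChunksByTimestamps
    PromptedAQAGenerator,
    CTCForcedAlignmentSampleEvaluator,  # was CTCForcedAlignSampleEvaluator
)
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt

class Pipeline:
    def __init__(self):
        # Placeholder serving config; the diff does not show these arguments.
        self.serving = LocalModelVLMServing_vllm(
            hf_model_name_or_path="openai/whisper-large-v3",
        )
        self.merger = MergeChunksRowGenerator(num_workers=2)
        self.prompted_generator = PromptedAQAGenerator(
            vlm_serving=self.serving,
            system_prompt=WhisperTranscriptionPrompt().generate_prompt(
                language="german", task="transcribe", with_timestamps=False,
            ),
        )
        self.evaluator = CTCForcedAlignmentSampleEvaluator(
            model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
            device=["cuda:3"],  # GPUs the model can be loaded on
            num_workers=2,      # one model instance per process
        )
```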

docs/en/notes/mm_guide/audio_understanding/audio_caption.md

Lines changed: 24 additions & 13 deletions
````diff
@@ -10,41 +10,52 @@ permalink: /en/mm_guide/2gjc47qb/
 ## Step 1: Install Environment
 See [Audio Environment Installation](./install_audio_understanding.md)
 
-## Step 2: Start the Local Model Service
+## Step 2: Import Relevant Packages
+```python
+from dataflow.operators.core_audio import PromptedAQAGenerator
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.prompts.audio import AudioCaptionGeneratorPrompt
+```
+
+## Step 3: Start the Local Model Service
 The local model serving method is as follows:
 ```python
-llm_serving = LocalModelLLMServing_vllm(
-    hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct", # set to your own model path
+vlm_serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct", # set to your own model path
     vllm_tensor_parallel_size=2,
     vllm_max_tokens=8192,
     vllm_gpu_memory_utilization=0.7
 )
 ```
 
-## Step 3: Prepare the Audio Data for Caption Generation
+## Step 4: Prepare the Audio Data for Caption Generation
 Fill in the audio paths in the following format:
 ```jsonl
-{"audio": ["your_audio_path"]}
+{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"], "conversation": [{"from": "human", "value": "<audio>\nTranscribe the audio into Chinese." }]}
+{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/audio_vqa/Santa%20Motor.wav"], "conversation": [{"from": "human", "value": "<audio>\nDescribe the sound in this audio clip." }]}
+
 ```
 
-## Step 4: Add the Data Path to FileStorage in the Following Format
+## Step 5: Add the Data Path to FileStorage in the Following Format
 ```python
 storage = FileStorage(
-    first_entry_file_name="your_path",
+    first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
     cache_path="./cache",
     file_name_prefix="audio_caption",
     cache_type="jsonl",
-    media_key="audio",
-    media_type="audio"
 )
 ```
 
-## Step 5: Initialize the CaptionGenerator Operator
+## Step 6: Initialize the PromptedAQAGenerator Operator
 ```python
-generator = CaptionGenerator(llm_serving)
+prompt_generator = PromptedAQAGenerator(
+    vlm_serving=vlm_serving,
+    system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
+)
 ```
 
-## Step 6: Execute the Operator
+## Step 7: Execute the Operator
 ```python
-generator.run(storage=storage.step(), output_key="caption")
+prompt_generator.run(storage=storage.step(), output_key="caption")
 ```
````
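
Stitched together, the updated English caption guide amounts to the following script; this is a sketch assembled from the `+` lines above, with no arguments beyond what the new docs show:

```python
from dataflow.operators.core_audio import PromptedAQAGenerator
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.utils.storage import FileStorage
from dataflow.prompts.audio import AudioCaptionGeneratorPrompt

# Serving: Qwen2-Audio behind the vLLM-based VLM server
vlm_serving = LocalModelVLMServing_vllm(
    hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct",
    vllm_tensor_parallel_size=2,
    vllm_max_tokens=8192,
    vllm_gpu_memory_utilization=0.7,
)

# Storage: jsonl rows carrying "audio" and "conversation" keys
storage = FileStorage(
    first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
    cache_path="./cache",
    file_name_prefix="audio_caption",
    cache_type="jsonl",
)

# Operator: prompted audio QA generation with the caption system prompt
prompt_generator = PromptedAQAGenerator(
    vlm_serving=vlm_serving,
    system_prompt=AudioCaptionGeneratorPrompt().generate_prompt(),
)

prompt_generator.run(storage=storage.step(), output_key="caption")
```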

docs/en/notes/mm_guide/audio_understanding/install_audio_understanding.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -11,5 +11,5 @@ conda create -n myvenv python=3.10
 conda activate myvenv
 
 cd ./DataFlow-MM
-pip install open-dataflow[audio]
+pip install open-dataflow-mm[audio]
 ```
````

docs/en/notes/mm_guide/audio_understanding/whisper_asr.md

Lines changed: 38 additions & 31 deletions
````diff
@@ -10,59 +10,66 @@ permalink: /en/mm_guide/dl0jhc6u/
 ## Step 1: Install Environment
 See [Audio Environment Installation](./install_audio_understanding.md)
 
-## Step 2: Start the Local Model Service
+## Step 2: Import Relevant Packages
+```python
+from dataflow.operators.core_audio import PromptedAQAGenerator
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.prompts.audio import WhisperTranscriptionPrompt
+```
+
+## Step 3: Start the Local Model Service
 The method for launching the local model serving service is as follows:
 ```python
-llm_serving = LocalModelLLMServing_vllm(
-    hf_model_name_or_path="./models/whisper-large-v3", # set to your own model path
+vlm_serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="openai/whisper-large-v3", # set to your own model path
+    hf_cache_dir='./dataflow_cache',
     vllm_tensor_parallel_size=2,
-    vllm_max_tokens=None,
-    vllm_gpu_memory_utilization=0.7
+    vllm_temperature=0.3,
+    vllm_top_p=0.9,
+    vllm_max_tokens=512,
+    vllm_max_model_len=448,
+    vllm_gpu_memory_utilization=0.9
 )
 ```
 
-## Step 3: Prepare the Audio Data for Transcription or Translation
+## Step 4: Prepare the Audio Data for Transcription or Translation
 Fill in the audio paths in the following format:
 ```jsonl
-{"audio": ["your_audio_path"]}
+{"conversation": [{"from": "human", "value": "<audio>"}], "audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"]}
+
 ```
 
-## Step 4: Add the Data Path to FileStorage
+## Step 5: Add the Data Path to FileStorage
 ```python
 storage = FileStorage(
-    first_entry_file_name="your_path",
+    first_entry_file_name="./dataflow/example/whisper_transcription/sample_data.jsonl",
     cache_path="./cache",
     file_name_prefix="whisper_transcription",
     cache_type="jsonl",
-    media_key="audio",
-    media_type="audio"
 )
 ```
 
-## Step 5: Initialize the WhisperTranscriptionGenerator Operator
+## Step 6: Initialize the PromptedAQAGenerator Operator
 ```python
-generator = WhisperTranscriptionGenerator(self.llm_serving)
-```
-
-## Step 6: Execute the Operator
-Speech Transcription
-```python
-generator.run(
-    storage=self.storage.step(),
-    task="transcribe", # Indicates that the task is speech transcription
-    language="mandarin", # Spoken language in the audio; default is "english"
-    use_no_time_stamps=True, # Whether to use the no-timestamp format; default is True
-    output_key="transcription" # Key for the output result
+prompt_generator = PromptedAQAGenerator(
+    vlm_serving=vlm_serving,
+    system_prompt=WhisperTranscriptionPrompt().generate_prompt(
+        language="mandarin",
+        task="transcribe", # If task == 'translate', the model will translate input speech into English text.
+        with_timestamps=False
+    )
 )
 ```
 
-Speech Translation (translate audio content into English)
+## Step 7: Execute the Operator
+Speech Transcription
 ```python
-generator.run(
-    storage=self.storage.step(),
-    task="translate", # Indicates that the task is speech translation
-    language="mandarin", # Spoken language in the audio; default is "english"
-    use_no_time_stamps=True, # Whether to use the no-timestamp format; default is True
-    output_key="transcription" # Key for the output result
+prompt_generator.run(
+    storage=storage.step(),
+    input_audio_key="audio",
+    input_conversation_key="conversation",
+    output_answer_key="answer",
 )
 ```
````
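
Assembled from the `+` lines above, the updated transcription guide corresponds to this sketch; it uses the LocalModelVLMServing_vllm class from the new import block and a single `storage` keyword in the final run call:

```python
from dataflow.operators.core_audio import PromptedAQAGenerator
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.utils.storage import FileStorage
from dataflow.prompts.audio import WhisperTranscriptionPrompt

# Whisper served through the vLLM-based serving wrapper
vlm_serving = LocalModelVLMServing_vllm(
    hf_model_name_or_path="openai/whisper-large-v3",
    hf_cache_dir="./dataflow_cache",
    vllm_tensor_parallel_size=2,
    vllm_temperature=0.3,
    vllm_top_p=0.9,
    vllm_max_tokens=512,
    vllm_max_model_len=448,
    vllm_gpu_memory_utilization=0.9,
)

storage = FileStorage(
    first_entry_file_name="./dataflow/example/whisper_transcription/sample_data.jsonl",
    cache_path="./cache",
    file_name_prefix="whisper_transcription",
    cache_type="jsonl",
)

# task="translate" would instead translate the speech into English text
prompt_generator = PromptedAQAGenerator(
    vlm_serving=vlm_serving,
    system_prompt=WhisperTranscriptionPrompt().generate_prompt(
        language="mandarin",
        task="transcribe",
        with_timestamps=False,
    ),
)

prompt_generator.run(
    storage=storage.step(),
    input_audio_key="audio",
    input_conversation_key="conversation",
    output_answer_key="answer",
)
```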

docs/zh/notes/mm_guide/audio_understanding/audio_asr_pipeline.md

Lines changed: 7 additions & 9 deletions
````diff
@@ -18,10 +18,10 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU devices
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.core_audio import (
     SileroVADGenerator,
-    MergeChunksByTimestamps,
-    PromptedAQAGenerator,
-    # CTCForcedAlignFilter, # Import the filter operator instead if filtering rather than evaluating
-    CTCForcedAlignSampleEvaluator,
+    MergeChunksRowGenerator,
+    PromptedAQAGenerator,
+    # CTCForcedAlignmentFilter, # Import the filter operator instead if filtering rather than evaluating
+    CTCForcedAlignmentSampleEvaluator,
 )
 from dataflow.serving import LocalModelVLMServing_vllm
 from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
@@ -56,20 +56,20 @@ class Pipeline:
             num_workers=2, # num_workers is the number of processes; each process starts one model, spread evenly across the devices in the device list
         )
 
-        self.merger = MergeChunksByTimestamps(num_workers=2)
+        self.merger = MergeChunksRowGenerator(num_workers=2)
 
         self.prompted_generator = PromptedAQAGenerator(
            vlm_serving=self.serving,
             system_prompt=WhisperTranscriptionPrompt().generate_prompt(language="german", task="transcribe", with_timestamps=False),
         )
 
-        # self.filter = CTCForcedAlignFilter(
+        # self.filter = CTCForcedAlignmentFilter(
         #     model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
         #     device=["cuda:3"],
         #     num_workers=1,
         # )
 
-        self.evaluator = CTCForcedAlignSampleEvaluator(
+        self.evaluator = CTCForcedAlignmentSampleEvaluator(
             model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
             device=["cuda:3"], # list of GPUs the model can be loaded on
             num_workers=2, # num_workers is the number of processes; each process starts one model, spread evenly across the devices in the device list
@@ -89,7 +89,6 @@ class Pipeline:
             return_seconds=True,
             time_resolution=1,
             neg_threshold=0.35,
-            window_size_samples=512,
             min_silence_at_max_speech=0.098,
             use_max_poss_sil_at_max_speech=True
         )
@@ -151,5 +150,4 @@ class Pipeline:
 if __name__ == "__main__":
     pipeline = Pipeline()
     pipeline.forward()
-
 ```
````
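
Beyond the renames, the one behavioral change in this file is dropping `window_size_samples=512` from the VAD step. A sketch of the remaining VAD configuration; treating these keywords as arguments to the SileroVADGenerator run step is an assumption based on where the hunk sits, and the constructor and storage wiring are likewise assumed:

```python
from dataflow.operators.core_audio import SileroVADGenerator
from dataflow.utils.storage import FileStorage

storage = FileStorage(
    first_entry_file_name="./dataflow/example/whisper_transcription/sample_data.jsonl",  # assumed input
    cache_path="./cache",
    file_name_prefix="audio_asr",
    cache_type="jsonl",
)

# Constructor arguments are assumed; the diff only shows the run-time keywords.
vad = SileroVADGenerator(device=["cuda:0"], num_workers=2)
vad.run(
    storage=storage.step(),           # assumed wiring
    return_seconds=True,              # timestamps in seconds rather than samples
    time_resolution=1,
    neg_threshold=0.35,               # below this speech probability a frame counts as silence
    min_silence_at_max_speech=0.098,
    use_max_poss_sil_at_max_speech=True,
    # window_size_samples=512 was removed by this commit; the VAD now uses its default window
)
```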

docs/zh/notes/mm_guide/audio_understanding/audio_caption.md

Lines changed: 24 additions & 25 deletions
````diff
@@ -8,55 +8,54 @@ permalink: /zh/mm_guide/2gjc47qb/
 
 ## Audio Caption Generation
 
-<!-- ## Step 1: Prepare the Dataflow environment
-```bash
-conda create -n myvenv python=3.12
-pip install open-dataflow
-pip install open-dataflow[vllm]
-```
-
-## Step 2: Install the Dataflow audio module
-```bash
-pip install open-dataflow[audio]
-``` -->
-
 ## Step 1: Install Environment
 [Audio Environment Installation](./install_audio_understanding.md)
 
-## Step 2: Start the Local Model Service
+## Step 2: Import Relevant Packages
+```python
+from dataflow.operators.core_audio import PromptedAQAGenerator
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.prompts.audio import AudioCaptionGeneratorPrompt
+```
+
+## Step 3: Start the Local Model Service
 The local model serving method is as follows:
 ```python
-llm_serving = LocalModelLLMServing_vllm(
-    hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct", # set to your own model path
+vlm_serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct", # set to your own model path
     vllm_tensor_parallel_size=2,
     vllm_max_tokens=8192,
     vllm_gpu_memory_utilization=0.7
 )
 ```
 
-## Step 3: Fill in the audio paths in the following format to prepare the data to be captioned
+## Step 4: Fill in the audio paths in the following format to prepare the data to be captioned
 ```jsonl
-{"audio": ["your_audio_path"]}
+{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"], "conversation": [{"from": "human", "value": "<audio>\nTranscribe the audio into Chinese." }]}
+{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/audio_vqa/Santa%20Motor.wav"], "conversation": [{"from": "human", "value": "<audio>\nDescribe the sound in this audio clip." }]}
+
 ```
 
-## Step 4: Add the data path to FileStorage in the following format
+## Step 5: Add the data path to FileStorage in the following format
 ```python
 storage = FileStorage(
-    first_entry_file_name="your_path",
+    first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
     cache_path="./cache",
     file_name_prefix="audio_caption",
     cache_type="jsonl",
-    media_key="audio",
-    media_type="audio"
 )
 ```
 
-## Step 5: Initialize the CaptionGenerator operator
+## Step 6: Initialize the PromptedAQAGenerator operator
 ```python
-generator = CaptionGenerator(llm_serving)
+prompt_generator = PromptedAQAGenerator(
+    vlm_serving=vlm_serving,
+    system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
+)
 ```
 
-## Step 6: Execute the operator
+## Step 7: Execute the operator
 ```python
-generator.run(storage=storage.step(), output_key="caption")
+prompt_generator.run(storage=storage.step(), output_key="caption")
 ```
````
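
Once `prompt_generator.run` completes, FileStorage writes the step's rows as jsonl under `cache_path` using `file_name_prefix`. A quick way to inspect the generated captions; the exact cache-file naming pattern is an assumption, not something this diff specifies:

```python
import glob
import json

# FileStorage caches each step as jsonl under cache_path; matching on the
# "audio_caption" prefix is an assumed naming convention.
for path in sorted(glob.glob("./cache/audio_caption*.jsonl")):
    with open(path, encoding="utf-8") as f:
        for line in f:
            row = json.loads(line)
            print(row.get("audio"), "->", row.get("caption"))
```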

docs/zh/notes/mm_guide/audio_understanding/install_audio_understanding.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -11,5 +11,5 @@ conda create -n myvenv python=3.10
 conda activate myvenv
 
 cd ./DataFlow-MM
-pip install open-dataflow[audio]
+pip install open-dataflow-mm[audio]
 ```
````
