# Dodrio
Data format designed for TTS training
### 数据准备
首先需要准备含有wav或mp3格式音频的文件夹
```python
test_data_dir = '/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/origin_data/test_data'
```
确定输出文件夹路径
```python
import os
outdir = '/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout'
dataname = 'test'
stockdir = outdir + '/stockdir'
usagedir = outdir + '/usagedir'
parquet_dir = os.path.join(stockdir, dataname, 'parquet_dir')
pack_dir = os.path.join(usagedir, dataname, 'pack_dir')
```
- parquet_dir 为 parquet 格式数据包的文件夹
- pack_dir 为 package 格式数据包的文件夹
- info_outdir 为 text 和 spk 等信息的存储文件夹(在下文“打包文本信息”一节中定义)
### 数据打包
```python
import dodrio
# 输入 test_data_dir 生成 parquet 数据包
dodrio.gen_parquet(test_data_dir, parquet_dir, mid_name=dataname, file_type='wav')
# 输入 test_data_dir 生成 package 数据包 注意需要指定 采样率,pack会统一音频采样率存储
dodrio.gen_package(test_data_dir, pack_dir, mid_name=dataname, target_sample_rate=48000, file_type='wav')
# 也可以通过parquet数据格式生成package数据包
dodrio.parquet2package(parquet_dir, pack_dir, sample_rate=48000)
```
### 还原音频
将数据包中数据还原成音频
```python
reout = '/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/reout'
re_paruquet = os.path.join(reout, 'reparquet_dir')
re_pack = os.path.join(reout, 'repack_dir')
# parquet还原音频是还原成原始的格式,比如之前是mp3还原后还是mp3,且采样率这些不变
dodrio.parquet2wav(parquet_dir, re_paruquet)
# package 还原音频只会还原成 特定采样率的wav,对应采样率在一开始打包的时候已经设定好,且比特率和通道这些都固定
dodrio.package2wav(pack_dir, re_pack)
```
### 打包文本信息
打包存储 text 等信息
```python
# parquet
info_type = 'libritts'
info_outdir = os.path.join(stockdir, dataname, 'info_dir')
dodrio.gen_infodir(parquet_dir, test_data_dir, info_outdir, info_type, kl=['text', 'unnorm_text'], lang='en', from_type='parquet')
# package
info_type = 'libritts'
pack_info_outdir = os.path.join(usagedir, dataname, 'info_dir')
dodrio.gen_infodir(pack_dir, test_data_dir, pack_info_outdir, info_type, kl=['text', 'unnorm_text'], lang='en', from_type='pack')
```
parquet 和 package 调用的函数相同,都是 gen_infodir 。
需要注意的是,因为原始数据的存储方式千奇百怪,且文本不会按照唯一方式存储,所以不同数据类型实际调用的访问函数是不同的。这里预设了几种数据类型,比如上面的代码就是按 libritts 的数据格式加载。
函数的第一个参数(示例中为 parquet_dir 或 pack_dir)为打包好的音频数据文件夹,载入它主要是为了与打包数据的分块列表保持一致;第二个参数 test_data_dir 为文本等信息存储的文件夹;第三个参数(示例中为 info_outdir 或 pack_info_outdir)为 info 的输出文件夹。
参数 info_type 用于指定数据类型,目前只支持几种特定数据类型。参数 kl 即 keys list,因为有时同一条数据会有不同版本的文本,所以在这里设定一个辅助参数。参数 lang 为 language 的默认值,数据文件有时不带语种标签,可以在这里强制指定。
### 特征提取
可以用 extract_feat 提取特征并存储
```python
dodrio.extract_feat(extractor_func, featname, input_dir, out_dir, from_type, **params)
```
extractor_func 为特征提取函数,featname 为对应特征名,input_dir 为对应的 package 数据包,out_dir 为输出文件夹,from_type 为输入的数据包类型(目前仅支持 package),params 为特征提取所需的额外参数。
以cosyvoice的embedding举例,目前内置了对应的特征提取函数
```python
# 预设模型加载
from dodrio.afeat.exp_fun import extractor_embedding
tt = extractor_embedding(onnx_path)
extractor_func = tt.extractor
from_type = 'package'
featname = 'embed'
input_dir = pack_dir
out_dir = os.path.join(usagedir, dataname, featname+'_dir')
# 准备需要的额外参数 utt2spk
utt2spk = dodrio.get_utt2spk(info_outdir)
# 提取 embedding
dodrio.extract_feat(extractor_func, featname, input_dir, out_dir, from_type, utt2spk=utt2spk)
# 根据 spk 计算 spk embedding 均值
tt.mean_spk_embedding()
# 提取 spk 的平均 embedding
featname = 'spkembed'
input_dir = pack_dir
out_dir = os.path.join(usagedir, dataname, featname+'_dir')
extractor_func = tt.spk_embedding_save
dodrio.extract_feat(extractor_func, featname, input_dir, out_dir, from_type, utt2spk=utt2spk)
```
### 准备训练所需列表
目前也有预设的列表准备版本
```python
# supdir_list 可以包含多个数据包 目录
supdir_list = [os.path.join(usagedir, dataname)]
listoutdir= 'listoutdir'
# featlist 为需要添加的特征
featlist= ['embed', 'spkembed', 'speechtoken']
# check_func 为数据筛选函数, prefix 为 数据表名前缀
dodrio.gen_datalist(supdir_list, listoutdir, featlist, dodrio.check_func, prefix='test')
```
### 数据读取
以上面生成的数据列表为例,其中的单条数据可以通过 load_data_from_line 加载
```python
infoline = '296_142727_000010_000000|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/pack_dir/wav_test_00000.pack|119946232|121158716|4|This reduction, if admitted, would much facilitate the introduction of emotion into our system, which, being founded on the distinction between the consciousness and the object, is likewise an intellectualist system.|en|embed|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/embed_dir/wav_test_00000.embed|135168|135936|192|spkembed|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/spkembed_dir/wav_test_00000.spkembed|135168|135936|192|speechtoken|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/speechtoken_dir/wav_test_00000.speechtoken|250040|252568|632'
data_dict = dodrio.load_data_from_line(infoline)
# data_dict.keys()
# dict_keys(['uttid', 'audio', 'spkid', 'text', 'language', 'embed', 'spkembed', 'speechtoken'])
```
Raw data
{
"_id": null,
"home_page": "https://github.com/yixiangchen1995/python-Dodrio",
"name": "dodrio",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": "python, first package",
"author": "Yixiang Chen",
"author_email": "<yixiangchen1995@gmail.com>",
"download_url": "https://files.pythonhosted.org/packages/ce/4f/24addc80ac2856b77c1aa52d88c9d9d1141d41a327d2ada48dfcdec117c3/dodrio-0.3.6.tar.gz",
"platform": null,
"description": "# Dodrio\nData format designed for TTS training\n\n### \u6570\u636e\u51c6\u5907\n\u9996\u5148\u9700\u8981\u51c6\u5907\u542b\u6709wav\u6216mp3\u683c\u5f0f\u97f3\u9891\u7684\u6587\u4ef6\u5939 \n\n```python\ntest_data_dir = '/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/origin_data/test_data'\n```\n\n\u786e\u5b9a\u8f93\u51fa\u6587\u4ef6\u5939\u8def\u5f84\n\n```python\nimport os\noutdir = '/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout'\ndataname = 'test'\nstockdir = outdir + '/stockdir'\nusagedir = outdir + '/usagedir'\nparquet_dir = os.path.join(stockdir, dataname, 'parquet_dir')\npack_dir = os.path.join(usagedir, dataname, 'pack_dir')\n\n```\n\nparquet_dir \u4e3a parquet \u683c\u5f0f\u6587\u4ef6\u5939\npack_dir \u4e3apackage\u683c\u5f0f\u6587\u4ef6\u5939\ninfo_outdir \u4e3atext\u548cspk\u7b49\u4fe1\u606f\u7684\u5b58\u50a8\u6587\u4ef6\u5939\n\n### \u6570\u636e\u6253\u5305\n\n```python\nimport dodrio\n\n# \u8f93\u5165 test_data_dir \u751f\u6210 parquet \u6570\u636e\u5305\ndodrio.gen_parquet(test_data_dir, parquet_dir, mid_name=dataname, file_type='wav')\n\n# \u8f93\u5165 test_data_dir \u751f\u6210 package \u6570\u636e\u5305 \u6ce8\u610f\u9700\u8981\u6307\u5b9a \u91c7\u6837\u7387\uff0cpack\u4f1a\u7edf\u4e00\u97f3\u9891\u91c7\u6837\u7387\u5b58\u50a8\ndodrio.gen_package(test_data_dir, pack_dir, mid_name=dataname, target_sample_rate=48000, file_type='wav')\n\n# \u4e5f\u53ef\u4ee5\u901a\u8fc7parquet\u6570\u636e\u683c\u5f0f\u751f\u6210package\u6570\u636e\u5305\ndodrio.parquet2package(parquet_dir, pack_dir, sample_rate=48000)\n```\n\n### \u8fd8\u539f\u97f3\u9891\n\u5c06\u6570\u636e\u5305\u4e2d\u6570\u636e\u8fd8\u539f\u6210\u97f3\u9891\n\n```python\nreout = '/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/reout'\nre_paruquet = os.path.join(reout, 'reparquet_dir')\nre_pack = os.path.join(reout, 'repack_dir')\n\n# 
parquet\u8fd8\u539f\u97f3\u9891\u662f\u8fd8\u539f\u6210\u539f\u59cb\u7684\u683c\u5f0f\uff0c\u6bd4\u5982\u4e4b\u524d\u662fmp3\u8fd8\u539f\u540e\u8fd8\u662fmp3\uff0c\u4e14\u91c7\u6837\u7387\u8fd9\u4e9b\u4e0d\u53d8\ndodrio.parquet2wav(parquet_dir, re_paruquet)\n\n# package \u8fd8\u539f\u97f3\u9891\u53ea\u4f1a\u8fd8\u539f\u6210 \u7279\u5b9a\u91c7\u6837\u7387\u7684wav\uff0c\u5bf9\u5e94\u91c7\u6837\u7387\u5728\u4e00\u5f00\u59cb\u6253\u5305\u7684\u65f6\u5019\u5df2\u7ecf\u8bbe\u5b9a\u597d\uff0c\u4e14\u6bd4\u7279\u7387\u548c\u901a\u9053\u8fd9\u4e9b\u90fd\u56fa\u5b9a\ndodrio.package2wav(pack_dir, re_pack)\n```\n\n\n### \u6253\u5305\u6587\u672c\u4fe1\u606f\n\u6253\u5305\u5b58\u50a8 text \u7b49\u4fe1\u606f\n\n```python\n\n# parquet\ninfo_type = 'libritts'\ninfo_outdir = os.path.join(stockdir, dataname, 'info_dir')\ndodrio.gen_infodir(parquet_dir, test_data_dir, info_outdir, info_type, kl=['text', 'unnorm_text'], lang='en', from_type='parquet')\n\n# package\ninfo_type = 'libritts'\npack_info_outdir = os.path.join(usagedir, dataname, 'info_dir')\ndodrio.gen_infodir(pack_dir, test_data_dir, pack_info_outdir, info_type, kl=['text', 'unnorm_text'], lang='en', from_type='pack')\n```\n\nparquet \u548c package \u8c03\u7528\u7684\u51fd\u6570\u76f8\u540c\uff0c\u90fd\u662f gen_infodir \u3002\n\n\u9700\u8981\u6ce8\u610f\u7684\u662f \u56e0\u4e3a\u539f\u59cb\u6570\u636e\u7684\u5b58\u50a8\u65b9\u5f0f\u5343\u5947\u767e\u602a\uff0c\u4e14\u6587\u672c\u4e0d\u4f1a\u6309\u7167\u552f\u4e00\u65b9\u5f0f\u5b58\u50a8\uff0c\u6240\u4ee5\u8c03\u7528\u7684\u8bbf\u95ee\u51fd\u6570\u5b9e\u9645\u662f\u4e0d\u540c\u7684\uff0c\u8fd9\u91cc\u9884\u8bbe\u4e86\u51e0\u79cd\u6570\u636e\u7c7b\u578b\uff0c\u6bd4\u5982\u4e0a\u9762\u7684\u4ee3\u7801\u4e2d\u5c31\u662f\u4ece libritts\u91cc\u52a0\u8f7d\u6570\u636e\u683c\u5f0f\n\n\u51fd\u6570\u4e2d\u7684\u53c2\u6570 \u7b2c\u4e00\u4e2a parquet_dir \u4e3a 
\u6253\u5305\u597d\u7684\u97f3\u9891\u6570\u636e\u6587\u4ef6\u5939\uff0c\u8fd9\u91cc\u4e3b\u8981\u662f\u4e3a\u4e86\u548c\u6253\u5305\u6570\u636e\u5206\u5757\u5217\u8868\u4e00\u81f4\u6240\u4ee5\u8f7d\u5165\uff1b\u7b2c\u4e8c\u4e2a\u53c2\u6570 test_data_dir\u4e3a\u6587\u672c\u7b49\u4fe1\u606f\u5b58\u50a8\u7684\u6587\u4ef6\u5939\uff1b\u7b2c\u4e09\u4e2a\u53c2\u6570 info_outdir \u4e3a info\u7684\u8f93\u51fa\u6587\u4ef6\u5939\u3002\n\n\u53c2\u6570 info_type \u4e3a\u6307\u5b9a\u6570\u636e\u7c7b\u578b\uff0c\u76ee\u524d\u53ea\u652f\u6301\u51e0\u79cd\u7279\u5b9a\u6570\u636e\u7c7b\u578b\u3002 \u53c2\u6570 kl \u662fkeys list \u8fd9\u662f\u56e0\u4e3a\u6709\u65f6\u6587\u672c\u6709\u4e0d\u540c\u7248\u672c\u7684\u6587\u672c\uff0c\u6240\u4ee5\u5728\u8fd9\u91cc\u8bbe\u5b9a\u4e00\u4e2a\u5e2e\u52a9\u53c2\u6570\uff0c \u53c2\u6570 lang \u4e3alanguage\u7684\u9ed8\u8ba4\u503c\uff0c\u6570\u636e\u6587\u4ef6\u6709\u65f6\u4f1a\u4e0d\u5e26\u8bed\u79cd\u6807\u7b7e\uff0c\u5728\u8fd9\u91cc\u53ef\u4ee5\u786c\u6307\u5b9a\u3002\n\n### \u7279\u5f81\u63d0\u53d6\n\n\u53ef\u4ee5\u7528 extract_feat \u63d0\u53d6\u7279\u5f81\u5e76\u5b58\u50a8\n\n```python\ndodrio.extract_feat(extractor_func, featname, input_dir, out_dir, from_type, **params)\n```\n\nextractor_func \u4e3a \u7279\u5f81\u63d0\u53d6\u51fd\u6570\uff0c featname \u4e3a\u5bf9\u5e94\u7279\u5f81\u540d\uff0c input_dir\u4e3a\u5bf9\u5e94 package\u6570\u636e\u5305\uff0cout_dir\u4e3a\u8f93\u51fa\u6587\u4ef6\u5939\uff0c from_type\u4e3a\u8f93\u5165\u7684\u6570\u636e\u5305\u7c7b\u578b\uff08\u76ee\u524d\u4ec5\u652f\u6301 package\uff09\uff1bparams\u4e3a\u7279\u5f81\u63d0\u53d6\u6240\u9700\u989d\u5916\u53c2\u6570\n\n\u4ee5cosyvoice\u7684embedding\u4e3e\u4f8b\uff0c\u76ee\u524d\u5185\u7f6e\u4e86\u5bf9\u5e94\u7684\u7279\u5f81\u63d0\u53d6\u51fd\u6570\n\n```python\n\n# \u9884\u8bbe\u6a21\u578b\u52a0\u8f7d\nfrom dodrio.afeat.exp_fun import extractor_embedding\ntt = extractor_embedding(onnx_path)\nextractor_func = tt.extractor\n\nfrom_type = 'package'\nfeatname = 
'embed'\ninput_dir = pack_dir\nout_dir = os.path.join(usagedir, dataname, featname+'_dir')\n\n# \u51c6\u5907\u9700\u8981\u7684\u989d\u5916\u53c2\u6570 utt2spk\nutt2spk = dodrio.get_utt2spk(info_outdir)\n\n# \u63d0\u53d6 embedding\ndodrio.extract_feat(extractor_func, featname, input_dir, out_dir, from_type, utt2spk=utt2spk)\n\n# \u6839\u636e spk \u8ba1\u7b97 spk embedding \u5747\u503c\ntt.mean_spk_embedding()\n\n# \u63d0\u53d6 spk \u7684\u5e73\u5747 embedding\nfeatname = 'spkembed'\ninput_dir = pack_dir\nout_dir = os.path.join(usagedir, dataname, featname+'_dir')\nextractor_func = tt.spk_embedding_save\ndodrio.extract_feat(extractor_func, featname, input_dir, out_dir, from_type, utt2spk=utt2spk)\n```\n\n### \u51c6\u5907\u8bad\u7ec3\u6240\u9700\u5217\u8868\n\n\u76ee\u524d\u4e5f\u6709\u9884\u8bbe\u7684\u5217\u8868\u51c6\u5907\u7248\u672c\n\n```python\n# supdir_list \u53ef\u4ee5\u5305\u542b\u591a\u4e2a\u6570\u636e\u5305 \u76ee\u5f55\nsupdir_list = [os.path.join(usagedir, dataname)]\nlistoutdir= 'listoutdir'\n# featlist \u4e3a\u9700\u8981\u6dfb\u52a0\u7684\u7279\u5f81\nfeatlist= ['embed', 'spkembed', 'speechtoken']\n\n# check_func \u4e3a\u6570\u636e\u7b5b\u9009\u51fd\u6570\uff0c prefix \u4e3a \u6570\u636e\u8868\u540d\u524d\u7f00\ndodrio.gen_datalist(supdir_list, listoutdir, featlist, dodrio.check_func, prefix='test')\n```\n\n### \u6570\u636e\u8bfb\u53d6\n\n\u4ee5\u4e0a\u9762\u8868\u683c\u4e3a\u4f8b\uff0c\u52a0\u8f7d\u5355\u6761\u6570\u636e\u53ef\u4ee5\u901a\u8fc7 load_data_from_line \u5f97\u5230\n\n```python\ninfoline = '296_142727_000010_000000|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/pack_dir/wav_test_00000.pack|119946232|121158716|4|This reduction, if admitted, would much facilitate the introduction of emotion into our system, which, being founded on the distinction between the consciousness and the object, is likewise an intellectualist 
system.|en|embed|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/embed_dir/wav_test_00000.embed|135168|135936|192|spkembed|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/spkembed_dir/wav_test_00000.spkembed|135168|135936|192|speechtoken|/home/jovyan/chenyixiang/workspace/20250324_dodrio/testdata/testout/usagedir/test/speechtoken_dir/wav_test_00000.speechtoken|250040|252568|632'\n\ndata_dict = dodrio.load_data_from_line(infoline)\n# data_dict.keys()\n# dict_keys(['uttid', 'audio', 'spkid', 'text', 'language', 'embed', 'spkembed', 'speechtoken'])\n\n```\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Data Package for TTS ",
"version": "0.3.6",
"project_urls": {
"Homepage": "https://github.com/yixiangchen1995/python-Dodrio"
},
"split_keywords": [
"python",
" first package"
],
"urls": [
{
"comment_text": null,
"digests": {
"blake2b_256": "3d9a65ed89da9e7c7b4d16e5c3e2f0e543e218e0a6f59b0c671a0662c939c023",
"md5": "44e4a462ce94e253b646f49e3c4ec93b",
"sha256": "efe57a2462767359bcd3c6921a10c6f03e363ad22fbc74fc07677c08fedd22e8"
},
"downloads": -1,
"filename": "dodrio-0.3.6-py3-none-any.whl",
"has_sig": false,
"md5_digest": "44e4a462ce94e253b646f49e3c4ec93b",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 24219,
"upload_time": "2025-08-18T08:19:39",
"upload_time_iso_8601": "2025-08-18T08:19:39.329317Z",
"url": "https://files.pythonhosted.org/packages/3d/9a/65ed89da9e7c7b4d16e5c3e2f0e543e218e0a6f59b0c671a0662c939c023/dodrio-0.3.6-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": null,
"digests": {
"blake2b_256": "ce4f24addc80ac2856b77c1aa52d88c9d9d1141d41a327d2ada48dfcdec117c3",
"md5": "9946b146f5d6e7f37f13713b53f0314f",
"sha256": "db6670cf06b2b65987a1a6d94eec308990dad99d949c50b89a70b7b0e021a3ee"
},
"downloads": -1,
"filename": "dodrio-0.3.6.tar.gz",
"has_sig": false,
"md5_digest": "9946b146f5d6e7f37f13713b53f0314f",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 21593,
"upload_time": "2025-08-18T08:19:40",
"upload_time_iso_8601": "2025-08-18T08:19:40.820024Z",
"url": "https://files.pythonhosted.org/packages/ce/4f/24addc80ac2856b77c1aa52d88c9d9d1141d41a327d2ada48dfcdec117c3/dodrio-0.3.6.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2025-08-18 08:19:40",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "yixiangchen1995",
"github_project": "python-Dodrio",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"lcname": "dodrio"
}