# fastdatasets

fastdatasets: datasets for tfrecords

- Version: 0.9.17
- Homepage: https://github.com/ssbuild/fastdatasets
- Author: ssbuild
- License: Apache 2.0
- Requires Python: >=3, <4
- Keywords: fastdatasets, tfrecords, dataset, datasets
- Upload time: 2023-10-28 04:01:06
## Update history

```text
2023-10-28: support more well-known torch datasets
2023-07-08: support some nested cases
2023-07-02: support arrow and parquet
2023-04-28: fix lmdb multiprocess
2023-02-13: add TopDataset with iterable_dataset and patch
2022-12-07: fix a bug in RandomDataset batch remainder handling
2022-11-07: add numpy writer and parser, add memory writer and parser
2022-10-29: add kv dataset
```

## Usage

See [numpy_io](https://github.com/ssbuild/numpy_io) for additional usage examples.

## Install
```commandline
pip install -U fastdatasets
```
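
A quick post-install sanity check (a minimal sketch; it assumes the package exposes `__version__` and falls back to a plain import if it does not):

```python
# Import the package; print its version if the attribute exists, otherwise just confirm the import.
import fastdatasets
print(getattr(fastdatasets, "__version__", "installed"))
```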


### 1. Record writer demo

```python
import data_serialize
from fastdatasets.record import load_dataset, gfile,TFRecordOptions, TFRecordCompressionType, TFRecordWriter

# The Example/Features structure is compatible with tensorflow datasets
def test_write_feature():
    options = 'GZIP'

    def test_write(filename, N=3, context='aaa'):
        with TFRecordWriter(filename, options=options) as file_writer:
            for _ in range(N):
                val1 = data_serialize.Int64List(value=[1, 2, 3] * 20)
                val2 = data_serialize.FloatList(value=[1, 2, 3] * 20)
                val3 = data_serialize.BytesList(value=[b'The china', b'boy'])
                feature = data_serialize.Features(feature=
                {
                    "item_0": data_serialize.Feature(int64_list=val1),
                    "item_1": data_serialize.Feature(float_list=val2),
                    "item_2": data_serialize.Feature(bytes_list=val3)
                }
                )
                example = data_serialize.Example(features=feature)
                file_writer.write(example.SerializeToString())

    test_write('d:/example.tfrecords0', 3, 'file0')
    test_write('d:/example.tfrecords1', 10, 'file1')
    test_write('d:/example.tfrecords2', 12, 'file2')


# Write arbitrary strings
def test_write_string():
    options = 'GZIP'

    def test_write(filename, N=3, context='aaa'):
        with TFRecordWriter(filename, options=options) as file_writer:
            for _ in range(N):
                # x, y = np.random.random(), np.random.random()
                file_writer.write(context + '____' + str(_))

    test_write('d:/example.tfrecords0', 3, 'file0')
    test_write('d:/example.tfrecords1', 10, 'file1')
    test_write('d:/example.tfrecords2', 12, 'file2')



```
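
To read these records back, the record loaders shown in the later sections can be reused; a minimal sketch (it assumes `data_serialize.Example` is a protobuf message, so `Example.FromString` is available, and that the GZIP option matches the writer above):

```python
import data_serialize
from fastdatasets.record import load_dataset, RECORD

options = RECORD.TFRecordOptions(compression_type='GZIP')
# Iterate over the raw serialized records and decode each one back into an Example.
ds = load_dataset.IterableDataset('d:/example.tfrecords0', options=options)
for i, raw in enumerate(ds):
    example = data_serialize.Example.FromString(raw)
    print(i, example)
```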

### 2. Record simple writer demo

```python
# @Time    : 2022/9/18 23:27
import numpy as np
from fastdatasets.record import load_dataset
from fastdatasets.record import RECORD, WriterObject,FeatureWriter,StringWriter,PickleWriter,DataType,NumpyWriter

filename = r'd:\example_writer.record'

def test_writer(filename):
    print('test_feature ...')
    options = RECORD.TFRecordOptions(compression_type='GZIP')
    f = NumpyWriter(filename,options=options)

    values = []
    n = 30
    for i in range(n):
        train_node = {
            "index": np.asarray(i, dtype=np.int64),
            'image': np.random.rand(3, 4),
            'labels': np.random.randint(0, 21128, size=(10), dtype=np.int64),
            'bdata': np.asarray(b'11111111asdadasdasdaa')
        }

        values.append(train_node)
        if (i + 1) % 10000 == 0:
            f.write_batch( values)
            values.clear()
    if len(values):
        f.write_batch(values)
    f.close()

def test_iterable(filename):
    options = RECORD.TFRecordOptions(compression_type='GZIP')
    datasets = load_dataset.IterableDataset(filename, options=options).parse_from_numpy_writer()
    for i, d in enumerate(datasets):
        print(i, d)

def test_random(filename):
    options = RECORD.TFRecordOptions(compression_type='GZIP')
    datasets = load_dataset.RandomDataset(filename, options=options).parse_from_numpy_writer()
    print(len(datasets))
    for i in range(len(datasets)):
        d = datasets[i]
        print(i, d)

test_writer(filename)
test_iterable(filename)
test_random(filename)
```

### 3. IterableDataset demo

```python
from fastdatasets.record import load_dataset, gfile, RECORD

data_path = gfile.glob('d:/example.tfrecords*')
options = RECORD.TFRecordOptions(compression_type=None)
base_dataset = load_dataset.IterableDataset(data_path, cycle_length=1,
                                            block_length=1,
                                            buffer_size=128,
                                            options=options,
                                            with_share_memory=True)


def test_batch():
    num = 0
    for _ in base_dataset:
        num += 1
    print('base_dataset num', num)

    base_dataset.reset()
    ds = base_dataset.repeat(2).repeat(2).repeat(3).map(lambda x: x + bytes('_aaaaaaaaaaaaaa', encoding='utf-8'))
    num = 0
    for _ in ds:
        num += 1

    print('repeat(2).repeat(2).repeat(3) num ', num)


def test_torch():
    def filter_fn(x):
        if x == b'file2____2':
            return True
        return False

    base_dataset.reset()
    dataset = base_dataset.filter(filter_fn).interval(2, 0)
    i = 0
    for d in dataset:
        i += 1
        print(i, d)

    base_dataset.reset()
    dataset = base_dataset.batch(3)
    i = 0
    for d in dataset:
        i += 1
        print(i, d)

    # torch.utils.data.IterableDataset
    from fastdatasets.torch_dataset import IterableDataset
    dataset.reset()
    ds = IterableDataset(dataset=dataset)
    for d in ds:
        print(d)


def test_multiprocess():
    print('multiprocess 0...')
    base_dataset.reset()
    dataset = base_dataset.shard(num_shards=3, index=0)
    i = 0
    for d in dataset:
        i += 1
        print(i, d)

    print('multiprocess 1...')
    base_dataset.reset()
    dataset = base_dataset.shard(num_shards=3, index=1)
    i = 0
    for d in dataset:
        i += 1
        print(i, d)

    print('multiprocess 2...')
    base_dataset.reset()
    dataset = base_dataset.shard(num_shards=3, index=2)
    i = 0
    for d in dataset:
        i += 1
        print(i, d)

```
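
For training loops, the wrapped iterable dataset can be fed straight into `torch.utils.data.DataLoader`; a minimal, self-contained sketch (the batch size and the pass-through `collate_fn` are illustrative, since the records here are raw bytes):

```python
from torch.utils.data import DataLoader
from fastdatasets.record import load_dataset, gfile, RECORD
from fastdatasets.torch_dataset import IterableDataset

data_path = gfile.glob('d:/example.tfrecords*')
options = RECORD.TFRecordOptions(compression_type=None)
record_ds = load_dataset.IterableDataset(data_path, options=options, with_share_memory=True)

# Keep raw bytes; a real pipeline would decode/convert to tensors in map() or in collate_fn.
loader = DataLoader(IterableDataset(dataset=record_ds), batch_size=4,
                    collate_fn=lambda batch: batch)
for batch in loader:
    print(batch)
```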



### 4. RandomDataset demo

```python
from fastdatasets.record import load_dataset, gfile, RECORD

data_path = gfile.glob('d:/example.tfrecords*')
options = RECORD.TFRecordOptions(compression_type=None)
dataset = load_dataset.RandomDataset(data_path, options=options,
                                     with_share_memory=True)

dataset = dataset.map(lambda x: x + b"adasdasdasd")
print(len(dataset))

for i in range(len(dataset)):
    print(i + 1, dataset[i])

print('batch...')
dataset = dataset.batch(7)
for i in range(len(dataset)):
    print(i + 1, dataset[i])

print('unbatch...')
dataset = dataset.unbatch()
for i in range(len(dataset)):
    print(i + 1, dataset[i])

print('shuffle...')
dataset = dataset.shuffle(10)
for i in range(len(dataset)):
    print(i + 1, dataset[i])

print('map...')
dataset = dataset.map(transform_fn=lambda x: x + b'aa22222222222222222222222222222')
for i in range(len(dataset)):
    print(i + 1, dataset[i])

print('torch Dataset...')
from fastdatasets.torch_dataset import Dataset

d = Dataset(dataset)
for i in range(len(d)):
    print(i + 1, d[i])


```
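
The map-style wrapper works the same way with `DataLoader`, including shuffling; a minimal sketch (again keeping the raw bytes and using an illustrative batch size):

```python
from torch.utils.data import DataLoader
from fastdatasets.record import load_dataset, gfile, RECORD
from fastdatasets.torch_dataset import Dataset

data_path = gfile.glob('d:/example.tfrecords*')
options = RECORD.TFRecordOptions(compression_type=None)
record_ds = load_dataset.RandomDataset(data_path, options=options, with_share_memory=True)

# shuffle=True uses DataLoader's sampler; collate_fn keeps the raw bytes untouched.
loader = DataLoader(Dataset(record_ds), batch_size=4, shuffle=True,
                    collate_fn=lambda batch: batch)
for batch in loader:
    print(batch)
```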



### 5. LevelDB dataset

```python
# @Time    : 2022/10/27 20:37
# @Author  : tk
import numpy as np
from tqdm import tqdm
from fastdatasets.leveldb import DB,load_dataset,WriterObject,DataType,StringWriter,JsonWriter,FeatureWriter,NumpyWriter

db_path = 'd:\\example_leveldb_numpy'

def test_write(db_path):
    options = DB.LeveldbOptions(create_if_missing=True,error_if_exists=False)
    f = NumpyWriter(db_path, options = options)
    keys,values = [],[]
    n = 30
    for i in range(n):
        train_node = {
            "index":np.asarray(i,dtype=np.int64),
            'image': np.random.rand(3,4),
            'labels': np.random.randint(0,21128,size=(10),dtype=np.int64),
            'bdata': np.asarray(b'11111111asdadasdasdaa')
        }
        keys.append('input{}'.format(i))
        values.append(train_node)
        if (i+1) % 10000 == 0:
            f.put_batch(keys,values)
            keys.clear()
            values.clear()
    if len(keys):
        f.put_batch(keys, values)
        
    f.get_writer.put('total_num',str(n))
    f.close()



def test_random(db_path):
    options = DB.LeveldbOptions(create_if_missing=False, error_if_exists=False)
    dataset = load_dataset.RandomDataset(db_path,
                                        data_key_prefix_list=('input',),
                                        num_key='total_num',
                                        options = options)

    dataset = dataset.parse_from_numpy_writer().shuffle(10)
    print(len(dataset))
    for i in tqdm(range(len(dataset)),total=len(dataset)):
        d = dataset[i]
        print(i,d)

test_write(db_path)
test_random(db_path)

```


### 6. LMDB dataset

```python
# @Time    : 2022/10/27 20:37
# @Author  : tk

import numpy as np
from tqdm import tqdm
from fastdatasets.lmdb import DB,LMDB,load_dataset,WriterObject,DataType,StringWriter,JsonWriter,FeatureWriter,NumpyWriter

db_path = 'd:\\example_lmdb_numpy'

def test_write(db_path):
    options = DB.LmdbOptions(env_open_flag = 0,
                env_open_mode = 0o664, # octal file mode
                txn_flag = 0,
                dbi_flag = 0,
                put_flag = 0)

    f = NumpyWriter(db_path, options = options,map_size=1024 * 1024 * 1024)

    keys, values = [], []
    n = 30
    for i in range(n):
        train_node = {
            'image': np.random.rand(3, 4),
            'labels': np.random.randint(0, 21128, size=(10), dtype=np.int64),
            'bdata': np.asarray(b'11111111asdadasdasdaa')
        }
        keys.append('input{}'.format(i))
        values.append(train_node)
        if (i + 1) % 10000 == 0:
            f.put_batch(keys, values)
            keys.clear()
            values.clear()
    if len(keys):
        f.put_batch(keys, values)

    f.get_writer.put('total_num',str(n))
    f.close()



def test_random(db_path):
    options = DB.LmdbOptions(env_open_flag=DB.LmdbFlag.MDB_RDONLY,
                               env_open_mode=0o664,  # octal file mode
                               txn_flag=LMDB.LmdbFlag.MDB_RDONLY,
                               dbi_flag=0,
                               put_flag=0)
    dataset = load_dataset.RandomDataset(db_path,
                                        data_key_prefix_list=('input',),
                                        num_key='total_num',
                                        options = options)

    dataset = dataset.parse_from_numpy_writer().shuffle(10)
    print(len(dataset))
    for i in tqdm(range(len(dataset)), total=len(dataset)):
        d = dataset[i]
        print(d)

test_write(db_path)
test_random(db_path)
```



### 7. Arrow dataset


```python


from fastdatasets.arrow.writer import PythonWriter
from fastdatasets.arrow.dataset import load_dataset,arrow


path_file = 'd:/tmp/data.arrow'



with_stream = True
def test_write():
    fs = PythonWriter(path_file,
                        schema={'id': 'int32',
                                'text': 'str',
                                'map': 'map',
                                'map2': 'map_list'
                                },
                        with_stream=with_stream,
                        options=None)
    for i in range(2):
        data = {
            "id": list(range(i * 3,(i+ 1) * 3)),
            'text': ['asdasdasdas' + str(i) for i in range(3)],
            'map': [
                {"a": "aa1" + str(i), "b": "bb1", "c": "ccccccc"},
                {"a": "aa2", "b": "bb2", "c": "ccccccc"},
                {"a": "aa3", "b": "bb3", "c": "ccccccc"},
            ],
            'map2': [

                [
                    {"a": "11" + str(i), "b": "bb", "c": "ccccccc"},
                    {"a": "12", "b": "bb", "c": "ccccccc"},
                    {"a": "13", "b": "bb", "c": "ccccccc"},
                ],
                [
                    {"a": "21", "b": "bb", "c": "ccccccc"},
                    {"a": "22", "b": "bb", "c": "ccccccc"},
                ],
                [
                    {"a": "31", "b": "bb", "c": "ccccccc"},
                    {"a": "32", "b": "bb", "c": "ccccccc"},
                    {"a": "32", "b": "bb", "c": "ccccccc22222222222222"},
                ]
            ]
        }
        # fs.write_batch(data.keys(),data.values())
        status = fs.write_batch(data.keys(),data.values())
        assert status.ok(),status.message()


    fs.close()

def test_random():
    dataset = load_dataset.RandomDataset(path_file,with_share_memory=not with_stream)
    print('total', len(dataset))
    for i in range(len(dataset)):
        print(i,dataset[i])



def test_read_iter():
    dataset = load_dataset.IterableDataset(path_file,with_share_memory=not with_stream,batch_size=1)
    for d in dataset:
        print('iter',d)


test_write()

test_random()

test_read_iter()

```
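
The written file can also be cross-checked with pyarrow directly; a minimal sketch assuming `with_stream=True` produces the standard Arrow IPC streaming format (for the file format, `pa.ipc.open_file` would be used instead):

```python
import pyarrow as pa

# Read the Arrow IPC stream written above and print a small summary.
with pa.OSFile('d:/tmp/data.arrow', 'rb') as f:
    table = pa.ipc.open_stream(f).read_all()
print(table.num_rows, table.column_names)
```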

### 8. Parquet dataset

```python

from fastdatasets.parquet.writer import PythonWriter
from fastdatasets.parquet.dataset import load_dataset
from tfrecords.python.io.arrow import ParquetReader,arrow


path_file = 'd:/tmp/data.parquet'



def test_write():
    fs = PythonWriter(path_file,
                      schema={'id': 'int32',
                              'text': 'str',
                              'map': 'map',
                              'map2': 'map_list'
                              },
                        parquet_options=dict(write_batch_size = 10))
    for i in range(2):
        data = {
            "id": list(range(i * 3, (i + 1) * 3)),
            'text': ['asdasdasdas' + str(i) for i in range(3)],
            'map': [
                {"a": "aa1", "b": "bb1", "c": "ccccccc"},
                {"a": "aa2", "b": "bb2", "c": "ccccccc"},
                {"a": "aa3", "b": "bb3", "c": "ccccccc"},
            ],
            'map2': [

                [
                    {"a": "11", "b": "bb", "c": "ccccccc"},
                    {"a": "12", "b": "bb", "c": "ccccccc"},
                    {"a": "13", "b": "bb", "c": "ccccccc"},
                ],
                [
                    {"a": "21", "b": "bb", "c": "ccccccc"},
                    {"a": "22", "b": "bb", "c": "ccccccc"},
                ],
                [
                    {"a": "31", "b": "bb", "c": "ccccccc"},
                    {"a": "32", "b": "bb", "c": "ccccccc"},
                    {"a": "32", "b": "bb", "c": "ccccccc22222222222222"},
                ]
            ]
        }
        # fs.write_batch(data.keys(),data.values())
        fs.write_table(data.keys(),data.values())


    fs.close()

def test_random():
    dataset = load_dataset.RandomDataset(path_file)
    print('total', len(dataset))
    for i in range(len(dataset)):
        print(dataset[i])



def test_read_iter():
    dataset = load_dataset.IterableDataset(path_file,batch_size=1)
    for d in dataset:
        print('iter',d)


test_write()

test_random()

test_read_iter()

```
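
Likewise, the parquet output can be verified with pyarrow, assuming it is a standard Parquet file:

```python
import pyarrow.parquet as pq

# Load the Parquet file written above and print a small summary.
table = pq.read_table('d:/tmp/data.parquet')
print(table.num_rows, table.column_names)
```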

            
