pytorch-benchmark

Name: pytorch-benchmark
Version: 0.3.6
Home page: https://github.com/LukasHedegaard/pytorch-benchmark
Summary: Easily benchmark PyTorch model FLOPs, latency, throughput, max allocated memory and energy consumption in one go.
Upload time: 2023-08-10 07:04:47
Author: Lukas Hedegaard
Keywords: deep learning, pytorch, AI, benchmark, speed, energy, memory
Requirements: torch, ptflops, tqdm, numpy, psutil, gputil, py-cpuinfo, pyyaml
Travis-CI: no
Coveralls test coverage: no

# ⏱ pytorch-benchmark
__Easily benchmark model inference FLOPs, latency, throughput, max allocated memory and energy consumption__
<div align="left">
  <a href="https://pypi.org/project/pytorch-benchmark/">
    <img src="https://img.shields.io/pypi/pyversions/pytorch-benchmark" height="20" >
  </a>
  <a href="https://badge.fury.io/py/pytorch-benchmark">
    <img src="https://badge.fury.io/py/pytorch-benchmark.svg" height="20" >
  </a>
  <a href="https://pepy.tech/project/pytorch-benchmark">
    <img src="https://pepy.tech/badge/pytorch-benchmark" height="20">
  </a>
  <a href="https://www.codefactor.io/repository/github/lukashedegaard/pytorch-benchmark/overview/main">
    <img src="https://www.codefactor.io/repository/github/lukashedegaard/pytorch-benchmark/badge/main" alt="CodeFactor" />
  </a>
  <a href="https://opensource.org/licenses/Apache-2.0">
    <img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" height="20">
  </a>
  <a href="https://github.com/psf/black">
    <img src="https://img.shields.io/badge/code%20style-black-000000.svg" height="20">
  </a>
  <a href="https://codecov.io/gh/LukasHedegaard/pytorch-benchmark">
    <img src="https://codecov.io/gh/LukasHedegaard/pytorch-benchmark/branch/main/graph/badge.svg?token=B91XGSKSFJ"/>
  </a>
   <sup>*</sup>
</div>

###### \*Actual coverage is higher as GPU-related code is skipped by Codecov

## Install 
```bash
pip install pytorch-benchmark
```

## Usage 
```python
import torch
from torchvision.models import efficientnet_b0
from pytorch_benchmark import benchmark


model = efficientnet_b0().to("cpu")  # Model device sets benchmarking device
sample = torch.randn(8, 3, 224, 224)  # (B, C, H, W)
results = benchmark(model, sample, num_runs=100)
```
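
The returned `results` can then be inspected or persisted. A minimal sketch, assuming `results` is a plain nested dict of built-in types (as the sample output below suggests), using PyYAML, which is already a dependency of the package:

```python
import yaml  # PyYAML is installed as a dependency of pytorch-benchmark

# Pretty-print the nested results; the keys mirror the sample output below
print(yaml.dump(results, sort_keys=True))

# Persist them for later comparison across machines or model revisions
with open("efficientnet_b0_benchmark.yaml", "w") as f:  # hypothetical file name
    yaml.dump(results, f)
```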

### Sample results 💻
<details>
  <summary>Macbook Pro (16-inch, 2019), 2.6 GHz 6-Core Intel Core i7</summary>
  
  ```
  device: cpu
  flops: 401669732
  machine_info:
    cpu:
      architecture: x86_64
      cores:
        physical: 6
        total: 12
      frequency: 2.60 GHz
      model: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
    gpus: null
    memory:
      available: 5.86 GB
      total: 16.00 GB
      used: 7.29 GB
    system:
      node: d40049
      release: 21.2.0
      system: Darwin
  params: 5288548
  timing:
    batch_size_1:
      on_device_inference:
        human_readable:
          batch_latency: 74.439 ms +/- 6.459 ms [64.604 ms, 96.681 ms]
          batches_per_second: 13.53 +/- 1.09 [10.34, 15.48]
        metrics:
          batches_per_second_max: 15.478907181264278
          batches_per_second_mean: 13.528026359855625
          batches_per_second_min: 10.343281300091244
          batches_per_second_std: 1.0922382209314958
          seconds_per_batch_max: 0.09668111801147461
          seconds_per_batch_mean: 0.07443853378295899
          seconds_per_batch_min: 0.06460404396057129
          seconds_per_batch_std: 0.006458734193132054
    batch_size_8:
      on_device_inference:
        human_readable:
          batch_latency: 509.410 ms +/- 30.031 ms [405.296 ms, 621.773 ms]
          batches_per_second: 1.97 +/- 0.11 [1.61, 2.47]
        metrics:
          batches_per_second_max: 2.4673319862230025
          batches_per_second_mean: 1.9696935126370148
          batches_per_second_min: 1.6083039834656554
          batches_per_second_std: 0.11341204895590185
          seconds_per_batch_max: 0.6217730045318604
          seconds_per_batch_mean: 0.509410228729248
          seconds_per_batch_min: 0.40529608726501465
          seconds_per_batch_std: 0.030031445467788704
  ```
</details>

<details>
  <summary>Server with NVIDIA GeForce RTX 2080 and Intel Xeon 2.10GHz CPU</summary>
  
  ```
  device: cuda
  flops: 401669732
  machine_info:
    cpu:
      architecture: x86_64
      cores:
        physical: 16
        total: 32
      frequency: 3.00 GHz
      model: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
    gpus:
    - memory: 8192.0 MB
      name: NVIDIA GeForce RTX 2080
    - memory: 8192.0 MB
      name: NVIDIA GeForce RTX 2080
    - memory: 8192.0 MB
      name: NVIDIA GeForce RTX 2080
    - memory: 8192.0 MB
      name: NVIDIA GeForce RTX 2080
    memory:
      available: 119.98 GB
      total: 125.78 GB
      used: 4.78 GB
    system:
      node: monster
      release: 4.15.0-167-generic
      system: Linux
  max_inference_memory: 736250368
  params: 5288548
  post_inference_memory: 21402112
  pre_inference_memory: 21402112
  timing:
    batch_size_1:
      cpu_to_gpu:
        human_readable:
          batch_latency: "144.815 \xB5s +/- 16.103 \xB5s [136.614 \xB5s, 272.751 \xB5\
            s]"
          batches_per_second: 6.96 K +/- 535.06 [3.67 K, 7.32 K]
        metrics:
          batches_per_second_max: 7319.902268760908
          batches_per_second_mean: 6962.865857677197
          batches_per_second_min: 3666.3496503496503
          batches_per_second_std: 535.0581873859935
          seconds_per_batch_max: 0.0002727508544921875
          seconds_per_batch_mean: 0.00014481544494628906
          seconds_per_batch_min: 0.0001366138458251953
          seconds_per_batch_std: 1.6102982159292097e-05
      gpu_to_cpu:
        human_readable:
          batch_latency: "106.168 \xB5s +/- 17.829 \xB5s [53.167 \xB5s, 248.909 \xB5\
            s]"
          batches_per_second: 9.64 K +/- 1.60 K [4.02 K, 18.81 K]
        metrics:
          batches_per_second_max: 18808.538116591928
          batches_per_second_mean: 9639.942102368092
          batches_per_second_min: 4017.532567049808
          batches_per_second_std: 1595.7983033708472
          seconds_per_batch_max: 0.00024890899658203125
          seconds_per_batch_mean: 0.00010616779327392578
          seconds_per_batch_min: 5.316734313964844e-05
          seconds_per_batch_std: 1.7829135190772566e-05
      on_device_inference:
        human_readable:
          batch_latency: "15.567 ms +/- 546.154 \xB5s [15.311 ms, 19.261 ms]"
          batches_per_second: 64.31 +/- 1.96 [51.92, 65.31]
        metrics:
          batches_per_second_max: 65.31149174711928
          batches_per_second_mean: 64.30692850265713
          batches_per_second_min: 51.918698784442846
          batches_per_second_std: 1.9599322351815833
          seconds_per_batch_max: 0.019260883331298828
          seconds_per_batch_mean: 0.015567030906677246
          seconds_per_batch_min: 0.015311241149902344
          seconds_per_batch_std: 0.0005461537255227954
      total:
        human_readable:
          batch_latency: "15.818 ms +/- 549.873 \xB5s [15.561 ms, 19.461 ms]"
          batches_per_second: 63.29 +/- 1.92 [51.38, 64.26]
        metrics:
          batches_per_second_max: 64.26476266356143
          batches_per_second_mean: 63.28565696640637
          batches_per_second_min: 51.38378232692614
          batches_per_second_std: 1.9198343850767468
          seconds_per_batch_max: 0.019461393356323242
          seconds_per_batch_mean: 0.01581801414489746
          seconds_per_batch_min: 0.015560626983642578
          seconds_per_batch_std: 0.0005498731526138171
    batch_size_8:
      cpu_to_gpu:
        human_readable:
          batch_latency: "805.674 \xB5s +/- 157.254 \xB5s [773.191 \xB5s, 2.303 ms]"
          batches_per_second: 1.26 K +/- 97.51 [434.24, 1.29 K]
        metrics:
          batches_per_second_max: 1293.3407338883749
          batches_per_second_mean: 1259.5653105357776
          batches_per_second_min: 434.23791282741485
          batches_per_second_std: 97.51424036939879
          seconds_per_batch_max: 0.002302885055541992
          seconds_per_batch_mean: 0.000805673599243164
          seconds_per_batch_min: 0.0007731914520263672
          seconds_per_batch_std: 0.0001572538140613121
      gpu_to_cpu:
        human_readable:
          batch_latency: "104.215 \xB5s +/- 12.658 \xB5s [59.605 \xB5s, 128.031 \xB5\
            s]"
          batches_per_second: 9.81 K +/- 1.76 K [7.81 K, 16.78 K]
        metrics:
          batches_per_second_max: 16777.216
          batches_per_second_mean: 9806.840626578907
          batches_per_second_min: 7810.621973929236
          batches_per_second_std: 1761.6008872740726
          seconds_per_batch_max: 0.00012803077697753906
          seconds_per_batch_mean: 0.00010421514511108399
          seconds_per_batch_min: 5.9604644775390625e-05
          seconds_per_batch_std: 1.2658293070174213e-05
      on_device_inference:
        human_readable:
          batch_latency: "16.623 ms +/- 759.017 \xB5s [16.301 ms, 22.584 ms]"
          batches_per_second: 60.26 +/- 2.22 [44.28, 61.35]
        metrics:
          batches_per_second_max: 61.346243290283894
          batches_per_second_mean: 60.25881046175457
          batches_per_second_min: 44.27827629162004
          batches_per_second_std: 2.2193085956672296
          seconds_per_batch_max: 0.02258443832397461
          seconds_per_batch_mean: 0.01662288188934326
          seconds_per_batch_min: 0.01630091667175293
          seconds_per_batch_std: 0.0007590167680596548
      total:
        human_readable:
          batch_latency: "17.533 ms +/- 836.015 \xB5s [17.193 ms, 23.896 ms]"
          batches_per_second: 57.14 +/- 2.20 [41.85, 58.16]
        metrics:
          batches_per_second_max: 58.16374528511205
          batches_per_second_mean: 57.140338855126565
          batches_per_second_min: 41.84762740950632
          batches_per_second_std: 2.1985066663972677
          seconds_per_batch_max: 0.023896217346191406
          seconds_per_batch_mean: 0.01753277063369751
          seconds_per_batch_min: 0.017192840576171875
          seconds_per_batch_std: 0.0008360147274630088
  ```
</details>

... Your turn

## How we benchmark
The overall flow can be summarized with the diagram shown below (best viewed on GitHub):
```mermaid
flowchart TB;
    A([Start]) --> B
    B(prepare_samples)
    B --> C[get_machine_info]
    C --> D[measure_params]
    D --> E[warm_up, batch_size=1]
    E --> F[measure_flops]
    
    subgraph SG[Repeat for batch_size 1 and x]
        direction TB
        G[measure_allocated_memory]
        G --> H[warm_up, given batch_size]
        H --> I[measure_detailed_inference_timing]
        I --> J[measure_repeated_inference_timing]
        J --> K[measure_energy]
    end

    F --> SG
    SG --> END([End])
```
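
For intuition, the warm-up and repeated-timing steps boil down to discarding a few initial forward passes and then timing many more under `torch.no_grad()`. A simplified sketch of that idea (not the library's internal code; the GPU path additionally needs `torch.cuda.synchronize()` around each call):

```python
import time
import torch

@torch.no_grad()
def simple_inference_timing(model, sample, num_runs=100, num_warmup=10):
    """Rough stand-in for warm_up + measure_repeated_inference_timing (CPU case)."""
    model.eval()
    for _ in range(num_warmup):   # warm-up runs are discarded
        model(sample)
    latencies = []
    for _ in range(num_runs):     # timed runs
        start = time.perf_counter()
        model(sample)
        latencies.append(time.perf_counter() - start)
    mean = sum(latencies) / len(latencies)
    return {
        "seconds_per_batch_mean": mean,
        "seconds_per_batch_min": min(latencies),
        "seconds_per_batch_max": max(latencies),
        "batches_per_second_mean": 1.0 / mean,
    }
```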

Usually, the sample and model don't reside on the same device initially (e.g., a GPU holds the model while the sample is on CPU after being loaded from disk or collected as live data). Accordingly, we measure timing in three parts: `cpu_to_gpu`, `on_device_inference`, and `gpu_to_cpu`, as well as their sum, `total`. Note that the device the model's parameters reside on determines the execution device. The inference flow is shown below:

```mermaid
flowchart LR;
    A([sample])
    A --> B[cpu -> gpu]
    B --> C[model __call__]
    C --> D[gpu -> cpu]
    D --> E([result])
```
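
Measured by hand, the three parts look roughly like this (a sketch assuming the model has already been moved to CUDA; `torch.cuda.synchronize()` is required so the host-side timer reflects the actual kernel durations):

```python
import time
import torch

def timed(fn):
    """Run fn once and return (output, elapsed_seconds), synchronising the GPU."""
    torch.cuda.synchronize()
    start = time.perf_counter()
    out = fn()
    torch.cuda.synchronize()
    return out, time.perf_counter() - start

sample_cpu = torch.randn(8, 3, 224, 224)
with torch.no_grad():
    gpu_sample, t_cpu_to_gpu = timed(lambda: sample_cpu.to("cuda"))
    gpu_result, t_on_device = timed(lambda: model(gpu_sample))   # assumes model.to("cuda")
    cpu_result, t_gpu_to_cpu = timed(lambda: gpu_result.to("cpu"))
t_total = t_cpu_to_gpu + t_on_device + t_gpu_to_cpu
```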

## Advanced use
Trying to benchmark a custom class that is not a `torch.nn.Module`?
You can pass custom functions to `benchmark` as seen in [this example](tests/test_custom_class.py).
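
If the custom object ultimately wraps a PyTorch forward pass, another option is a thin `torch.nn.Module` adapter around it. This is a generic sketch with a hypothetical `MyPipeline.predict` method, not the library's custom-function API; FLOPs and parameter counts may still be unavailable (see Limitations below):

```python
import torch

class CallableAdapter(torch.nn.Module):
    """Wrap an arbitrary callable so it can be benchmarked like a regular module."""

    def __init__(self, pipeline):
        super().__init__()
        self.pipeline = pipeline  # e.g. a hypothetical MyPipeline with a .predict(tensor) method

    def forward(self, x):
        return self.pipeline.predict(x)

# adapted = CallableAdapter(MyPipeline()).to("cpu")
# results = benchmark(adapted, sample, num_runs=100)
```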


## Limitations
- Allocated memory measurements are only available on CUDA devices (a do-it-yourself sketch follows this list).
- Energy consumption can only be measured on NVIDIA Jetson platforms at the moment.
- FLOPs and parameter counts are not supported for custom classes.
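
Regarding the memory limitation: on CUDA devices, PyTorch itself exposes allocation counters, and peak usage around a forward pass can be measured along these lines (an illustration of the idea, not the library's internal code):

```python
import torch

def peak_inference_memory(model, sample, device="cuda"):
    """Return the peak number of bytes allocated on the GPU during one forward pass."""
    model = model.to(device).eval()
    sample = sample.to(device)
    torch.cuda.reset_peak_memory_stats(device)
    with torch.no_grad():
        model(sample)
    return torch.cuda.max_memory_allocated(device)
```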


## Acknowledgement
This work has received funding from the European Union’s Horizon 2020 research and innovation programme under grant agreement No 871449 (OpenDR).
It was developed to benchmark tools in [OpenDR](https://github.com/opendr-eu/opendr), a non-proprietary toolkit for deep learning-based functionality in robotics and vision.


## Citation
If you like the tool and use it in research, please consider citing it:
```bibtex
@software{hedegaard2022pytorchbenchmark,
  author = {Hedegaard, Lukas},
  doi = {10.5281/zenodo.7223585},
  month = {10},
  title = {{PyTorch-Benchmark}},
  version = {0.3.5},
  year = {2022}
}
```




Raw data

            {
    "_id": null,
    "home_page": "https://github.com/LukasHedegaard/pytorch-benchmark",
    "name": "pytorch-benchmark",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "deep learning,pytorch,AI,benchmark,speed,energy,memory",
    "author": "Lukas Hedegaard",
    "author_email": "lukasxhedegaard@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/91/9e/597879f4df381ae4e8cc2bb02c7ec6e0dc4a3f226fd967e07a6a4e29c667/pytorch-benchmark-0.3.6.tar.gz",
    "platform": null,
    "description": "# \u23f1 pytorch-benchmark\n__Easily benchmark model inference FLOPs, latency, throughput, max allocated memory and energy consumption__\n<div align=\"left\">\n  <a href=\"https://pypi.org/project/pytorch-benchmark/\">\n    <img src=\"https://img.shields.io/pypi/pyversions/pytorch-benchmark\" height=\"20\" >\n  </a>\n  <a href=\"https://badge.fury.io/py/pytorch-benchmark\">\n    <img src=\"https://badge.fury.io/py/pytorch-benchmark.svg\" height=\"20\" >\n  </a>\n  <a href=\"https://pepy.tech/project/pytorch-benchmark\">\n    <img src=\"https://pepy.tech/badge/pytorch-benchmark\" height=\"20\">\n  </a>\n  <a href=\"https://www.codefactor.io/repository/github/lukashedegaard/pytorch-benchmark/overview/main\">\n    <img src=\"https://www.codefactor.io/repository/github/lukashedegaard/pytorch-benchmark/badge/main\" alt=\"CodeFactor\" />\n  </a>\n  <a href=\"https://opensource.org/licenses/Apache-2.0\">\n    <img src=\"https://img.shields.io/badge/License-Apache%202.0-blue.svg\" height=\"20\">\n  </a>\n  <a href=\"https://github.com/psf/black\">\n    <img src=\"https://img.shields.io/badge/code%20style-black-000000.svg\" height=\"20\">\n  </a>\n  <a href=\"https://codecov.io/gh/LukasHedegaard/pytorch-benchmark\">\n    <img src=\"https://codecov.io/gh/LukasHedegaard/pytorch-benchmark/branch/main/graph/badge.svg?token=B91XGSKSFJ\"/>\n  </a>\n   <sup>*</sup>\n</div>\n\n###### \\*Actual coverage is higher as GPU-related code is skipped by Codecov\n\n## Install \n```bash\npip install pytorch-benchmark\n```\n\n## Usage \n```python\nimport torch\nfrom torchvision.models import efficientnet_b0\nfrom pytorch_benchmark import benchmark\n\n\nmodel = efficientnet_b0().to(\"cpu\")  # Model device sets benchmarking device\nsample = torch.randn(8, 3, 224, 224)  # (B, C, H, W)\nresults = benchmark(model, sample, num_runs=100)\n```\n\n### Sample results \ud83d\udcbb\n<details>\n  <summary>Macbook Pro (16-inch, 2019), 2.6 GHz 6-Core Intel Core i7</summary>\n  \n  ```\n  device: cpu\n  flops: 401669732\n  machine_info:\n    cpu:\n      architecture: x86_64\n      cores:\n        physical: 6\n        total: 12\n      frequency: 2.60 GHz\n      model: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz\n    gpus: null\n    memory:\n      available: 5.86 GB\n      total: 16.00 GB\n      used: 7.29 GB\n    system:\n      node: d40049\n      release: 21.2.0\n      system: Darwin\n  params: 5288548\n  timing:\n    batch_size_1:\n      on_device_inference:\n        human_readable:\n          batch_latency: 74.439 ms +/- 6.459 ms [64.604 ms, 96.681 ms]\n          batches_per_second: 13.53 +/- 1.09 [10.34, 15.48]\n        metrics:\n          batches_per_second_max: 15.478907181264278\n          batches_per_second_mean: 13.528026359855625\n          batches_per_second_min: 10.343281300091244\n          batches_per_second_std: 1.0922382209314958\n          seconds_per_batch_max: 0.09668111801147461\n          seconds_per_batch_mean: 0.07443853378295899\n          seconds_per_batch_min: 0.06460404396057129\n          seconds_per_batch_std: 0.006458734193132054\n    batch_size_8:\n      on_device_inference:\n        human_readable:\n          batch_latency: 509.410 ms +/- 30.031 ms [405.296 ms, 621.773 ms]\n          batches_per_second: 1.97 +/- 0.11 [1.61, 2.47]\n        metrics:\n          batches_per_second_max: 2.4673319862230025\n          batches_per_second_mean: 1.9696935126370148\n          batches_per_second_min: 1.6083039834656554\n          batches_per_second_std: 0.11341204895590185\n          
seconds_per_batch_max: 0.6217730045318604\n          seconds_per_batch_mean: 0.509410228729248\n          seconds_per_batch_min: 0.40529608726501465\n          seconds_per_batch_std: 0.030031445467788704\n  ```\n</details>\n\n<details>\n  <summary>Server with NVIDIA GeForce RTX 2080 and Intel Xeon 2.10GHz CPU</summary>\n  \n  ```\n  device: cuda\n  flops: 401669732\n  machine_info:\n    cpu:\n      architecture: x86_64\n      cores:\n        physical: 16\n        total: 32\n      frequency: 3.00 GHz\n      model: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz\n    gpus:\n    - memory: 8192.0 MB\n      name: NVIDIA GeForce RTX 2080\n    - memory: 8192.0 MB\n      name: NVIDIA GeForce RTX 2080\n    - memory: 8192.0 MB\n      name: NVIDIA GeForce RTX 2080\n    - memory: 8192.0 MB\n      name: NVIDIA GeForce RTX 2080\n    memory:\n      available: 119.98 GB\n      total: 125.78 GB\n      used: 4.78 GB\n    system:\n      node: monster\n      release: 4.15.0-167-generic\n      system: Linux\n  max_inference_memory: 736250368\n  params: 5288548\n  post_inference_memory: 21402112\n  pre_inference_memory: 21402112\n  timing:\n    batch_size_1:\n      cpu_to_gpu:\n        human_readable:\n          batch_latency: \"144.815 \\xB5s +/- 16.103 \\xB5s [136.614 \\xB5s, 272.751 \\xB5\\\n            s]\"\n          batches_per_second: 6.96 K +/- 535.06 [3.67 K, 7.32 K]\n        metrics:\n          batches_per_second_max: 7319.902268760908\n          batches_per_second_mean: 6962.865857677197\n          batches_per_second_min: 3666.3496503496503\n          batches_per_second_std: 535.0581873859935\n          seconds_per_batch_max: 0.0002727508544921875\n          seconds_per_batch_mean: 0.00014481544494628906\n          seconds_per_batch_min: 0.0001366138458251953\n          seconds_per_batch_std: 1.6102982159292097e-05\n      gpu_to_cpu:\n        human_readable:\n          batch_latency: \"106.168 \\xB5s +/- 17.829 \\xB5s [53.167 \\xB5s, 248.909 \\xB5\\\n            s]\"\n          batches_per_second: 9.64 K +/- 1.60 K [4.02 K, 18.81 K]\n        metrics:\n          batches_per_second_max: 18808.538116591928\n          batches_per_second_mean: 9639.942102368092\n          batches_per_second_min: 4017.532567049808\n          batches_per_second_std: 1595.7983033708472\n          seconds_per_batch_max: 0.00024890899658203125\n          seconds_per_batch_mean: 0.00010616779327392578\n          seconds_per_batch_min: 5.316734313964844e-05\n          seconds_per_batch_std: 1.7829135190772566e-05\n      on_device_inference:\n        human_readable:\n          batch_latency: \"15.567 ms +/- 546.154 \\xB5s [15.311 ms, 19.261 ms]\"\n          batches_per_second: 64.31 +/- 1.96 [51.92, 65.31]\n        metrics:\n          batches_per_second_max: 65.31149174711928\n          batches_per_second_mean: 64.30692850265713\n          batches_per_second_min: 51.918698784442846\n          batches_per_second_std: 1.9599322351815833\n          seconds_per_batch_max: 0.019260883331298828\n          seconds_per_batch_mean: 0.015567030906677246\n          seconds_per_batch_min: 0.015311241149902344\n          seconds_per_batch_std: 0.0005461537255227954\n      total:\n        human_readable:\n          batch_latency: \"15.818 ms +/- 549.873 \\xB5s [15.561 ms, 19.461 ms]\"\n          batches_per_second: 63.29 +/- 1.92 [51.38, 64.26]\n        metrics:\n          batches_per_second_max: 64.26476266356143\n          batches_per_second_mean: 63.28565696640637\n          batches_per_second_min: 51.38378232692614\n          
batches_per_second_std: 1.9198343850767468\n          seconds_per_batch_max: 0.019461393356323242\n          seconds_per_batch_mean: 0.01581801414489746\n          seconds_per_batch_min: 0.015560626983642578\n          seconds_per_batch_std: 0.0005498731526138171\n    batch_size_8:\n      cpu_to_gpu:\n        human_readable:\n          batch_latency: \"805.674 \\xB5s +/- 157.254 \\xB5s [773.191 \\xB5s, 2.303 ms]\"\n          batches_per_second: 1.26 K +/- 97.51 [434.24, 1.29 K]\n        metrics:\n          batches_per_second_max: 1293.3407338883749\n          batches_per_second_mean: 1259.5653105357776\n          batches_per_second_min: 434.23791282741485\n          batches_per_second_std: 97.51424036939879\n          seconds_per_batch_max: 0.002302885055541992\n          seconds_per_batch_mean: 0.000805673599243164\n          seconds_per_batch_min: 0.0007731914520263672\n          seconds_per_batch_std: 0.0001572538140613121\n      gpu_to_cpu:\n        human_readable:\n          batch_latency: \"104.215 \\xB5s +/- 12.658 \\xB5s [59.605 \\xB5s, 128.031 \\xB5\\\n            s]\"\n          batches_per_second: 9.81 K +/- 1.76 K [7.81 K, 16.78 K]\n        metrics:\n          batches_per_second_max: 16777.216\n          batches_per_second_mean: 9806.840626578907\n          batches_per_second_min: 7810.621973929236\n          batches_per_second_std: 1761.6008872740726\n          seconds_per_batch_max: 0.00012803077697753906\n          seconds_per_batch_mean: 0.00010421514511108399\n          seconds_per_batch_min: 5.9604644775390625e-05\n          seconds_per_batch_std: 1.2658293070174213e-05\n      on_device_inference:\n        human_readable:\n          batch_latency: \"16.623 ms +/- 759.017 \\xB5s [16.301 ms, 22.584 ms]\"\n          batches_per_second: 60.26 +/- 2.22 [44.28, 61.35]\n        metrics:\n          batches_per_second_max: 61.346243290283894\n          batches_per_second_mean: 60.25881046175457\n          batches_per_second_min: 44.27827629162004\n          batches_per_second_std: 2.2193085956672296\n          seconds_per_batch_max: 0.02258443832397461\n          seconds_per_batch_mean: 0.01662288188934326\n          seconds_per_batch_min: 0.01630091667175293\n          seconds_per_batch_std: 0.0007590167680596548\n      total:\n        human_readable:\n          batch_latency: \"17.533 ms +/- 836.015 \\xB5s [17.193 ms, 23.896 ms]\"\n          batches_per_second: 57.14 +/- 2.20 [41.85, 58.16]\n        metrics:\n          batches_per_second_max: 58.16374528511205\n          batches_per_second_mean: 57.140338855126565\n          batches_per_second_min: 41.84762740950632\n          batches_per_second_std: 2.1985066663972677\n          seconds_per_batch_max: 0.023896217346191406\n          seconds_per_batch_mean: 0.01753277063369751\n          seconds_per_batch_min: 0.017192840576171875\n          seconds_per_batch_std: 0.0008360147274630088\n  ```\n</details>\n\n... 
Your turn\n\n## How we benchmark\nThe overall flow can be summarized with the diagram shown below (best viewed on GitHub):\n```mermaid\nflowchart TB;\n    A([Start]) --> B\n    B(prepare_samples)\n    B --> C[get_machine_info]\n    C --> D[measure_params]\n    D --> E[warm_up, batch_size=1]\n    E --> F[measure_flops]\n    \n    subgraph SG[Repeat for batch_size 1 and x]\n        direction TB\n        G[measure_allocated_memory]\n        G --> H[warm_up, given batch_size]\n        H --> I[measure_detailed_inference_timing]\n        I --> J[measure_repeated_inference_timing]\n        J --> K[measure_energy]\n    end\n\n    F --> SG\n    SG --> END([End])\n```\n\nUsually, the sample and model don't reside on the same device initially (e.g., a GPU holds the model while the sample is on CPU after being loaded from disk or collected as live data). Accordingly, we measure timing in three parts: `cpu_to_gpu`, `on_device_inference`, and `gpu_to_cpu`, as well as a sum of the three, `total`. Note that the `model.device()` determines the execution device. The inference flow is shown below:\n\n```mermaid\nflowchart LR;\n    A([sample])\n    A --> B[cpu -> gpu]\n    B --> C[model __call__]\n    C --> D[gpu -> cpu]\n    D --> E([result])\n```\n\n## Advanced use\nTrying to benchmark a custom class, which is not a `torch.nn.Module`?\nYou can pass custom functions to `benchmark` as seen in [this example](tests/test_custom_class.py).\n\n\n## Limitations\n- Allocated memory measurements are only available on CUDA devices.\n- Energy consumption can only be measured on NVIDIA Jetson platforms at the moment.\n- FLOPs and parameter count is not support for custom classes.\n\n\n## Acknowledgement\nThis work has received funding from the European Union\u2019s Horizon 2020 research and innovation programme under grant agreement No 871449 (OpenDR).\nIt was developed for benchmarking tools in [OpenDR](https://github.com/opendr-eu/opendr), a non-proprietary toolkit for deep learning based functionalities for robotics and vision.\n\n\n## Citation\nIf you like the tool and use it in research, please consider citing it:\n```bibtex\n@software{hedegaard2022pytorchbenchmark,\n  author = {Hedegaard, Lukas},\n  doi = {10.5281/zenodo.7223585},\n  month = {10},\n  title = {{PyTorch-Benchmark}},\n  version = {0.3.5},\n  year = {2022}\n}\n```\n\n\n",
    "bugtrack_url": null,
    "license": "",
    "summary": "Easily benchmark PyTorch model FLOPs, latency, throughput, max allocated memory and energy consumption in one go.",
    "version": "0.3.6",
    "project_urls": {
        "Homepage": "https://github.com/LukasHedegaard/pytorch-benchmark"
    },
    "split_keywords": [
        "deep learning",
        "pytorch",
        "ai",
        "benchmark",
        "speed",
        "energy",
        "memory"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "e97739d9fd682f57f2be5224139d4268a1c42ced1670ae469206caa0ce3de5bf",
                "md5": "452bed1a2d73d7e102cebca0824e8a4d",
                "sha256": "2b9534c1cd2bc583a03df1c70051375310d714165c189507e297f05362ae4ea7"
            },
            "downloads": -1,
            "filename": "pytorch_benchmark-0.3.6-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "452bed1a2d73d7e102cebca0824e8a4d",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 16703,
            "upload_time": "2023-08-10T07:04:45",
            "upload_time_iso_8601": "2023-08-10T07:04:45.932905Z",
            "url": "https://files.pythonhosted.org/packages/e9/77/39d9fd682f57f2be5224139d4268a1c42ced1670ae469206caa0ce3de5bf/pytorch_benchmark-0.3.6-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "919e597879f4df381ae4e8cc2bb02c7ec6e0dc4a3f226fd967e07a6a4e29c667",
                "md5": "5e3921ffa20c1210235aed2bc22dc718",
                "sha256": "1f36c179096cc1b5d4f9c7e176578f64582b7bfe248e84b031c6e955b80a0e12"
            },
            "downloads": -1,
            "filename": "pytorch-benchmark-0.3.6.tar.gz",
            "has_sig": false,
            "md5_digest": "5e3921ffa20c1210235aed2bc22dc718",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 20310,
            "upload_time": "2023-08-10T07:04:47",
            "upload_time_iso_8601": "2023-08-10T07:04:47.895857Z",
            "url": "https://files.pythonhosted.org/packages/91/9e/597879f4df381ae4e8cc2bb02c7ec6e0dc4a3f226fd967e07a6a4e29c667/pytorch-benchmark-0.3.6.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-08-10 07:04:47",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "LukasHedegaard",
    "github_project": "pytorch-benchmark",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": true,
    "requirements": [
        {
            "name": "torch",
            "specs": [
                [
                    ">=",
                    "1.6"
                ]
            ]
        },
        {
            "name": "ptflops",
            "specs": [
                [
                    "~=",
                    "0.6.8"
                ]
            ]
        },
        {
            "name": "tqdm",
            "specs": [
                [
                    "~=",
                    "4.62"
                ]
            ]
        },
        {
            "name": "numpy",
            "specs": []
        },
        {
            "name": "psutil",
            "specs": [
                [
                    ">=",
                    "5.9"
                ]
            ]
        },
        {
            "name": "gputil",
            "specs": [
                [
                    ">=",
                    "1.4"
                ]
            ]
        },
        {
            "name": "py-cpuinfo",
            "specs": [
                [
                    ">=",
                    "7.0"
                ]
            ]
        },
        {
            "name": "pyyaml",
            "specs": [
                [
                    ">=",
                    "6.0"
                ]
            ]
        }
    ],
    "lcname": "pytorch-benchmark"
}