minRL


Name: minRL (JSON)
Version: 0.0.3 (PyPI version JSON)
Download stats: see below
Home page: None
Summary: Deep Reinforcement Learning for minimalists.
Upload time: 2024-10-21 08:12:29
Maintainer: None
Docs URL: None
Author: None
Requires Python: >=3.8
License: None
Keywords: None
VCS: see project URLs
Bugtrack URL: None
Requirements: No requirements were recorded.
Travis-CI: No Travis.
Coveralls test coverage: No coveralls.
            # minRL: Deep Reinforcement Learning for `minimalists`

```cmd
pip install minRL
```

```py
import gymnasium as gym

from minRL.ppo.PPOClip import PPOClip
from minRL.utils.nn_utils import mlp
from minRL.vpg.VanillaPG import VanillaPG

env = gym.make("CartPole-v1")
a_space = env.action_space
discrete = isinstance(a_space, gym.spaces.Discrete)
a_dim = a_space.n if discrete else a_space.shape[0]
o_dim = env.observation_space.shape[0]
pi_net = mlp([o_dim, 64, 64, a_dim])
V_net = mlp([o_dim, 64, 64, 1])

UsedPG = VanillaPG if 0 else PPOClip

pg = UsedPG(discrete, a_dim, pi_net, V_net, pi_lr=2e-3)
for e in range(100):
    pg.train_once(pg.get_D_from_env(env))

```

## Intro

- I am **starting** (`2024-10-19`) to learn RL following [OpenAI SpinningUp](https://github.com/openai/spinningup)

- As I dig into [the code](https://github.com/openai/spinningup/tree/master/spinup/algos/pytorch/vpg), I find it quite **verbose**, and found [some confusing places (maybe bugs?)](https://github.com/openai/spinningup/issues/424)

- Coming from a Physics background, I love **minimalism** and **simplicity**... hence I made this repo

## Example comparison: Vanilla Policy Gradient

- `VPG` by OpenAI SpinningUp:
    - [135 lines @ core.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/vpg/core.py)
        - Contains `combined_shape, mlp, count_vars, discount_cumsum, Actor, MLPCategoricalActor, MLPGaussianActor, MLPCritic, MLPActorCritic`
    - [350 lines @ vpg.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/vpg/vpg.py)
        - Contains `VPGBuffer, vpg`
    - Highly nested classes: `actor-critic -> actor & critic -> mlp`
    - Neural Network models coupled with RL algorithms
    - Key equations scattered all over the place
    - Highly coupled with `gym` interface

- `VPG` I wrote:
    - [27 lines @ nn_utils.py](./src/minRL/utils/nn_utils.py)
        - Contains `discount_cum_sum, tensor, mlp`
    - [149 lines @ VanillaPG.py](./src/minRL/vpg/VanillaPG.py)
        - Contains `VanillaPG`
    - No highly nested classes
    - Neural Network models **decoupled** from RL algorithms
    - All key equations in one place, following the [pseudo-code below](https://spinningup.openai.com/en/latest/algorithms/vpg.html#pseudocode)
    - **Decoupled** from the `gym` interface
        - This is the most important feature for the RL problem I am interested in, as it avoids back-and-forth interaction with the environment from the RL code, but instead only interacts with Replay Buffer data `D`
        - An optional util function `VanillaPG.get_D_from_env` is provided.
    - Correctness verified by [test_pg.py](./src/test_pg.py)

![](https://spinningup.openai.com/en/latest/_images/math/262538f3077a7be8ce89066abbab523575132996.svg)

```py
class VanillaPG:
    def train_once(s, D: BUFFER_TYPE):
        # requires: r, ended, V, V_next, o, a, logp
        D = s.find_R_and_A(D)
        s.update_pi(D)
        s.update_V(D)

    def find_R_and_A(s, D: BUFFER_TYPE):
        # 4, 5: compute R and A
        for k in ["R", "A"]:
            D[k] = np.zeros_like(D["r"])
        start, N = 0, len(D["r"])
        for i in range(N):
            if D["ended"][i]:
                V_next = D["V_next"][i]
                slc = slice(start, i + 1)
                r = np.append(D["r"][slc], V_next)
                V = np.append(D["V"][slc], V_next)
                # GAE-lambda advantage estimation
                A = r[:-1] + s.gam * V[1:] - V[:-1]
                D["A"][slc] = discount_cum_sum(A, s.gam * s.lam)
                D["R"][slc] = discount_cum_sum(r, s.gam)[:-1]
                start = i + 1
        D["A"] = mpi.normalize(D["A"])
        return {k: tensor(v) for k, v in D.items()}

    def update_pi(s, D: BUFFER_TYPE):
        # 6, 7: estimate pg and optimize
        o, a, A = D["o"], D["a"], D["A"]
        s.pi_opt.zero_grad()
        pi = s.get_pi(o)
        logp = s.get_logp(pi, a)
        loss = -(logp * A).mean()
        loss.backward()
        mpi.avg_grads(s.pi_params)
        s.pi_opt.step()

    def update_V(s, D: BUFFER_TYPE):
        # 8: fit V
        o, R = D["o"], D["R"]
        for _ in range(s.V_iters):
            s.V_opt.zero_grad()
            loss = ((s.get_V(o) - R) ** 2).mean()
            loss.backward()
            mpi.avg_grads(s.V_params)
            s.V_opt.step()
```

## Example comparison: PPO Clip

- `PPO Clip` by OpenAI SpinningUp:
    - [135 lines @ core.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/core.py)
    - [378 lines @ ppo.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/ppo.py)
- `PPO Clip` I wrote:
    - [32 lines @ PPOClip.py](./src/minRL/ppo/PPOClip.py)

![](https://spinningup.openai.com/en/latest/_images/math/e62a8971472597f4b014c2da064f636ffe365ba3.svg)

```py
class PPOClip(VanillaPG):
    """PPO with the clipped surrogate objective.

    Only `update_pi` differs from VanillaPG: instead of one policy-gradient
    step, it takes up to `pi_iters` steps on the clipped surrogate loss and
    stops early once the approximate KL divergence exceeds `max_kl`.
    """

    pi_iters = 80   # max gradient steps per batch of data
    eps = 0.2       # clip range for the probability ratio
    max_kl = 0.015  # early-stopping threshold on approximate KL

    def update_pi(s, D: BUFFER_TYPE):
        """Optimize the clipped surrogate, with KL-based early stopping."""
        obs, act, adv = D["o"], D["a"], D["A"]
        logp_old = D["logp"]
        for _ in range(s.pi_iters):
            s.pi_opt.zero_grad()
            dist = s.get_pi(obs)
            logp = s.get_logp(dist, act)

            # Clipped surrogate: min(rho * A, clip(rho) * A), negated to minimize.
            rho = tc.exp(logp - logp_old)
            rho_clipped = tc.clamp(rho, 1 - s.eps, 1 + s.eps)
            loss = -tc.min(rho * adv, rho_clipped * adv).mean()

            # Approximate KL(pi_old || pi), averaged over workers; stop before
            # stepping if the policy has drifted too far from the data policy.
            kl = mpi.avg((logp_old - logp).mean().item())
            if kl > s.max_kl:
                print("kl > max_kl, stopping!")
                break

            loss.backward()
            mpi.avg_grads(s.pi_params)
            s.pi_opt.step()
```

            

Raw data

            {
    "_id": null,
    "home_page": null,
    "name": "minRL",
    "maintainer": null,
    "docs_url": null,
    "requires_python": ">=3.8",
    "maintainer_email": null,
    "keywords": null,
    "author": null,
    "author_email": "Ricky Ding <e0134117@u.nus.edu>",
    "download_url": "https://files.pythonhosted.org/packages/6d/ba/e59d1a0e88665b08bb8b202c420e8e2d39718b00bbd4e866d4ead4969cc4/minrl-0.0.3.tar.gz",
    "platform": null,
    "description": "# minRL: Deep Reinforcement Learning for `minimalists`\r\n\r\n```cmd\r\npip install minRL\r\n```\r\n\r\n```py\r\nimport gymnasium as gym\r\n\r\nfrom minRL.ppo.PPOClip import PPOClip\r\nfrom minRL.utils.nn_utils import mlp\r\nfrom minRL.vpg.VanillaPG import VanillaPG\r\n\r\nenv = gym.make(\"CartPole-v1\")\r\na_space = env.action_space\r\ndiscrete = isinstance(a_space, gym.spaces.Discrete)\r\na_dim = a_space.n if discrete else a_space.shape[0]\r\no_dim = env.observation_space.shape[0]\r\npi_net = mlp([o_dim, 64, 64, a_dim])\r\nV_net = mlp([o_dim, 64, 64, 1])\r\n\r\nUsedPG = VanillaPG if 0 else PPOClip\r\n\r\npg = UsedPG(discrete, a_dim, pi_net, V_net, pi_lr=2e-3)\r\nfor e in range(100):\r\n    pg.train_once(pg.get_D_from_env(env))\r\n\r\n```\r\n\r\n## Intro\r\n\r\n- I am **starting** (`2024-10-19`) to learn RL following [OpenAI SpinningUp](https://github.com/openai/spinningup)\r\n\r\n- As I dig into [the code](https://github.com/openai/spinningup/tree/master/spinup/algos/pytorch/vpg), I find it quite **verbose**, and found [some confusing places (maybe bugs?)](https://github.com/openai/spinningup/issues/424)\r\n\r\n- Coming from a Physics background, I love **minimalism** and **simplicity**... 
hence I made this repo\r\n\r\n## Example comparison: Vanilla Policy Gradient\r\n\r\n- `VPG` by OpenAI SpinningUp:\r\n    - [135 lines @ core.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/vpg/core.py)\r\n        - Contains `combined_shape, mlp, count_vars, discount_cumsum, Actor, MLPCategoricalActor, MLPGaussianActor, MLPCritic, MLPActorCritic`\r\n    - [350 lines @ vpg.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/vpg/vpg.py)\r\n        - Contains `VPGBuffer, vpg`\r\n    - Highly nested classes: `actor-critic -> actor & critic -> mlp`\r\n    - Neural Network models coupled with RL algorithms\r\n    - Key equations scattered all over the place\r\n    - Highly coupled with `gym` interface\r\n\r\n- `VPG` I wrote:\r\n    - [27 lines @ nn_utils.py](./src/minRL/utils/nn_utils.py)\r\n        - Contains `discount_cum_sum, tensor, mlp`\r\n    - [149 lines @ VanillaPG.py](./src/minRL/vpg/VanillaPG.py)\r\n        - Contains `VanillaPG`\r\n    - No highly nested classes\r\n    - Neural Network models **decoupled** from RL algorithms\r\n    - All key equations in one place, following the [pseudo-code below](https://spinningup.openai.com/en/latest/algorithms/vpg.html#pseudocode)\r\n    - **Decoupled** from the `gym` interface\r\n        - This is the most important feature for the RL problem I am interested in, as it avoids back-and-forth interaction with the environment from the RL code, but instead only interacts with Replay Buffer data `D`\r\n        - An optional util function `VanillaPG.get_D_from_env` is provided \r\n    - Correctness verified by [test_pg.py](./src/test_pg.py)\r\n\r\n![](https://spinningup.openai.com/en/latest/_images/math/262538f3077a7be8ce89066abbab523575132996.svg)\r\n\r\n```py\r\nclass VanillaPG:\r\n    def train_once(s, D: BUFFER_TYPE):\r\n        # requires: r, ended, V, V_next, o, a, logp\r\n        D = s.find_R_and_A(D)\r\n        s.update_pi(D)\r\n        s.update_V(D)\r\n\r\n    def 
find_R_and_A(s, D: BUFFER_TYPE):\r\n        # 4, 5: compute R and A\r\n        for k in [\"R\", \"A\"]:\r\n            D[k] = np.zeros_like(D[\"r\"])\r\n        start, N = 0, len(D[\"r\"])\r\n        for i in range(N):\r\n            if D[\"ended\"][i]:\r\n                V_next = D[\"V_next\"][i]\r\n                slc = slice(start, i + 1)\r\n                r = np.append(D[\"r\"][slc], V_next)\r\n                V = np.append(D[\"V\"][slc], V_next)\r\n                # GAE-lambda advantage estimation\r\n                A = r[:-1] + s.gam * V[1:] - V[:-1]\r\n                D[\"A\"][slc] = discount_cum_sum(A, s.gam * s.lam)\r\n                D[\"R\"][slc] = discount_cum_sum(r, s.gam)[:-1]\r\n                start = i + 1\r\n        D[\"A\"] = mpi.normalize(D[\"A\"])\r\n        return {k: tensor(v) for k, v in D.items()}\r\n\r\n    def update_pi(s, D: BUFFER_TYPE):\r\n        # 6, 7: estimate pg and optimize\r\n        o, a, A = D[\"o\"], D[\"a\"], D[\"A\"]\r\n        s.pi_opt.zero_grad()\r\n        pi = s.get_pi(o)\r\n        logp = s.get_logp(pi, a)\r\n        loss = -(logp * A).mean()\r\n        loss.backward()\r\n        mpi.avg_grads(s.pi_params)\r\n        s.pi_opt.step()\r\n\r\n    def update_V(s, D: BUFFER_TYPE):\r\n        # 8: fit V\r\n        o, R = D[\"o\"], D[\"R\"]\r\n        for _ in range(s.V_iters):\r\n            s.V_opt.zero_grad()\r\n            loss = ((s.get_V(o) - R) ** 2).mean()\r\n            loss.backward()\r\n            mpi.avg_grads(s.V_params)\r\n            s.V_opt.step()\r\n```\r\n\r\n## Example comparison: PPO Clip\r\n\r\n- `PPO Clip` by OpenAI SpinningUp:\r\n    - [135 lines @ core.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/core.py)\r\n    - [378 lines @ ppo.py](https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/ppo.py)\r\n- `PPO Clip` I wrote:\r\n    - [32 lines @ 
PPOClip.py](./src/minRL/ppo/PPOClip.py)\r\n\r\n![](https://spinningup.openai.com/en/latest/_images/math/e62a8971472597f4b014c2da064f636ffe365ba3.svg)\r\n\r\n```py\r\nclass PPOClip(VanillaPG):\r\n    pi_iters = 80\r\n    eps = 0.2\r\n    max_kl = 0.015\r\n\r\n    def update_pi(s, D: BUFFER_TYPE):\r\n        o, a, A, logp_old = D[\"o\"], D[\"a\"], D[\"A\"], D[\"logp\"]\r\n        for _ in range(s.pi_iters):\r\n            s.pi_opt.zero_grad()\r\n            pi = s.get_pi(o)\r\n            logp = s.get_logp(pi, a)\r\n\r\n            ratio = tc.exp(logp - logp_old)\r\n            r_clip = tc.clamp(ratio, 1 - s.eps, 1 + s.eps)\r\n            loss = -(tc.min(ratio * A, r_clip * A)).mean()\r\n\r\n            kl = mpi.avg((logp_old - logp).mean().item())\r\n            if kl > s.max_kl:\r\n                print(\"kl > max_kl, stopping!\")\r\n                break\r\n            # ent = pi.entropy().mean().item()\r\n            # clipped = ratio.gt(1 + s.eps) | ratio.lt(1 - s.eps)\r\n            # clip_frac = tensor(clipped).mean().item()\r\n\r\n            loss.backward()\r\n            mpi.avg_grads(s.pi_params)\r\n            s.pi_opt.step()\r\n```\r\n",
    "bugtrack_url": null,
    "license": null,
    "summary": "Deep Reinforcement Learning for minimalists.",
    "version": "0.0.3",
    "project_urls": {
        "Homepage": "https://github.com/tesla-cat/minRL",
        "Issues": "https://github.com/tesla-cat/minRL/issues"
    },
    "split_keywords": [],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "c97a56043daeb051b0450463049a99823856362bc138dc6657df17f6566c1848",
                "md5": "b903789f72c61d2b3ddbf84afe464481",
                "sha256": "d3f99070dd14e43ddf497023303b1ca38f4740891ea6e6e40a77fc5a7683fb57"
            },
            "downloads": -1,
            "filename": "minRL-0.0.3-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "b903789f72c61d2b3ddbf84afe464481",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.8",
            "size": 8338,
            "upload_time": "2024-10-21T08:12:26",
            "upload_time_iso_8601": "2024-10-21T08:12:26.832934Z",
            "url": "https://files.pythonhosted.org/packages/c9/7a/56043daeb051b0450463049a99823856362bc138dc6657df17f6566c1848/minRL-0.0.3-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "6dbae59d1a0e88665b08bb8b202c420e8e2d39718b00bbd4e866d4ead4969cc4",
                "md5": "e5a365141515dfb8fdc67d1fa5b9c1cd",
                "sha256": "65ee7279e1a120c4698874b9de196d29881c1c88a00f2115ef21674a5104009d"
            },
            "downloads": -1,
            "filename": "minrl-0.0.3.tar.gz",
            "has_sig": false,
            "md5_digest": "e5a365141515dfb8fdc67d1fa5b9c1cd",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.8",
            "size": 7463,
            "upload_time": "2024-10-21T08:12:29",
            "upload_time_iso_8601": "2024-10-21T08:12:29.265750Z",
            "url": "https://files.pythonhosted.org/packages/6d/ba/e59d1a0e88665b08bb8b202c420e8e2d39718b00bbd4e866d4ead4969cc4/minrl-0.0.3.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-10-21 08:12:29",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "tesla-cat",
    "github_project": "minRL",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "lcname": "minrl"
}
        
Elapsed time: 0.46615s