# MolTx
[![CI](https://github.com/js-ish/MolTx/actions/workflows/test.yml/badge.svg)](https://github.com/js-ish/MolTx/actions/workflows/test.yml?query=branch%3Amain)
[![Coverage Status](https://coveralls.io/repos/github/js-ish/MolTx/badge.svg?branch=main)](https://coveralls.io/github/js-ish/MolTx?branch=main)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/moltx)
## Installation
```
pip install moltx
```
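To confirm the install, a quick version check via the standard library (nothing here is MolTx-specific):

```python
# minimal install check; prints the installed package version
from importlib.metadata import version

print(version("moltx"))  # e.g. "1.0.3"
```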
## Usage
### Pretrain
```python
import torch
# prepare dataset
from moltx import datasets, tokenizers, models
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Pretrain)
ds = datasets.AdaMR(tokenizer=tk, device=torch.device('cpu'))
generic_smiles = ["C=CC=CC=C", "...."]
canonical_smiles = ["c1ccccc1", "..."]
src, tgt, out = ds(generic_smiles, canonical_smiles)
# train
import torch.nn as nn
from torch.optim import Adam
from moltx import nets, models
# choose a model config
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMR(conf)
crt = nn.CrossEntropyLoss(ignore_index=0)
optim = Adam(model.parameters(), lr=0.1)
optim.zero_grad()
pred = model(src, tgt)
loss = crt(pred.view(-1, pred.size(-1)), out.view(-1))
loss.backward()
optim.step()
# save ckpt
torch.save(model.state_dict(), '/path/to/adamr.ckpt')
```
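The snippet above performs a single optimization step. A full pretraining run loops over epochs and mini-batches; below is a minimal sketch reusing only the calls shown above (the batching of SMILES pairs and the epoch count are placeholders, not part of the MolTx API):

```python
# hypothetical epoch loop; `batches` stands in for however you chunk
# your (generic, canonical) SMILES pairs into mini-batches
batches = [(generic_smiles, canonical_smiles)]

for epoch in range(10):
    for generic, canonical in batches:
        src, tgt, out = ds(generic, canonical)  # tokenize one mini-batch
        optim.zero_grad()
        pred = model(src, tgt)
        loss = crt(pred.view(-1, pred.size(-1)), out.view(-1))
        loss.backward()
        optim.step()
    print(f"epoch {epoch}: loss={loss.item():.4f}")
```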
### Finetune
```python
# Classifier finetune
import torch
import torch.nn as nn
from torch.optim import Adam
from moltx import datasets, tokenizers, models
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Prediction)
seq_len = 256 # max token length of a SMILES in the dataset; if None, the longest tokenized SMILES in the input is used
ds = datasets.AdaMRClassifier(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1ccccc1", "CC[N+](C)(C)Cc1ccccc1Br"]
labels = [0, 1]
src, tgt, out = ds(smiles, labels, seq_len)
pretrained_conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRClassifier(num_classes=2, conf=pretrained_conf)
model.load_ckpt('/path/to/adamr.ckpt')
crt = nn.CrossEntropyLoss()
optim = Adam(model.parameters(), lr=0.1)
optim.zero_grad()
pred = model(src, tgt)
loss = crt(pred, out)
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/classifier.ckpt')
# Regression finetune
ds = datasets.AdaMRRegression(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1ccccc1", "CC[N+](C)(C)Cc1ccccc1Br"]
values = [0.23, 0.12]
src, tgt, out = ds(smiles, values, seq_len)
model = models.AdaMRRegression(conf=pretrained_conf)
model.load_ckpt('/path/to/adamr.ckpt')
crt = nn.MSELoss()
optim = Adam(model.parameters(), lr=0.1)
optim.zero_grad()
pred = model(src, tgt)
loss = crt(pred, out)
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/regression.ckpt')
# Distribution generation finetune
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)
ds = datasets.AdaMRDistGeneration(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1ccccc1", "CC[N+](C)(C)Cc1ccccc1Br"]
src, tgt, out = ds(smiles, seq_len)
model = models.AdaMRDistGeneration(conf=pretrained_conf)
model.load_ckpt('/path/to/adamr.ckpt')
crt = nn.CrossEntropyLoss(ignore_index=0)
optim = Adam(model.parameters(), lr=0.1)
optim.zero_grad()
pred = model(src, tgt)
loss = crt(pred.view(-1, pred.size(-1)), out.view(-1))
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/distgen.ckpt')
# Goal generation finetune
ds = datasets.AdaMRGoalGeneration(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1ccccc1", "CC[N+](C)(C)Cc1ccccc1Br"]
goals = [0.23, 0.12]
src, tgt, out = ds(smiles, goals, seq_len)
model = models.AdaMRGoalGeneration(conf=pretrained_conf)
model.load_ckpt('/path/to/adamr.ckpt')
crt = nn.CrossEntropyLoss(ignore_index=0)
optim = Adam(model.parameters(), lr=0.1)
optim.zero_grad()
pred = model(src, tgt)
loss = crt(pred.view(-1, pred.size(-1)), out.view(-1))
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/goalgen.ckpt')
```
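All of the finetune snippets build tensors on the CPU. To train on a GPU, the `device` argument of the dataset plus the standard PyTorch `.to()` call should be enough; a sketch, assuming the AdaMR models are ordinary `nn.Module`s:

```python
# hypothetical GPU setup; verify against the library before relying on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ds = datasets.AdaMRClassifier(tokenizer=tk, device=device)  # tensors created on `device`
model = models.AdaMRClassifier(num_classes=2, conf=pretrained_conf)
model.load_ckpt('/path/to/adamr.ckpt')
model = model.to(device)  # move parameters to the same device as the data
```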
### Inference
```python
from moltx import nets, models, pipelines, tokenizers
# AdaMR
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMR(conf)
model.load_ckpt('/path/to/adamr.ckpt')
pipeline = pipelines.AdaMR(tk, model)
pipeline("C=CC=CC=C")
# {"smiles": ["c1ccccc1"], probabilities: [0.9]}
# Classifier
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Prediction)
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRClassifier(2, conf)
model.load_ckpt('/path/to/classifier.ckpt')
pipeline = pipelines.AdaMRClassifier(tk, model)
pipeline("C=CC=CC=C")
# {"label": [1], "probability": [0.67]}
# Regression
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRRegression(conf)
model.load_ckpt('/path/to/regression.ckpt')
pipeline = pipelines.AdaMRRegression(tk, model)
pipeline("C=CC=CC=C")
# {"value": [0.467], "probability": [0.67]}
# DistGeneration
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRDistGeneration(conf)
model.load_ckpt('/path/to/distgen.ckpt')
pipeline = pipelines.AdaMRDistGeneration(tk, model)
pipeline(k=2)
# {"smiles": ["c1ccccc1", "...."], probabilities: [0.9, 0.1]}
# GoalGeneration
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRGoalGeneration(conf)
model.load_ckpt('/path/to/goalgen.ckpt')
pipeline = pipelines.AdaMRGoalGeneration(tk, model)
pipeline(0.48, k=2)
# {"smiles": ["c1ccccc1", "...."], probabilities: [0.9, 0.1]}
```
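The examples above call each pipeline with a single input. Scoring a list of SMILES is a plain Python loop; a sketch, assuming `pipeline` is the `AdaMRClassifier` pipeline constructed above:

```python
# hypothetical batch scoring; each result is a dict like
# {"label": [...], "probability": [...]}
smiles_list = ["C=CC=CC=C", "CC[N+](C)(C)Cc1ccccc1Br"]
results = [pipeline(s) for s in smiles_list]
for s, r in zip(smiles_list, results):
    print(s, r)
```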