# MolTx
[![CI](https://github.com/js-ish/MolTx/actions/workflows/test.yml/badge.svg)](https://github.com/js-ish/MolTx/actions/workflows/test.yml?query=branch%3Amain)
[![Coverage Status](https://coveralls.io/repos/github/js-ish/MolTx/badge.svg?branch=main)](https://coveralls.io/github/js-ish/MolTx?branch=main)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/moltx)
## Installation
```
pip install moltx
```
## Usage
### Pretrain
```python
# Pretraining example: run one optimization step of AdaMR on paired SMILES
# and save the resulting weights.
import torch
# prepare dataset
from moltx import datasets, tokenizers, models
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Pretrain)
ds = datasets.AdaMR(tokenizer=tk, device=torch.device('cpu'))
generic_smiles = ["C=CC=CC=C", "...."]
canonical_smiles = ["c1cccc1c", "..."]
# calling the dataset yields the model inputs (src, tgt) and the training target (out)
src, tgt, out = ds(generic_smiles, canonical_smiles)
# train
import torch.nn as nn
from torch.optim import Adam
from moltx import nets, models
# pick one of the predefined model configs
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMR(conf)
# targets equal to 0 are excluded from the loss
# (presumably 0 is the padding token id -- confirm against the tokenizer)
crt = nn.CrossEntropyLoss(ignore_index=0)
optim = Adam(model.parameters(), lr=0.1)
# one training step: clear gradients, forward, loss, backward, update
optim.zero_grad()
pred = model(src, tgt)
# flatten predictions to (N, C) and targets to (N,), where C is pred's last dimension
loss = crt(pred.view(-1, pred.size(-1)), out.view(-1))
loss.backward()
optim.step()
# save ckpt
torch.save(model.state_dict(), '/path/to/adamr.ckpt')
```
### Finetune
```python
# Classifier finetune
# All imports are listed up front so this snippet runs on its own:
# `models` must be imported before the tokenizer config is referenced, and
# `torch`, `nn` and `Adam` are needed by the dataset, loss and optimizer below.
import torch
import torch.nn as nn
from torch.optim import Adam
from moltx import datasets, models, nets, tokenizers
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Prediction)
seq_len = 256 # max token length of the smiles in the dataset; if None, the max token length of the given smiles is used
ds = datasets.AdaMRClassifier(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1cccc1c", "CC[N+](C)(C)Cc1ccccc1Br"]
labels = [0, 1]
# calling the dataset yields the model inputs (src, tgt) and the training target (out)
src, tgt, out = ds(smiles, labels, seq_len)
pretrained_conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRClassifier(num_classes=2, conf=pretrained_conf)
# start from the checkpoint written by the Pretrain example
model.load_ckpt('/path/to/adamr.ckpt')
crt = nn.CrossEntropyLoss()
optim = Adam(model.parameters(), lr=0.1)
# one training step: clear gradients, forward, loss, backward, update
optim.zero_grad()
pred = model(src, tgt)
loss = crt(pred, out)
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/classifier.ckpt')
# Regression finetune
# Reuses `tk`, `seq_len` and `pretrained_conf` from the classifier section above.
ds = datasets.AdaMRRegression(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1cccc1c", "CC[N+](C)(C)Cc1ccccc1Br"]
values = [0.23, 0.12]
# calling the dataset yields the model inputs (src, tgt) and the training target (out)
src, tgt, out = ds(smiles, values, seq_len)
model = models.AdaMRRegression(conf=pretrained_conf)
# start from the checkpoint written by the Pretrain example
model.load_ckpt('/path/to/adamr.ckpt')
crt = nn.MSELoss()
# An optimizer only updates the parameters it was constructed with, so a new
# one must be created for this new model; reusing the previous `optim` would
# leave the regression model's weights untouched by `optim.step()`.
optim = Adam(model.parameters(), lr=0.1)
# one training step: clear gradients, forward, loss, backward, update
optim.zero_grad()
pred = model(src, tgt)
loss = crt(pred, out)
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/regression.ckpt')
# Distributed Generation
# Generation tasks use their own tokenizer config; `seq_len` and
# `pretrained_conf` are reused from the classifier section above.
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)
ds = datasets.AdaMRDistGeneration(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1cccc1c", "CC[N+](C)(C)Cc1ccccc1Br"]
# calling the dataset yields the model inputs (src, tgt) and the training target (out)
src, tgt, out = ds(smiles, seq_len)
model = models.AdaMRDistGeneration(conf=pretrained_conf)
# start from the checkpoint written by the Pretrain example
model.load_ckpt('/path/to/adamr.ckpt')
# targets equal to 0 are excluded from the loss
# (presumably 0 is the padding token id -- confirm against the tokenizer)
crt = nn.CrossEntropyLoss(ignore_index=0)
# An optimizer only updates the parameters it was constructed with, so a new
# one must be created for this new model; reusing the previous `optim` would
# leave this model's weights untouched by `optim.step()`.
optim = Adam(model.parameters(), lr=0.1)
# one training step: clear gradients, forward, loss, backward, update
optim.zero_grad()
pred = model(src, tgt)
# flatten predictions to (N, C) and targets to (N,), where C is pred's last dimension
loss = crt(pred.view(-1, pred.size(-1)), out.view(-1))
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/distgen.ckpt')
# Goal Generation
# Reuses the Generation tokenizer `tk` from the Distributed Generation section
# and `seq_len` / `pretrained_conf` from the classifier section above.
ds = datasets.AdaMRGoalGeneration(tokenizer=tk, device=torch.device('cpu'))
smiles = ["c1cccc1c", "CC[N+](C)(C)Cc1ccccc1Br"]
goals = [0.23, 0.12]
# calling the dataset yields the model inputs (src, tgt) and the training target (out)
src, tgt, out = ds(smiles, goals, seq_len)
model = models.AdaMRGoalGeneration(conf=pretrained_conf)
# start from the checkpoint written by the Pretrain example
model.load_ckpt('/path/to/adamr.ckpt')
# targets equal to 0 are excluded from the loss
# (presumably 0 is the padding token id -- confirm against the tokenizer)
crt = nn.CrossEntropyLoss(ignore_index=0)
# An optimizer only updates the parameters it was constructed with, so a new
# one must be created for this new model; reusing the previous `optim` would
# leave this model's weights untouched by `optim.step()`.
optim = Adam(model.parameters(), lr=0.1)
# one training step: clear gradients, forward, loss, backward, update
optim.zero_grad()
pred = model(src, tgt)
# flatten predictions to (N, C) and targets to (N,), where C is pred's last dimension
loss = crt(pred.view(-1, pred.size(-1)), out.view(-1))
loss.backward()
optim.step()
torch.save(model.state_dict(), '/path/to/goalgen.ckpt')
```
### Inference
```python
from moltx import nets, models, pipelines, tokenizers
# AdaMR
# Build the model with the same config used for pretraining, load the
# pretrained checkpoint, and wrap tokenizer + model in a pipeline.
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMR(conf)
model.load_ckpt('/path/to/adamr.ckpt')
pipeline = pipelines.AdaMR(tk, model)
# the pipeline is called directly with a SMILES string
pipeline("C=CC=CC=C")
# example output:
# {"smiles": ["c1ccccc1"], "probabilities": [0.9]}
# Classifier
# Prediction tasks use the Prediction tokenizer config.
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Prediction)
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
# the first argument is the number of classes (2, as in the finetune example)
model = models.AdaMRClassifier(2, conf)
# load the checkpoint written by the classifier finetune example
model.load_ckpt('/path/to/classifier.ckpt')
pipeline = pipelines.AdaMRClassifier(tk, model)
pipeline("C=CC=CC=C")
# example output:
# {"label": [1], "probability": [0.67]}
# Regression
# Reuses the Prediction tokenizer `tk` created in the Classifier section above.
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
# The regression model is built from the config alone, exactly as in the
# finetune example (`models.AdaMRRegression(conf=pretrained_conf)`); it has no
# class-count argument, so passing `2` positionally would be taken as the config.
model = models.AdaMRRegression(conf=conf)
# load the checkpoint written by the regression finetune example
model.load_ckpt('/path/to/regression.ckpt')
pipeline = pipelines.AdaMRRegression(tk, model)
pipeline("C=CC=CC=C")
# example output:
# {"value": [0.467], "probability": [0.67]}
# DistGeneration
# Generation tasks use the Generation tokenizer config.
tk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRDistGeneration(conf)
# load the checkpoint written by the distributed generation finetune example
model.load_ckpt('/path/to/distgen.ckpt')
pipeline = pipelines.AdaMRDistGeneration(tk, model)
# no input SMILES is passed; the sample output below lists two results for k=2
pipeline(k=2)
# example output:
# {"smiles": ["c1ccccc1", "...."], "probabilities": [0.9, 0.1]}
# GoalGeneration
# Reuses the Generation tokenizer `tk` created in the DistGeneration section above.
conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE
model = models.AdaMRGoalGeneration(conf)
# load the checkpoint written by the goal generation finetune example
model.load_ckpt('/path/to/goalgen.ckpt')
pipeline = pipelines.AdaMRGoalGeneration(tk, model)
# the first argument is the goal value; the sample output below lists two results for k=2
pipeline(0.48, k=2)
# example output:
# {"smiles": ["c1ccccc1", "...."], "probabilities": [0.9, 0.1]}
```
Raw data
{
"_id": null,
"home_page": null,
"name": "moltx",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.8",
"maintainer_email": null,
"keywords": "molcule, AI, deep learning, transformer",
"author": "Michael Ding",
"author_email": "yandy.ding@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/35/ba/0d73d4cd646d539e14cbdb919e0fdf06ed9682171a9714e49c922619debd/moltx-1.0.2.tar.gz",
"platform": null,
"description": "# MolTx\n\n[![CI](https://github.com/js-ish/MolTx/actions/workflows/test.yml/badge.svg)](https://github.com/js-ish/MolTx/actions/workflows/test.yml?query=branch%3Amain)\n[![Coverage Status](https://coveralls.io/repos/github/js-ish/MolTx/badge.svg?branch=main)](https://coveralls.io/github/js-ish/MolTx?branch=main)\n![PyPI - Python Version](https://img.shields.io/pypi/pyversions/moltx)\n\n## Installation\n\n```\npip install moltx\n```\n\n## Usage\n\n### Pretrain\n\n```python\nimport torch\n\n# prepare dataset\nfrom moltx import datasets, tokenizers, models\ntk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Pretrain)\nds = datasets.AdaMR(tokenizer=tk, device=torch.device('cpu'))\ngeneric_smiles = [\"C=CC=CC=C\", \"....\"]\ncanonical_smiles = [\"c1cccc1c\", \"...\"]\nsrc, tgt, out = ds(generic_smiles, canonical_smiles)\n\n# train\nimport torch.nn as nn\nfrom torch.optim import Adam\nfrom moltx import nets, models\n\n## use custom config\nconf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE\nmodel = models.AdaMR(conf)\n\ncrt = nn.CrossEntropyLoss(ignore_index=0)\noptim = Adam(model.parameters(), lr=0.1)\n\noptim.zero_grad()\npred = model(src, tgt)\nloss = crt(pred.view(-1, pred.size(-1)), out.view(-1))\nloss.backward()\noptim.step()\n\n# save ckpt\ntorch.save(model.state_dict(), '/path/to/adamr.ckpt')\n```\n\n\n### Finetune\n\n\n```python\n# Classifier finetune\nfrom moltx import datasets, tokenizers\ntk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Prediction)\n\nseq_len = 256 # max token lens of smiles in datasets, if None, use max token lens in smiles\nds = datasets.AdaMRClassifier(tokenizer=tk, device=torch.device('cpu'))\nsmiles = [\"c1cccc1c\", \"CC[N+](C)(C)Cc1ccccc1Br\"]\nlabels = [0, 1]\nsrc, tgt, out = ds(smiles, labels, seq_len)\n\nfrom moltx import nets, models\npretrained_conf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE\nmodel = models.AdaMRClassifier(num_classes=2, 
conf=pretrained_conf)\nmodel.load_ckpt('/path/to/adamr.ckpt')\ncrt = nn.CrossEntropyLoss()\noptim = Adam(model.parameters(), lr=0.1)\n\noptim.zero_grad()\npred = model(src, tgt)\nloss = crt(pred, out)\nloss.backward()\noptim.step()\n\ntorch.save(model.state_dict(), '/path/to/classifier.ckpt')\n\n# Regression finetune\nds = datasets.AdaMRRegression(tokenizer=tk, device=torch.device('cpu'))\nsmiles = [\"c1cccc1c\", \"CC[N+](C)(C)Cc1ccccc1Br\"]\nvalues = [0.23, 0.12]\nsrc, tgt, out = ds(smiles, values, seq_len)\n\nmodel = models.AdaMRRegression(conf=pretrained_conf)\nmodel.load_ckpt('/path/to/adamr.ckpt')\ncrt = nn.MSELoss()\n\noptim.zero_grad()\npred = model(src, tgt)\nloss = crt(pred, out)\nloss.backward()\noptim.step()\n\ntorch.save(model.state_dict(), '/path/to/regression.ckpt')\n\n# Distributed Generation\ntk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)\nds = datasets.AdaMRDistGeneration(tokenizer=tk, device=torch.device('cpu'))\nsmiles = [\"c1cccc1c\", \"CC[N+](C)(C)Cc1ccccc1Br\"]\nsrc, tgt, out = ds(smiles, seq_len)\n\nmodel = models.AdaMRDistGeneration(conf=pretrained_conf)\nmodel.load_ckpt('/path/to/adamr.ckpt')\ncrt = nn.CrossEntropyLoss(ignore_index=0)\n\noptim.zero_grad()\npred = model(src, tgt)\nloss = crt(pred.view(-1, pred.size(-1)), out.view(-1))\nloss.backward()\noptim.step()\n\ntorch.save(model.state_dict(), '/path/to/distgen.ckpt')\n\n# Goal Generation\nds = datasets.AdaMRGoalGeneration(tokenizer=tk, device=torch.device('cpu'))\nsmiles = [\"c1cccc1c\", \"CC[N+](C)(C)Cc1ccccc1Br\"]\ngoals = [0.23, 0.12]\nsrc, tgt, out = ds(smiles, goals, seq_len)\n\nmodel = models.AdaMRGoalGeneration(conf=pretrained_conf)\nmodel.load_ckpt('/path/to/adamr.ckpt')\ncrt = nn.CrossEntropyLoss(ignore_index=0)\n\noptim.zero_grad()\npred = model(src, tgt)\nloss = crt(pred.view(-1, pred.size(-1)), out.view(-1))\nloss.backward()\noptim.step()\n\ntorch.save(model.state_dict(), '/path/to/goalgen.ckpt')\n```\n\n### Inference\n\n```python\nfrom 
moltx import nets, models, pipelines, tokenizers\n# AdaMR\ntk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)\nconf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE\nmodel = models.AdaMR(conf)\nmodel.load_ckpt('/path/to/adamr.ckpt')\npipeline = pipelines.AdaMR(tk, model)\npipeline(\"C=CC=CC=C\")\n# {\"smiles\": [\"c1ccccc1\"], probabilities: [0.9]}\n\n# Classifier\ntk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Prediction)\nconf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE\nmodel = models.AdaMRClassifier(2, conf)\nmodel.load_ckpt('/path/to/classifier.ckpt')\npipeline = pipelines.AdaMRClassifier(tk, model)\npipeline(\"C=CC=CC=C\")\n# {\"label\": [1], \"probability\": [0.67]}\n\n# Regression\nconf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE\nmodel = models.AdaMRRegression(2, conf)\nmodel.load_ckpt('/path/to/regression.ckpt')\npipeline = pipelines.AdaMRRegression(tk, model)\npipeline(\"C=CC=CC=C\")\n# {\"value\": [0.467], \"probability\": [0.67]}\n\n# DistGeneration\ntk = tokenizers.MoltxTokenizer.from_pretrain(models.AdaMRTokenizerConfig.Generation)\nconf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE\nmodel = models.AdaMRDistGeneration(conf)\nmodel.load_ckpt('/path/to/distgen.ckpt')\npipeline = pipelines.AdaMRDistGeneration(tk, model)\npipeline(k=2)\n# {\"smiles\": [\"c1ccccc1\", \"....\"], probabilities: [0.9, 0.1]}\n\n# GoalGeneration\nconf = models.AdaMR.CONFIG_LARGE # or models.AdaMR.CONFIG_BASE\nmodel = models.AdaMRGoalGeneration(conf)\nmodel.load_ckpt('/path/to/goalgen.ckpt')\npipeline = pipelines.AdaMRGoalGeneration(tk, model)\npipeline(0.48, k=2)\n# {\"smiles\": [\"c1ccccc1\", \"....\"], probabilities: [0.9, 0.1]}\n```\n",
"bugtrack_url": null,
"license": null,
"summary": "Molcule Transformer X Model",
"version": "1.0.2",
"project_urls": null,
"split_keywords": [
"molcule",
" ai",
" deep learning",
" transformer"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "509cdd8c4dceccaa59559eb5c9a8674db4138f13a3f42f19bd45ef8b20acb122",
"md5": "d61bab35b97e8bbf698512621e969b1d",
"sha256": "11ffd9ba041ee66e2a323bd12e576f32fa9b0659c7bbbbd61b9dd13b6a8431cf"
},
"downloads": -1,
"filename": "moltx-1.0.2-py3-none-any.whl",
"has_sig": false,
"md5_digest": "d61bab35b97e8bbf698512621e969b1d",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.8",
"size": 41469,
"upload_time": "2024-05-13T09:51:22",
"upload_time_iso_8601": "2024-05-13T09:51:22.486920Z",
"url": "https://files.pythonhosted.org/packages/50/9c/dd8c4dceccaa59559eb5c9a8674db4138f13a3f42f19bd45ef8b20acb122/moltx-1.0.2-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "35ba0d73d4cd646d539e14cbdb919e0fdf06ed9682171a9714e49c922619debd",
"md5": "20bd1a508ea77dd5612cd9f1988f8e45",
"sha256": "4f5da606a50ce56e4f85b4df1309932e62ef23e3813045be94f4ee594a6889e6"
},
"downloads": -1,
"filename": "moltx-1.0.2.tar.gz",
"has_sig": false,
"md5_digest": "20bd1a508ea77dd5612cd9f1988f8e45",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.8",
"size": 43488,
"upload_time": "2024-05-13T09:51:24",
"upload_time_iso_8601": "2024-05-13T09:51:24.065989Z",
"url": "https://files.pythonhosted.org/packages/35/ba/0d73d4cd646d539e14cbdb919e0fdf06ed9682171a9714e49c922619debd/moltx-1.0.2.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-05-13 09:51:24",
"github": false,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"lcname": "moltx"
}