# antidb
## Quick start
```
pip3 install antidb
```
```
from antidb import (Idx,
Prs,
count_exec_time)
__version__ = 'v1.0.0'
# File to index; this example points at a .zst dbSNP VCF.
dbsnp_vcf_path = '/path/to/GCF_000001405.40.zst'
# Prefix shared by the indexer and the parser below
# (presumably identifies this particular index of the file - confirm).
dbsnp_idx_prefix = 'all_rsids'
# The third argument maps one line of the file to its index key: here the
# 3rd tab-separated field, which holds the rsID in the sample output below.
dbsnp_idx = Idx(dbsnp_vcf_path,
dbsnp_idx_prefix,
lambda dbsnp_zst_line:
dbsnp_zst_line.split('\t')[2])
dbsnp_idx.idx()
# The parser is created with the same file path and index prefix as the indexer.
dbsnp_prs = Prs(dbsnp_vcf_path,
dbsnp_idx_prefix)
@count_exec_time
def get_rsid_lines(dbsnp_prs):
    """Print every indexed dbSNP line found for three sample rsIDs.

    Args:
        dbsnp_prs: parser object whose ``prs()`` method accepts a list of
            keys and yields the matching lines.
    """
    for dbsnp_zst_line in dbsnp_prs.prs(['rs1009150',
                                         'rs12044852',
                                         'rs4902496']):
        print(dbsnp_zst_line)


# The decorated call evaluates to a (function name, elapsed time) pair,
# as shown on the last line of the sample output below.
print(get_rsid_lines(dbsnp_prs))
```
```
NC_000022.11 36306254 rs1009150 C T . . RS=1009150;dbSNPBuildID=86;SSR=0;GENEINFO=MYH9:4627;VC=SNV;PUB;INT;GNO;FREQ=1000Genomes:0.569,0.431|ALSPAC:0.2906,0.7094|Estonian:0.269,0.731|GENOME_DK:0.35,0.65|GnomAD:0.4415,0.5585|GoNL:0.3126,0.6874|HapMap:0.5881,0.4119|KOREAN:0.7334,0.2666|MGP:0.8652,0.1348|NorthernSweden:0.315,0.685|Qatari:0.5463,0.4537|SGDP_PRJ:0.2929,0.7071|Siberian:0.3043,0.6957|TOMMO:0.7117,0.2883|TOPMED:0.4596,0.5404|TWINSUK:0.2869,0.7131|dbGaP_PopFreq:0.3304,0.6696;COMMON;CLNVI=.,;CLNORIGIN=.,1;CLNSIG=.,2;CLNDISDB=.,MedGen:CN517202;CLNDN=.,not_provided;CLNREVSTAT=.,single;CLNACC=.,RCV001695529.1;CLNHGVS=NC_000022.11:g.36306254=,NC_000022.11:g.36306254C>T
NC_000001.11 116545157 rs12044852 C A . . RS=12044852;dbSNPBuildID=120;SSR=0;GENEINFO=CD58:965|LOC105378925:105378925;VC=SNV;PUB;INT;GNO;FREQ=1000Genomes:0.7473,0.2527|ALSPAC:0.8957,0.1043|Chileans:0.7396,0.2604|Estonian:0.9125,0.0875|GENOME_DK:0.875,0.125|GnomAD:0.8826,0.1174|GoNL:0.9078,0.09218|HapMap:0.787,0.213|KOREAN:0.3945,0.6055|Korea1K:0.3892,0.6108|NorthernSweden:0.895,0.105|PRJEB37584:0.439,0.561|Qatari:0.8704,0.1296|SGDP_PRJ:0.3373,0.6627|Siberian:0.3846,0.6154|TOMMO:0.4146,0.5854|TOPMED:0.8671,0.1329|TWINSUK:0.8972,0.1028|Vietnamese:0.4486,0.5514|dbGaP_PopFreq:0.8864,0.1136;COMMON
NC_000014.9 67588896 rs4902496 C G,T . . RS=4902496;dbSNPBuildID=111;SSR=0;GENEINFO=PIGH:5283|GPHN:10243|PLEKHH1:57475;VC=SNV;PUB;U3;INT;R3;GNO;FREQ=1000Genomes:0.3357,0.6643,.|ALSPAC:0.2019,0.7981,.|Estonian:0.1518,0.8482,.|GENOME_DK:0.125,0.875,.|GoNL:0.1703,0.8297,.|HapMap:0.3639,0.6361,.|KOREAN:0.3399,0.6601,.|MGP:0.3558,0.6442,.|NorthernSweden:0.1817,0.8183,.|Qatari:0.2176,0.7824,.|SGDP_PRJ:0.189,0.811,.|Siberian:0.1429,0.8571,.|TOMMO:0.2816,0.7184,.|TOPMED:0.285,0.715,.|TWINSUK:0.1888,0.8112,.|Vietnamese:0.4533,0.5467,.|dbGaP_PopFreq:0.2712,0.7288,0;COMMON
('get_rsid_lines', '0:00:00.007858')
```
## App example
### Bioinformatic annotator template
```
# autopep8: off
import sys; sys.dont_write_bytecode = True
# autopep8: on
import json
import os
from argparse import ArgumentParser
from datetime import datetime
from antidb import (Idx,
Prs,
count_exec_time)
__version__ = 'v1.0.0'
def parse_dbsnp_line(dbsnp_zst_line):
    """Return the index key for one dbSNP VCF line, or None to skip it.

    Only lines containing both the substrings 'GnomAD' and 'CLN' are
    indexed; for those the 3rd tab-separated field (the rsID column)
    is returned.

    Args:
        dbsnp_zst_line: one text line of the dbSNP VCF.

    Returns:
        The rsID string, or None when the line lacks either substring.
    """
    if 'GnomAD' in dbsnp_zst_line \
            and 'CLN' in dbsnp_zst_line:
        return dbsnp_zst_line.split('\t')[2]
    return None
def parse_rsmerged_line(rsmerged_zst_line):
    """Return all rsIDs named by one refsnp-merged JSON line.

    The line is decoded as JSON; the result is the record's own
    ``refsnp_id`` followed by every ID listed under
    ``merged_snapshot_data.merged_into``, each prefixed with 'rs'.

    Args:
        rsmerged_zst_line: one JSON-encoded line of the refsnp-merged file.

    Returns:
        A list of 'rs'-prefixed ID strings, the record's own ID first.
    """
    rsmerged_zst_obj = json.loads(rsmerged_zst_line)
    return [f'rs{rsid}'
            for rsid in ([rsmerged_zst_obj['refsnp_id']] +
                         rsmerged_zst_obj['merged_snapshot_data']['merged_into'])]
def rsid_to_coords(rsid, dbsnp_prs,
                   rsmerged_prs, parse_rsmerged_line):
    """Find the first indexed dbSNP line for an rsID or one of its synonyms.

    The rsID is looked up directly first. If nothing is found, each
    refsnp-merged record matching the rsID is expanded into its list of
    synonymous rsIDs, and those are looked up instead.

    Args:
        rsid: rsID to search for.
        dbsnp_prs: parser over the indexed dbSNP VCF.
        rsmerged_prs: parser over the indexed refsnp-merged JSON.
        parse_rsmerged_line: callable turning a refsnp-merged line into a
            list of rsIDs.

    Returns:
        The first matching dbSNP line, or None when neither the rsID nor
        any synonym is found.
    """
    # Direct hit: return the first line yielded for the rsID itself.
    for dbsnp_zst_line in dbsnp_prs.prs(rsid):
        return dbsnp_zst_line
    # Fallback: retry with the synonyms from each matching merge record.
    for rsmerged_zst_line in rsmerged_prs.prs(rsid):
        rsid_syns = parse_rsmerged_line(rsmerged_zst_line)
        for dbsnp_zst_line in dbsnp_prs.prs(rsid_syns):
            return dbsnp_zst_line
    return None
# Command-line interface: three input paths and a results directory are
# required; the rsIDs column number is optional and defaults to 1.
arg_parser = ArgumentParser()
arg_parser.add_argument('-S', '--ann-file-path', required=True, metavar='str', dest='ann_file_path', type=str,
help='Path to table with rsIDs column (uncompressed)')
arg_parser.add_argument('-D', '--dbsnp-file-path', required=True, metavar='str', dest='dbsnp_file_path', type=str,
help='Path to official dbSNP VCF (uncompressed or compressed via Seekable zstd)')
arg_parser.add_argument('-R', '--rsmerged-file-path', required=True, metavar='str', dest='rsmerged_file_path', type=str,
help='Path to official refsnp-merged JSON (uncompressed or compressed via Seekable zstd)')
arg_parser.add_argument('-T', '--trg-dir-path', required=True, metavar='str', dest='trg_dir_path', type=str,
help='Path to directory for results')
arg_parser.add_argument('-c', '--rsids-col-num', metavar='1', default=1, dest='rsids_col_num', type=int,
help='rsIDs-column number in source table')
args = arg_parser.parse_args()
# Index the dbSNP VCF; parse_dbsnp_line supplies the key for each line.
dbsnp_idx = Idx(args.dbsnp_file_path,
'rsids__gnomad_cln',
parse_dbsnp_line)
dbsnp_idx.idx()
# Index the refsnp-merged JSON; parse_rsmerged_line supplies a list of keys per line.
rsmerged_idx = Idx(args.rsmerged_file_path,
'rsids',
parse_rsmerged_line)
rsmerged_idx.idx()
# Collect the per-index `perf` attributes; they are dumped to JSON at the end of the script.
perf = {'dbsnp_idx': dbsnp_idx.perf,
'rsmerged_idx': rsmerged_idx.perf}
# Each parser is created with the same file path and index prefix as its indexer.
dbsnp_prs = Prs(args.dbsnp_file_path,
'rsids__gnomad_cln')
rsmerged_prs = Prs(args.rsmerged_file_path,
'rsids')
@count_exec_time
def ann(args, res_files_crt_time, dbsnp_prs, rsmerged_prs, parse_rsmerged_line):
    """Annotate a table of rsIDs with the matching dbSNP lines.

    Reads ``args.ann_file_path`` line by line and writes two files into
    ``args.trg_dir_path``:

    - ``ann_res_<time>.txt``: each source line followed by a tab and the
      dbSNP line found for its rsID;
    - ``ann_dump_<time>.txt``: source lines for which nothing was found.

    Args:
        args: parsed command-line arguments; ``ann_file_path``,
            ``trg_dir_path`` and ``rsids_col_num`` are read.
        res_files_crt_time: value embedded in both result file names.
        dbsnp_prs: parser over the indexed dbSNP VCF.
        rsmerged_prs: parser over the indexed refsnp-merged JSON.
        parse_rsmerged_line: callable turning a refsnp-merged line into a
            list of rsIDs.
    """
    trg_file_path = os.path.join(args.trg_dir_path,
                                 f'ann_res_{res_files_crt_time}.txt')
    dump_file_path = os.path.join(args.trg_dir_path,
                                  f'ann_dump_{res_files_crt_time}.txt')
    with open(args.ann_file_path) as ann_file_opened:
        with open(trg_file_path, 'w') as trg_file_opened:
            with open(dump_file_path, 'w') as dump_file_opened:
                for ann_file_line in ann_file_opened:
                    # Lines starting with '#' are skipped entirely.
                    if ann_file_line.startswith('#'):
                        continue
                    ann_file_line = ann_file_line.rstrip()
                    # rsids_col_num is 1-based on the command line.
                    ann_rsid = ann_file_line.split('\t')[args.rsids_col_num - 1]
                    dbsnp_zst_line = rsid_to_coords(ann_rsid,
                                                    dbsnp_prs,
                                                    rsmerged_prs,
                                                    parse_rsmerged_line)
                    if dbsnp_zst_line:
                        # No newline is appended here: the dbSNP line is
                        # presumably still newline-terminated - confirm.
                        trg_file_opened.write(ann_file_line + '\t' +
                                              dbsnp_zst_line)
                    else:
                        dump_file_opened.write(ann_file_line + '\n')
# One timestamp is shared by the result, dump and performance file names.
res_files_crt_time = datetime.now()

# The decorated ann() call is indexed with [1] to keep only the
# elapsed-time element of what it returns.
perf['ann'] = ann(args,
                  res_files_crt_time,
                  dbsnp_prs,
                  rsmerged_prs,
                  parse_rsmerged_line)[1]

# Save all collected timings next to the annotation results.
perf_file_path = os.path.join(args.trg_dir_path,
                              f'ann_perf_{res_files_crt_time}.json')
with open(perf_file_path, 'w') as perf_file_opened:
    json.dump(perf, perf_file_opened, indent=4)
```
#### Performance measurement results
##### ann_perf_2023-07-09 20:45:36.102376.json
- `dbsnp_idx` - indexing `GnomAD`- and `CLN`-containing lines of dbSNP VCF;
    - `crt_db_zst` - compressing the indexable file (the output is hereafter called "DB");
    - `crt_full_idx_tmp` - indexing the DB (the output is hereafter called "temporary full index");
    - `crt_full_idx_tmp_srtd` - sorting the temporary full index by indexed DB elements;
    - `crt_full_idx` - compressing the sorted temporary full index (the output is hereafter called "full index");
    - `crt_mem_idx` - selective indexing of the full index;
- `rsmerged_idx` - indexing all lines of rsmerged JSON;
    - <...>
- `ann` - querying 2842 rsIDs against the indexed dbSNP VCF and the indexed rsmerged JSON.
```
{
"dbsnp_idx": [
[
"crt_db_zst",
"0:39:02.127938"
],
[
"crt_full_idx_tmp",
"1:06:13.698458"
],
[
"crt_full_idx_tmp_srtd",
"0:00:00.928633"
],
[
"crt_full_idx",
"0:00:00.577710"
],
[
"crt_mem_idx",
"0:00:00.280014"
]
],
"rsmerged_idx": [
[
"crt_db_zst",
"0:02:44.068920"
],
[
"crt_full_idx_tmp",
"0:04:43.153807"
],
[
"crt_full_idx_tmp_srtd",
"0:00:30.015826"
],
[
"crt_full_idx",
"0:00:17.204649"
],
[
"crt_mem_idx",
"0:00:08.811190"
]
],
"ann": "0:00:06.995505"
}
```
Raw data
{
"_id": null,
"home_page": null,
"name": "antidb",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.10",
"maintainer_email": null,
"keywords": "python, search-engine, parser, performance, bioinformatics, big-data, dbms, indexer, zstd, zstandard, seekable, pyzstd",
"author": null,
"author_email": "Platon Bykadorov <platon.work@gmail.com>",
"download_url": "https://files.pythonhosted.org/packages/7e/99/e0ca46def41789079d85b6d1b68ea95bed7bc6896bd8a20e1c59ec516c88/antidb-2024.4.6.tar.gz",
"platform": null,
"description": "# antidb\n## Quick start\n```\npip3 install antidb\n```\n```\nfrom antidb import (Idx,\n Prs,\n count_exec_time)\n\n__version__ = 'v1.0.0'\n\ndbsnp_vcf_path = '/path/to/GCF_000001405.40.zst'\ndbsnp_idx_prefix = 'all_rsids'\ndbsnp_idx = Idx(dbsnp_vcf_path,\n dbsnp_idx_prefix,\n lambda dbsnp_zst_line:\n dbsnp_zst_line.split('\\t')[2])\ndbsnp_idx.idx()\ndbsnp_prs = Prs(dbsnp_vcf_path,\n dbsnp_idx_prefix)\n\n\n@count_exec_time\ndef get_rsid_lines(dbsnp_prs):\n for dbsnp_zst_line in dbsnp_prs.prs(['rs1009150',\n 'rs12044852',\n 'rs4902496']):\n print(dbsnp_zst_line)\n\n\nprint(get_rsid_lines(dbsnp_prs))\n```\n```\nNC_000022.11 36306254 rs1009150 C T . . RS=1009150;dbSNPBuildID=86;SSR=0;GENEINFO=MYH9:4627;VC=SNV;PUB;INT;GNO;FREQ=1000Genomes:0.569,0.431|ALSPAC:0.2906,0.7094|Estonian:0.269,0.731|GENOME_DK:0.35,0.65|GnomAD:0.4415,0.5585|GoNL:0.3126,0.6874|HapMap:0.5881,0.4119|KOREAN:0.7334,0.2666|MGP:0.8652,0.1348|NorthernSweden:0.315,0.685|Qatari:0.5463,0.4537|SGDP_PRJ:0.2929,0.7071|Siberian:0.3043,0.6957|TOMMO:0.7117,0.2883|TOPMED:0.4596,0.5404|TWINSUK:0.2869,0.7131|dbGaP_PopFreq:0.3304,0.6696;COMMON;CLNVI=.,;CLNORIGIN=.,1;CLNSIG=.,2;CLNDISDB=.,MedGen:CN517202;CLNDN=.,not_provided;CLNREVSTAT=.,single;CLNACC=.,RCV001695529.1;CLNHGVS=NC_000022.11:g.36306254=,NC_000022.11:g.36306254C>T\n\nNC_000001.11 116545157 rs12044852 C A . . RS=12044852;dbSNPBuildID=120;SSR=0;GENEINFO=CD58:965|LOC105378925:105378925;VC=SNV;PUB;INT;GNO;FREQ=1000Genomes:0.7473,0.2527|ALSPAC:0.8957,0.1043|Chileans:0.7396,0.2604|Estonian:0.9125,0.0875|GENOME_DK:0.875,0.125|GnomAD:0.8826,0.1174|GoNL:0.9078,0.09218|HapMap:0.787,0.213|KOREAN:0.3945,0.6055|Korea1K:0.3892,0.6108|NorthernSweden:0.895,0.105|PRJEB37584:0.439,0.561|Qatari:0.8704,0.1296|SGDP_PRJ:0.3373,0.6627|Siberian:0.3846,0.6154|TOMMO:0.4146,0.5854|TOPMED:0.8671,0.1329|TWINSUK:0.8972,0.1028|Vietnamese:0.4486,0.5514|dbGaP_PopFreq:0.8864,0.1136;COMMON\n\nNC_000014.9 67588896 rs4902496 C G,T . . 
RS=4902496;dbSNPBuildID=111;SSR=0;GENEINFO=PIGH:5283|GPHN:10243|PLEKHH1:57475;VC=SNV;PUB;U3;INT;R3;GNO;FREQ=1000Genomes:0.3357,0.6643,.|ALSPAC:0.2019,0.7981,.|Estonian:0.1518,0.8482,.|GENOME_DK:0.125,0.875,.|GoNL:0.1703,0.8297,.|HapMap:0.3639,0.6361,.|KOREAN:0.3399,0.6601,.|MGP:0.3558,0.6442,.|NorthernSweden:0.1817,0.8183,.|Qatari:0.2176,0.7824,.|SGDP_PRJ:0.189,0.811,.|Siberian:0.1429,0.8571,.|TOMMO:0.2816,0.7184,.|TOPMED:0.285,0.715,.|TWINSUK:0.1888,0.8112,.|Vietnamese:0.4533,0.5467,.|dbGaP_PopFreq:0.2712,0.7288,0;COMMON\n\n('get_rsid_lines', '0:00:00.007858')\n```\n\n## App example\n### Bioinformatic annotator template\n```\n# autopep8: off\nimport sys; sys.dont_write_bytecode = True\n# autopep8: on\nimport json\nimport os\nfrom argparse import ArgumentParser\nfrom datetime import datetime\nfrom antidb import (Idx,\n Prs,\n count_exec_time)\n\n__version__ = 'v1.0.0'\n\n\ndef parse_dbsnp_line(dbsnp_zst_line):\n if 'GnomAD' in dbsnp_zst_line \\\n and 'CLN' in dbsnp_zst_line:\n return dbsnp_zst_line.split('\\t')[2]\n return None\n\n\ndef parse_rsmerged_line(rsmerged_zst_line):\n rsmerged_zst_obj = json.loads(rsmerged_zst_line)\n rsids = list(map(lambda rsid: f'rs{rsid}',\n ([rsmerged_zst_obj['refsnp_id']] +\n rsmerged_zst_obj['merged_snapshot_data']['merged_into'])))\n return rsids\n\n\ndef rsid_to_coords(rsid, dbsnp_prs,\n rsmerged_prs, parse_rsmerged_line):\n for dbsnp_zst_line in dbsnp_prs.prs(rsid):\n return dbsnp_zst_line\n for rsmerged_zst_line in rsmerged_prs.prs(rsid):\n rsid_syns = parse_rsmerged_line(rsmerged_zst_line)\n for dbsnp_zst_line in dbsnp_prs.prs(rsid_syns):\n return dbsnp_zst_line\n return None\n\n\narg_parser = ArgumentParser()\narg_parser.add_argument('-S', '--ann-file-path', required=True, metavar='str', dest='ann_file_path', type=str,\n help='Path to table with rsIDs column (uncompressed)')\narg_parser.add_argument('-D', '--dbsnp-file-path', required=True, metavar='str', dest='dbsnp_file_path', type=str,\n help='Path to official dbSNP VCF 
(uncompressed or compressed via Seekable zstd)')\narg_parser.add_argument('-R', '--rsmerged-file-path', required=True, metavar='str', dest='rsmerged_file_path', type=str,\n help='Path to official refsnp-merged JSON (uncompressed or compressed via Seekable zstd)')\narg_parser.add_argument('-T', '--trg-dir-path', required=True, metavar='str', dest='trg_dir_path', type=str,\n help='Path to directory for results')\narg_parser.add_argument('-c', '--rsids-col-num', metavar='1', default=1, dest='rsids_col_num', type=int,\n help='rsIDs-column number in source table')\nargs = arg_parser.parse_args()\n\ndbsnp_idx = Idx(args.dbsnp_file_path,\n 'rsids__gnomad_cln',\n parse_dbsnp_line)\ndbsnp_idx.idx()\nrsmerged_idx = Idx(args.rsmerged_file_path,\n 'rsids',\n parse_rsmerged_line)\nrsmerged_idx.idx()\nperf = {'dbsnp_idx': dbsnp_idx.perf,\n 'rsmerged_idx': rsmerged_idx.perf}\ndbsnp_prs = Prs(args.dbsnp_file_path,\n 'rsids__gnomad_cln')\nrsmerged_prs = Prs(args.rsmerged_file_path,\n 'rsids')\n\n\n@count_exec_time\ndef ann(args, res_files_crt_time, dbsnp_prs, rsmerged_prs, parse_rsmerged_line):\n trg_file_path = os.path.join(args.trg_dir_path,\n f'ann_res_{res_files_crt_time}.txt')\n dump_file_path = os.path.join(args.trg_dir_path,\n f'ann_dump_{res_files_crt_time}.txt')\n with open(args.ann_file_path) as ann_file_opened:\n with open(trg_file_path, 'w') as trg_file_opened:\n with open(dump_file_path, 'w') as dump_file_opened:\n for ann_file_line in ann_file_opened:\n if ann_file_line.startswith('#'):\n continue\n ann_file_line = ann_file_line.rstrip()\n ann_rsid = ann_file_line.split('\\t')[args.rsids_col_num - 1]\n dbsnp_zst_line = rsid_to_coords(ann_rsid,\n dbsnp_prs,\n rsmerged_prs,\n parse_rsmerged_line)\n if dbsnp_zst_line:\n trg_file_opened.write(ann_file_line + '\\t' +\n dbsnp_zst_line)\n else:\n dump_file_opened.write(ann_file_line + '\\n')\n\n\nres_files_crt_time = datetime.now()\n\nperf['ann'] = ann(args,\n res_files_crt_time,\n dbsnp_prs,\n rsmerged_prs,\n 
parse_rsmerged_line)[1]\n\nperf_file_path = os.path.join(args.trg_dir_path,\n f'ann_perf_{res_files_crt_time}.json')\nwith open(perf_file_path, 'w') as perf_file_opened:\n json.dump(perf, perf_file_opened, indent=4)\n```\n\n#### Performance measurement results\n##### ann_perf_2023-07-09 20:45:36.102376.json\n- `dbsnp_idx` - indexing `GnomAD`- and `CLN`-containing lines of dbSNP VCF;\n - `crt_db_zst` - compressing indexable file (output is further called \"DB\");\n - `crt_full_idx_tmp` - indexing DB (output is further called \"temporary full index\");\n - `crt_full_idx_tmp_srtd` - sorting temporary full index by indexed DB elements;\n - `crt_full_idx` - compressing sorted temporary full index (output is further called \"full index\");\n - `crt_mem_idx` - selective indexing of full index;\n- `rsmerged_idx` - indexing all lines of rsmerged JSON;\n - <...>\n- `ann` - querying 2842 rsIDs by indexed dbSNP VCF and indexed rsmerged JSON.\n```\n{\n \"dbsnp_idx\": [\n [\n \"crt_db_zst\",\n \"0:39:02.127938\"\n ],\n [\n \"crt_full_idx_tmp\",\n \"1:06:13.698458\"\n ],\n [\n \"crt_full_idx_tmp_srtd\",\n \"0:00:00.928633\"\n ],\n [\n \"crt_full_idx\",\n \"0:00:00.577710\"\n ],\n [\n \"crt_mem_idx\",\n \"0:00:00.280014\"\n ]\n ],\n \"rsmerged_idx\": [\n [\n \"crt_db_zst\",\n \"0:02:44.068920\"\n ],\n [\n \"crt_full_idx_tmp\",\n \"0:04:43.153807\"\n ],\n [\n \"crt_full_idx_tmp_srtd\",\n \"0:00:30.015826\"\n ],\n [\n \"crt_full_idx\",\n \"0:00:17.204649\"\n ],\n [\n \"crt_mem_idx\",\n \"0:00:08.811190\"\n ]\n ],\n \"ann\": \"0:00:06.995505\"\n}\n```\n\n",
"bugtrack_url": null,
"license": null,
"summary": "The simplest index-and-search engine for huge multiline text files. Focused primarily on bioinformatics. Inspired by tabix, but isn't its replacement. Written in Python. Works on top of Zstandard Seekable & pyzstd SeekableZstdFile.",
"version": "2024.4.6",
"project_urls": {
"Bug Tracker": "https://github.com/PlatonB/antidb/issues",
"Homepage": "https://github.com/PlatonB/antidb"
},
"split_keywords": [
"python",
" search-engine",
" parser",
" performance",
" bioinformatics",
" big-data",
" dbms",
" indexer",
" zstd",
" zstandard",
" seekable",
" pyzstd"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "811e2675c0b35370c7136857efac5df17dfcf8cb23d4fe5266a1080b6cde4d21",
"md5": "68bd1af8dcd40d6831aec13f6e467db7",
"sha256": "b1c570978ce52ff9e91e5e9b2ed743ece640981b6424bbdda75b6100486b7554"
},
"downloads": -1,
"filename": "antidb-2024.4.6-py3-none-any.whl",
"has_sig": false,
"md5_digest": "68bd1af8dcd40d6831aec13f6e467db7",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.10",
"size": 8856,
"upload_time": "2024-04-06T21:27:31",
"upload_time_iso_8601": "2024-04-06T21:27:31.666857Z",
"url": "https://files.pythonhosted.org/packages/81/1e/2675c0b35370c7136857efac5df17dfcf8cb23d4fe5266a1080b6cde4d21/antidb-2024.4.6-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "7e99e0ca46def41789079d85b6d1b68ea95bed7bc6896bd8a20e1c59ec516c88",
"md5": "2c01e81ca26df7ebdf4b51a3d25d9d75",
"sha256": "58ca185549c2bad6d26d66f005c42d2d5d730a0f03f3a996a74904c4c065ace1"
},
"downloads": -1,
"filename": "antidb-2024.4.6.tar.gz",
"has_sig": false,
"md5_digest": "2c01e81ca26df7ebdf4b51a3d25d9d75",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.10",
"size": 10290,
"upload_time": "2024-04-06T21:27:33",
"upload_time_iso_8601": "2024-04-06T21:27:33.543039Z",
"url": "https://files.pythonhosted.org/packages/7e/99/e0ca46def41789079d85b6d1b68ea95bed7bc6896bd8a20e1c59ec516c88/antidb-2024.4.6.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-04-06 21:27:33",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "PlatonB",
"github_project": "antidb",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"lcname": "antidb"
}