xv


Namexv JSON
Version 0.0.6 PyPI version JSON
download
home_pagehttps://github.com/thorwhalen/xv
SummaryAccess to arxiv data
upload_time2024-01-23 12:50:48
maintainer
docs_urlNone
authorThor Whalen
requires_python
licensemit
keywords
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
            # xv

Access to arxiv data

To install:	```pip install xv```


# Examples


```python
from xv import *
```

## Raw store

At the point of writing this, my attempts enable `graze` to automatically confirm download in the googledrive downloads (which, when downloading too-big files, will tell the user it can't scan the file and ask the user to confirm the download).

Therefore, the following files need to be downloaded manually:
* **titles**: https://drive.google.com/file/d/1Ul5mPePtoPKHZkH5Rm6dWKAO11dG98GN/view?usp=share_link
* **abstracts**: https://drive.google.com/file/d/1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ/view?usp=share_link

(If those urls don't work, perhaps they were updated: See here: https://alex.macrocosm.so/download .)

You can then copy them over to the place graze will look for by doing:

```python
from pathlib import Path
from xv.util import Graze
from xv.data_access import urls


g[urls['titles']] = Path('TITLES_DATA_LOCAL_FILEPATH').read_bytes()
g[urls['abstracts']] = Path('ABSTRACTS_DATA_LOCAL_FILEPATH').read_bytes()
```


```python
# from imbed.mdat.arxiv import urls
# from pathlib import Path

# g[urls['titles']] = Path('FILE_WHERE_YOU_DOWNLOADED_TITLES_DATA').read_bytes()
# g[urls['abstracts']] = Path('FILE_WHERE_YOU_DOWNLOADED_TITLES_DATA').read_bytes()

```


```python
from xv.util import Graze

g = Graze()
list(g)
```




    ['https://drive.google.com/file/d/1Ul5mPePtoPKHZkH5Rm6dWKAO11dG98GN/view?usp=share_link',
     'https://drive.google.com/file/d/1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ/view?usp=share_link',
     'https://arxiv.org/pdf/0704.0001']




```python
from xv import raw_sources

list(raw_sources)
```




    ['titles', 'abstracts']






```python
raw = raw_sources['titles']
list(raw)
```




    ['titles_7.parquet',
     'titles_23.parquet',
     'titles_15.parquet',
     'verifyResults.py',
     'titles_14.parquet',
     'titles_22.parquet',
     'titles_6.parquet',
     'titles_16.parquet',
     'titles_20.parquet',
     'titles_4.parquet',
     'titles_5.parquet',
     'titles_21.parquet',
     'params.txt',
     'titles_17.parquet',
     'exampleEmbed.py',
     'titles_12.parquet',
     'README.md',
     'titles_9.parquet',
     'titles_1.parquet',
     'titles_13.parquet',
     'titles_8.parquet',
     'titles_18.parquet',
     'titles_3.parquet',
     'titles_11.parquet',
     'titles_10.parquet',
     'titles_19.parquet',
     'titles_2.parquet']




```python
print(raw['exampleEmbed.py'].decode())
```

    from InstructorEmbedding import INSTRUCTOR
    
    model = INSTRUCTOR('hkunlp/instructor-xl')
    sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
    instruction = "Represent the Research Paper title for retrieval; Input:"
    embeddings = model.encode([[instruction,sentence]])
    print(embeddings)



```python
from InstructorEmbedding import INSTRUCTOR

model = INSTRUCTOR('hkunlp/instructor-xl')

```

    /Users/thorwhalen/.pyenv/versions/3.10.13/envs/p10/lib/python3.10/site-packages/InstructorEmbedding/instructor.py:7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
      from tqdm.autonotebook import trange


    load INSTRUCTOR_Transformer
    max_seq_length  512



```python
sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
instruction = "Represent the Research Paper title for retrieval; Input:"
embeddings = model.encode([[instruction, sentence]])

```


```python
print(raw['params.txt'].decode())
```

    prompt: Represent the Research Paper title for retrieval; Input:
    type: title
    time string: 20230518-185428
    model: InstructorXL
    version: 2.0



```python
print(raw['exampleEmbed.py'].decode())
```

    from InstructorEmbedding import INSTRUCTOR
    
    model = INSTRUCTOR('hkunlp/instructor-xl')
    sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
    instruction = "Represent the Research Paper title for retrieval; Input:"
    embeddings = model.encode([[instruction,sentence]])
    print(embeddings)


## The imbedding data store

And now, we'll transform the raw store to get a convenient interface to the actual data of interest.


```python
b = raw['titles_1.parquet']
len(b)
```




    313383694




```python
from xv import sources  # raw store + wrapper. See parquet_codec code.

titles_tables = sources['titles']
abstract_tables = sources['abstracts']
print(list(titles_tables))
```

    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]



```python
titles_df = titles_tables[1]
titles_df
```




<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>title</th>
      <th>embeddings</th>
      <th>doi</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Calculation of prompt diphoton production cros...</td>
      <td>[-0.050620172, 0.041436385, 0.05363288, -0.029...</td>
      <td>0704.0001</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Sparsity-certifying Graph Decompositions</td>
      <td>[0.014515653, 0.023809524, -0.028145121, -0.04...</td>
      <td>0704.0002</td>
    </tr>
    <tr>
      <th>2</th>
      <td>The evolution of the Earth-Moon system based o...</td>
      <td>[-4.766115e-05, 0.017415706, 0.04146007, -0.03...</td>
      <td>0704.0003</td>
    </tr>
    <tr>
      <th>3</th>
      <td>A determinant of Stirling cycle numbers counts...</td>
      <td>[0.027208889, 0.046175897, 0.0010913888, -0.01...</td>
      <td>0704.0004</td>
    </tr>
    <tr>
      <th>4</th>
      <td>From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...</td>
      <td>[0.0113909235, 0.0042667952, -0.0008565594, -0...</td>
      <td>0704.0005</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>99995</th>
      <td>Multiple Time Dimensions</td>
      <td>[0.02682626, -0.0015173098, -0.0019915192, -0....</td>
      <td>0812.3869</td>
    </tr>
    <tr>
      <th>99996</th>
      <td>Depth Zero Representations of Nonlinear Covers...</td>
      <td>[-0.02740943, 0.011689809, -0.0105154915, -0.0...</td>
      <td>0812.3870</td>
    </tr>
    <tr>
      <th>99997</th>
      <td>Decting Errors in Reversible Circuits With Inv...</td>
      <td>[0.0072460608, 0.0028085636, -0.015064359, -0....</td>
      <td>0812.3871</td>
    </tr>
    <tr>
      <th>99998</th>
      <td>Unveiling the birth and evolution of the HII r...</td>
      <td>[0.009408689, -0.0047120117, 0.0021392817, -0....</td>
      <td>0812.3872</td>
    </tr>
    <tr>
      <th>99999</th>
      <td>The K-Receiver Broadcast Channel with Confiden...</td>
      <td>[-0.0026305509, -0.006502139, 0.013400236, -0....</td>
      <td>0812.3873</td>
    </tr>
  </tbody>
</table>
<p>100000 rows × 3 columns</p>
</div>




```python
abstract_df = abstract_tables[1]
abstract_df
```




<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>abstract</th>
      <th>embeddings</th>
      <th>doi</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>A fully differential calculation in perturba...</td>
      <td>[-0.035151865, 0.022851437, 0.025942933, -0.02...</td>
      <td>0704.0001</td>
    </tr>
    <tr>
      <th>1</th>
      <td>We describe a new algorithm, the $(k,\ell)$-...</td>
      <td>[0.035485767, -0.0015772493, -0.0016615744, -0...</td>
      <td>0704.0002</td>
    </tr>
    <tr>
      <th>2</th>
      <td>The evolution of Earth-Moon system is descri...</td>
      <td>[-0.014510429, 0.010210799, 0.049661566, -0.01...</td>
      <td>0704.0003</td>
    </tr>
    <tr>
      <th>3</th>
      <td>We show that a determinant of Stirling cycle...</td>
      <td>[0.029191103, 0.047992915, -0.0061754594, -0.0...</td>
      <td>0704.0004</td>
    </tr>
    <tr>
      <th>4</th>
      <td>In this paper we show how to compute the $\L...</td>
      <td>[-0.015174898, 0.01603887, 0.04062805, -0.0246...</td>
      <td>0704.0005</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>99995</th>
      <td>The possibility of physics in multiple time ...</td>
      <td>[0.016121766, 0.011126887, 0.018650021, -0.044...</td>
      <td>0812.3869</td>
    </tr>
    <tr>
      <th>99996</th>
      <td>We generalize the methods of Moy-Prasad, in ...</td>
      <td>[-7.164341e-05, -0.007114291, -0.008979887, -0...</td>
      <td>0812.3870</td>
    </tr>
    <tr>
      <th>99997</th>
      <td>Reversible logic is experience renewed inter...</td>
      <td>[0.03194286, -0.00771745, 0.015977046, -0.0474...</td>
      <td>0812.3871</td>
    </tr>
    <tr>
      <th>99998</th>
      <td>Based on a multiwavelength study, the ISM ar...</td>
      <td>[-0.012340169, -0.021712925, 0.00806009, -0.00...</td>
      <td>0812.3872</td>
    </tr>
    <tr>
      <th>99999</th>
      <td>The secrecy capacity region for the K-receiv...</td>
      <td>[0.0012416588, 0.0006933478, -0.0057888636, -0...</td>
      <td>0812.3873</td>
    </tr>
  </tbody>
</table>
<p>100000 rows × 3 columns</p>
</div>




```python
abstract_df['doi'].values
```




    array(['0704.0001', '0704.0002', '0704.0003', ..., '0812.3871',
           '0812.3872', '0812.3873'], dtype=object)




```python
from xv import arxiv_url

doi = abstract_df['doi'].values[0]
arxiv_url(doi)
```




    'https://arxiv.org/abs/0704.0001'




```python
from xv.data_access import resource_descriptions
resource_descriptions
```




    {'abs': 'Main page of article. Contains links to all other relevant information.',
     'pdf': 'Direct link to article pdf',
     'format': 'Page giving access to other formats',
     'src': 'Access to the original source files submitted by the authors.',
     'cits': 'Tracks citations of the article across various platforms and databases.',
     'html': 'Link to the ar5iv html page for the article.'}




```python
doi = '0704.0001'

for resource, description in resource_descriptions.items():
    print(f"{resource}: {description}")
    print(f"Example: {arxiv_url(doi, resource)}")
    print("")

```

    abs: Main page of article. Contains links to all other relevant information.
    Example: https://arxiv.org/abs/0704.0001
    
    pdf: Direct link to article pdf
    Example: https://arxiv.org/pdf/0704.0001
    
    format: Page giving access to other formats
    Example: https://arxiv.org/format/0704.0001
    
    src: Access to the original source files submitted by the authors.
    Example: https://arxiv.org/src/0704.0001
    
    cits: Tracks citations of the article across various platforms and databases.
    Example: https://arxiv.org/cits/0704.0001
    
    html: Link to the ar5iv html page for the article.
    Example: https://ar5iv.labs.arxiv.org/html/0704.0001
    



```python
arxiv_url(doi, 'pdf')
```


    'https://arxiv.org/pdf/0704.0001'



```python
pdf_bytes = g[arxiv_url(doi, 'pdf')]
```

    The contents  (~1.647MB) of https://arxiv.org/pdf/0704.0001 are being downloaded...



```python
abstract_df.embeddings.values[0].shape
```




    (768,)


            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/thorwhalen/xv",
    "name": "xv",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "",
    "author": "Thor Whalen",
    "author_email": "",
    "download_url": "https://files.pythonhosted.org/packages/95/58/bc83623a8c76809e6ced3649dad4ae056dc8041d88d6e9768f93200ad465/xv-0.0.6.tar.gz",
    "platform": "any",
    "description": "# xv\n\nAccess to arxiv data\n\nTo install:\t```pip install xv```\n\n\n# Examples\n\n\n```python\nfrom xv import *\n```\n\n## Raw store\n\nAt the point of writing this, my attempts enable `graze` to automatically confirm download in the googledrive downloads (which, when downloading too-big files, will tell the user it can't scan the file and ask the user to confirm the download).\n\nTherefore, the following files need to be downloaded manually:\n* **titles**: https://drive.google.com/file/d/1Ul5mPePtoPKHZkH5Rm6dWKAO11dG98GN/view?usp=share_link\n* **abstracts**: https://drive.google.com/file/d/1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ/view?usp=share_link\n\n(If those urls don't work, perhaps they were updated: See here: https://alex.macrocosm.so/download .)\n\nYou can then copy them over to the place graze will look for by doing:\n\n```python\nfrom pathlib import Path\nfrom xv.util import Graze\nfrom xv.data_access import urls\n\n\ng[urls['titles']] = Path('TITLES_DATA_LOCAL_FILEPATH').read_bytes()\ng[urls['abstracts']] = Path('ABSTRACTS_DATA_LOCAL_FILEPATH').read_bytes()\n```\n\n\n```python\n# from imbed.mdat.arxiv import urls\n# from pathlib import Path\n\n# g[urls['titles']] = Path('FILE_WHERE_YOU_DOWNLOADED_TITLES_DATA').read_bytes()\n# g[urls['abstracts']] = Path('FILE_WHERE_YOU_DOWNLOADED_TITLES_DATA').read_bytes()\n\n```\n\n\n```python\nfrom xv.util import Graze\n\ng = Graze()\nlist(g)\n```\n\n\n\n\n    ['https://drive.google.com/file/d/1Ul5mPePtoPKHZkH5Rm6dWKAO11dG98GN/view?usp=share_link',\n     'https://drive.google.com/file/d/1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ/view?usp=share_link',\n     'https://arxiv.org/pdf/0704.0001']\n\n\n\n\n```python\nfrom xv import raw_sources\n\nlist(raw_sources)\n```\n\n\n\n\n    ['titles', 'abstracts']\n\n\n\n\n\n\n```python\nraw = raw_sources['titles']\nlist(raw)\n```\n\n\n\n\n    ['titles_7.parquet',\n     'titles_23.parquet',\n     'titles_15.parquet',\n     'verifyResults.py',\n     'titles_14.parquet',\n     'titles_22.parquet',\n     'titles_6.parquet',\n     'titles_16.parquet',\n     'titles_20.parquet',\n     'titles_4.parquet',\n     'titles_5.parquet',\n     'titles_21.parquet',\n     'params.txt',\n     'titles_17.parquet',\n     'exampleEmbed.py',\n     'titles_12.parquet',\n     'README.md',\n     'titles_9.parquet',\n     'titles_1.parquet',\n     'titles_13.parquet',\n     'titles_8.parquet',\n     'titles_18.parquet',\n     'titles_3.parquet',\n     'titles_11.parquet',\n     'titles_10.parquet',\n     'titles_19.parquet',\n     'titles_2.parquet']\n\n\n\n\n```python\nprint(raw['exampleEmbed.py'].decode())\n```\n\n    from InstructorEmbedding import INSTRUCTOR\n    \n    model = INSTRUCTOR('hkunlp/instructor-xl')\n    sentence = \"3D ActionSLAM: wearable person tracking in multi-floor environments\"\n    instruction = \"Represent the Research Paper title for retrieval; Input:\"\n    embeddings = model.encode([[instruction,sentence]])\n    print(embeddings)\n\n\n\n```python\nfrom InstructorEmbedding import INSTRUCTOR\n\nmodel = INSTRUCTOR('hkunlp/instructor-xl')\n\n```\n\n    /Users/thorwhalen/.pyenv/versions/3.10.13/envs/p10/lib/python3.10/site-packages/InstructorEmbedding/instructor.py:7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n      from tqdm.autonotebook import trange\n\n\n    load INSTRUCTOR_Transformer\n    max_seq_length  512\n\n\n\n```python\nsentence = \"3D ActionSLAM: wearable person tracking in multi-floor environments\"\ninstruction = \"Represent the Research Paper title for retrieval; Input:\"\nembeddings = model.encode([[instruction, sentence]])\n\n```\n\n\n```python\nprint(raw['params.txt'].decode())\n```\n\n    prompt: Represent the Research Paper title for retrieval; Input:\n    type: title\n    time string: 20230518-185428\n    model: InstructorXL\n    version: 2.0\n\n\n\n```python\nprint(raw['exampleEmbed.py'].decode())\n```\n\n    from InstructorEmbedding import INSTRUCTOR\n    \n    model = INSTRUCTOR('hkunlp/instructor-xl')\n    sentence = \"3D ActionSLAM: wearable person tracking in multi-floor environments\"\n    instruction = \"Represent the Research Paper title for retrieval; Input:\"\n    embeddings = model.encode([[instruction,sentence]])\n    print(embeddings)\n\n\n## The imbedding data store\n\nAnd now, we'll transform the raw store to get a convenient interface to the actual data of interest.\n\n\n```python\nb = raw['titles_1.parquet']\nlen(b)\n```\n\n\n\n\n    313383694\n\n\n\n\n```python\nfrom xv import sources  # raw store + wrapper. See parquet_codec code.\n\ntitles_tables = sources['titles']\nabstract_tables = sources['abstracts']\nprint(list(titles_tables))\n```\n\n    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]\n\n\n\n```python\ntitles_df = titles_tables[1]\ntitles_df\n```\n\n\n\n\n<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>title</th>\n      <th>embeddings</th>\n      <th>doi</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Calculation of prompt diphoton production cros...</td>\n      <td>[-0.050620172, 0.041436385, 0.05363288, -0.029...</td>\n      <td>0704.0001</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Sparsity-certifying Graph Decompositions</td>\n      <td>[0.014515653, 0.023809524, -0.028145121, -0.04...</td>\n      <td>0704.0002</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>The evolution of the Earth-Moon system based o...</td>\n      <td>[-4.766115e-05, 0.017415706, 0.04146007, -0.03...</td>\n      <td>0704.0003</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>A determinant of Stirling cycle numbers counts...</td>\n      <td>[0.027208889, 0.046175897, 0.0010913888, -0.01...</td>\n      <td>0704.0004</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n      <td>[0.0113909235, 0.0042667952, -0.0008565594, -0...</td>\n      <td>0704.0005</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>99995</th>\n      <td>Multiple Time Dimensions</td>\n      <td>[0.02682626, -0.0015173098, -0.0019915192, -0....</td>\n      <td>0812.3869</td>\n    </tr>\n    <tr>\n      <th>99996</th>\n      <td>Depth Zero Representations of Nonlinear Covers...</td>\n      <td>[-0.02740943, 0.011689809, -0.0105154915, -0.0...</td>\n      <td>0812.3870</td>\n    </tr>\n    <tr>\n      <th>99997</th>\n      <td>Decting Errors in Reversible Circuits With Inv...</td>\n      <td>[0.0072460608, 0.0028085636, -0.015064359, -0....</td>\n      <td>0812.3871</td>\n    </tr>\n    <tr>\n      <th>99998</th>\n      <td>Unveiling the birth and evolution of the HII r...</td>\n      <td>[0.009408689, -0.0047120117, 0.0021392817, -0....</td>\n      <td>0812.3872</td>\n    </tr>\n    <tr>\n      <th>99999</th>\n      <td>The K-Receiver Broadcast Channel with Confiden...</td>\n      <td>[-0.0026305509, -0.006502139, 0.013400236, -0....</td>\n      <td>0812.3873</td>\n    </tr>\n  </tbody>\n</table>\n<p>100000 rows \u00d7 3 columns</p>\n</div>\n\n\n\n\n```python\nabstract_df = abstract_tables[1]\nabstract_df\n```\n\n\n\n\n<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>abstract</th>\n      <th>embeddings</th>\n      <th>doi</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>A fully differential calculation in perturba...</td>\n      <td>[-0.035151865, 0.022851437, 0.025942933, -0.02...</td>\n      <td>0704.0001</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n      <td>[0.035485767, -0.0015772493, -0.0016615744, -0...</td>\n      <td>0704.0002</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>The evolution of Earth-Moon system is descri...</td>\n      <td>[-0.014510429, 0.010210799, 0.049661566, -0.01...</td>\n      <td>0704.0003</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>We show that a determinant of Stirling cycle...</td>\n      <td>[0.029191103, 0.047992915, -0.0061754594, -0.0...</td>\n      <td>0704.0004</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>In this paper we show how to compute the $\\L...</td>\n      <td>[-0.015174898, 0.01603887, 0.04062805, -0.0246...</td>\n      <td>0704.0005</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>99995</th>\n      <td>The possibility of physics in multiple time ...</td>\n      <td>[0.016121766, 0.011126887, 0.018650021, -0.044...</td>\n      <td>0812.3869</td>\n    </tr>\n    <tr>\n      <th>99996</th>\n      <td>We generalize the methods of Moy-Prasad, in ...</td>\n      <td>[-7.164341e-05, -0.007114291, -0.008979887, -0...</td>\n      <td>0812.3870</td>\n    </tr>\n    <tr>\n      <th>99997</th>\n      <td>Reversible logic is experience renewed inter...</td>\n      <td>[0.03194286, -0.00771745, 0.015977046, -0.0474...</td>\n      <td>0812.3871</td>\n    </tr>\n    <tr>\n      <th>99998</th>\n      <td>Based on a multiwavelength study, the ISM ar...</td>\n      <td>[-0.012340169, -0.021712925, 0.00806009, -0.00...</td>\n      <td>0812.3872</td>\n    </tr>\n    <tr>\n      <th>99999</th>\n      <td>The secrecy capacity region for the K-receiv...</td>\n      <td>[0.0012416588, 0.0006933478, -0.0057888636, -0...</td>\n      <td>0812.3873</td>\n    </tr>\n  </tbody>\n</table>\n<p>100000 rows \u00d7 3 columns</p>\n</div>\n\n\n\n\n```python\nabstract_df['doi'].values\n```\n\n\n\n\n    array(['0704.0001', '0704.0002', '0704.0003', ..., '0812.3871',\n           '0812.3872', '0812.3873'], dtype=object)\n\n\n\n\n```python\nfrom xv import arxiv_url\n\ndoi = abstract_df['doi'].values[0]\narxiv_url(doi)\n```\n\n\n\n\n    'https://arxiv.org/abs/0704.0001'\n\n\n\n\n```python\nfrom xv.data_access import resource_descriptions\nresource_descriptions\n```\n\n\n\n\n    {'abs': 'Main page of article. Contains links to all other relevant information.',\n     'pdf': 'Direct link to article pdf',\n     'format': 'Page giving access to other formats',\n     'src': 'Access to the original source files submitted by the authors.',\n     'cits': 'Tracks citations of the article across various platforms and databases.',\n     'html': 'Link to the ar5iv html page for the article.'}\n\n\n\n\n```python\ndoi = '0704.0001'\n\nfor resource, description in resource_descriptions.items():\n    print(f\"{resource}: {description}\")\n    print(f\"Example: {arxiv_url(doi, resource)}\")\n    print(\"\")\n\n```\n\n    abs: Main page of article. Contains links to all other relevant information.\n    Example: https://arxiv.org/abs/0704.0001\n    \n    pdf: Direct link to article pdf\n    Example: https://arxiv.org/pdf/0704.0001\n    \n    format: Page giving access to other formats\n    Example: https://arxiv.org/format/0704.0001\n    \n    src: Access to the original source files submitted by the authors.\n    Example: https://arxiv.org/src/0704.0001\n    \n    cits: Tracks citations of the article across various platforms and databases.\n    Example: https://arxiv.org/cits/0704.0001\n    \n    html: Link to the ar5iv html page for the article.\n    Example: https://ar5iv.labs.arxiv.org/html/0704.0001\n    \n\n\n\n```python\narxiv_url(doi, 'pdf')\n```\n\n\n    'https://arxiv.org/pdf/0704.0001'\n\n\n\n```python\npdf_bytes = g[arxiv_url(doi, 'pdf')]\n```\n\n    The contents  (~1.647MB) of https://arxiv.org/pdf/0704.0001 are being downloaded...\n\n\n\n```python\nabstract_df.embeddings.values[0].shape\n```\n\n\n\n\n    (768,)\n\n",
    "bugtrack_url": null,
    "license": "mit",
    "summary": "Access to arxiv data",
    "version": "0.0.6",
    "project_urls": {
        "Homepage": "https://github.com/thorwhalen/xv"
    },
    "split_keywords": [],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "8dc5a9413ee0a03c2f693d9d8982932ed77f42dd226ecf6c765cec0084e3e4e8",
                "md5": "d41e54668f382a05b626095f48e67508",
                "sha256": "b7a81b98f534651bba33a80991899ff4d7b92cb518d58dc17c5efd8a81b1d7a3"
            },
            "downloads": -1,
            "filename": "xv-0.0.6-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "d41e54668f382a05b626095f48e67508",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 8178,
            "upload_time": "2024-01-23T12:50:46",
            "upload_time_iso_8601": "2024-01-23T12:50:46.779990Z",
            "url": "https://files.pythonhosted.org/packages/8d/c5/a9413ee0a03c2f693d9d8982932ed77f42dd226ecf6c765cec0084e3e4e8/xv-0.0.6-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "9558bc83623a8c76809e6ced3649dad4ae056dc8041d88d6e9768f93200ad465",
                "md5": "760e1704a9f6bd6f324b54e4f444f00a",
                "sha256": "8c639074c8f92e5cb4da4c715ac3e58e5a51071128f1a88a243cb4fd3d77a735"
            },
            "downloads": -1,
            "filename": "xv-0.0.6.tar.gz",
            "has_sig": false,
            "md5_digest": "760e1704a9f6bd6f324b54e4f444f00a",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 7423,
            "upload_time": "2024-01-23T12:50:48",
            "upload_time_iso_8601": "2024-01-23T12:50:48.440775Z",
            "url": "https://files.pythonhosted.org/packages/95/58/bc83623a8c76809e6ced3649dad4ae056dc8041d88d6e9768f93200ad465/xv-0.0.6.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-01-23 12:50:48",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "thorwhalen",
    "github_project": "xv",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": true,
    "lcname": "xv"
}
        
Elapsed time: 0.17844s