deepl-scraper-pp2


Name: deepl-scraper-pp2
Version: 0.1.0a1
Home page: https://github.com/ffreemt/deepl-scraper-pyppeteer2
Summary: scrape deepl via pyppeteer2 with para info
Upload time: 2023-01-27 06:38:44
Maintainer: (not specified)
Docs URL: none
Author: freemt
Requires Python: >=3.8.3,<4.0.0
License: MIT
Keywords: (none)
Requirements: no requirements were recorded
Travis-CI: no Travis
Coveralls test coverage: no coveralls
            # deepl-scraper-pp2
[![tests](https://github.com/ffreemt/deepl-scraper-pyppeteer2/actions/workflows/routine-tests.yml/badge.svg)](https://github.com/ffreemt/deepl-scraper-pyppeteer2/actions)[![python](https://img.shields.io/static/v1?label=python+&message=3.8.3%2B&color=blue)](https://img.shields.io/static/v1?label=python+&message=3.8.3%2B&color=blue)[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)[![PyPI version](https://badge.fury.io/py/deepl-scraper-pp2.svg)](https://badge.fury.io/py/deepl-scraper-pp2)

scrape deepl using pyppeteer2 with para info, cross platform (Windows/MacOS/Linux)

## Intro
`deepl-scraper-pp2` is more or less the same as `deepl-scraper-pp`. `deepl-scraper-pp2`, however, preserves newlines in the translated text. Hence, it makes life easier when processing large chunks of text. `deepl-scraper-pp2` is originally intended for `deepl-tr-webui` but can be used elsewhere as well.

## Installation

```bash
pip install deepl-scraper-pp2
# pip install -U deepl-scraper-pp2  # upgrade to the latest version
```
or
```bash
poetry add deepl-scraper-pp2
# poetry add deepl-scraper-pp2@latest  # upgrade to the latest version
```

or clone the repo (``git clone https://github.com/ffreemt/deepl-scraper-pyppeteer2.git``) and install from it.

## Usage

### in `python`

```python
import asyncio
from deepl_scraper_pp2.deepl_tr import deepl_tr

print(asyncio.run(deepl_tr("test 1 \n\n test 2")))

# '测试1 \n\n  测试2'
```

### Or start a local server
```bash
uvicorn deepl_scraper_pp2.deepl_server:app

# or
python -m deepl_scraper_pp2.run_uvicorn
```

#### and consume the REST API
```python
import requests

res = requests.post(
  "http://127.0.0.1:8000/text",
  json={
    "text": "test 1\n\ntest2",
    "to_lang": "zh"},
    headers={"accept": "application/json", "Content-Type": "application/json"}
)
print(res.json())
# {'q': {'text': 'test 1\n\ntest2', 'from_lang': None, 'to_lang': 'zh', 'description': None}, 'result': '测试1\n\n测试2'}
```

Consult [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) for details.

## Disclaimer

This PyPI package is beta and will likely remain beta -- use it at your own peril.

<!---

In [367]: doc0("div.lmt__textarea.lmt__textarea_dummydiv").text()
Out[367]: 'test you are me new lines 试探你是我 新行'

# doc0("div#target-dummydiv").text()
In [371]: doc0("#target-dummydiv").text()
Out[371]: '试探你是我 新行'

In [394]: doc0("#target-dummydiv").html()
Out[394]: '试探你是我\n新行\n\n'

# doc0("button.lmt__translations_as_text__text_btn").text()
In [369]: doc0(".lmt__translations_as_text__text_btn").text()
Out[369]: '试探你是我 新行'
In [369]: doc0(".lmt__translations_as_text__text_btn").html()


In [388]: re.findall(r"<button class=\"lmt__translations_as_text__text_btn[\s\S]*?>[\s\S]*?<\/button>", text0)
Out[388]: ['<button class="lmt__translations_as_text__text_btn">试探你是我\n新行</button>']

re.findall(r"<div id=\"target-dummydiv[\s\S]*?>[\s\S]*?<\/div>", text0)
['<div id="target-dummydiv" class="lmt__textarea lmt__textarea_dummydiv">试探你是我\n新行\n\n</div>']


extract format:  no need of html.escape

textarea = await page.wait_for_selector('//textarea', timeout=1 * 1000)

re.findall(r'lmt__translations_as_text__text_btn">([\s\S]+?)<\/button>', doc.html())
  re.findall(r'lmt__translations_as_text__text_btn">([\s\S]+?)<\/button>', await page.content())

===
from get_pwbrowser import get_pwbrowser

browser = await get_pwbrowser(headless=False)
context = await browser.new_context()
page = await context.new_page()

url = 'https://translate.google.cn/?sl=auto&tl=zh-CN&op=translate'
url = 'https://www.deepl.com/translator'
await page.goto(url)  # 10 s

textarea = await page.wait_for_selector('//textarea', timeout=1 * 1000)

sel_btn = "button.lmt__clear_text_button"

with CodeTimer():
    for text in [' test 1 ' * 10, ' test 2 ' * 10, ' test 3' *10]:
        # await textarea.fill('a')
        # await textarea.fill('a')

        # await page.evaluate(f'() => document.querySelectorAll("{sel_btn}")')

        _ = await is_visible(sel_btn, page)
        if _:
            clear_button = await page.wait_for_selector(f"{sel_btn}", timeout=1000)
            await clear_button.click()
        await textarea.fill(text)

        idx = 0
        flag = False
        ulimit = 1 / 0.1
        while not flag and idx < ulimit:
            idx += 1
            content = await page.content()
            doc = pq(content)

            flag = re.findall(r'lmt__translations_as_text__text_btn', doc.html())
            logger.debug(flag)
            if flag:
                break
            await asyncio.sleep(0.1)
        logger.info("loop: %s", idx)

        res = re.findall(r'lmt__translations_as_text__text_btn">([\s\S]+?)<\/button>', await page.content())
        print(res)
        # does not work for long text!

# https://stackoverflow.com/questions/47712679/how-can-i-check-that-an-element-is-visible-with-puppeteer-and-pure-javascript
selector = 'button.lmt__clear_text_button'

let elem = document.querySelector(selector);
const style = getComputedStyle(elem);
const rect1 = elem.getBoundingClientRect();
style.visibility !== 'hidden' && !!(rect1.bottom || rect1.top || rect1.height || rect1.width);

# ==
const element_is_visible = await page.evaluate(() => {
  const element = document.querySelector('button.lmt__clear_text_button');
  const style = getComputedStyle(element);
  const rect = element.getBoundingClientRect();

  return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);
});

await textarea.fill(text)

str_ = f"""const element = document.querySelector('{sel_btn}');
  const style = getComputedStyle(element);
  const rect = element.getBoundingClientRect();
  return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);"""
# visibility
visibility = await page.evaluate(f'() => {{{str_}}}')
print('visibility', visibility)

if visibility:
    cbtn= await page.wait_for_selector(f"{sel_btn}", timeout=1000)
    await cbtn.click(timeout=1000, no_wait_after=True)

async def is_visible(selector, page):
    _ = f"""const element = document.querySelector('{selector}'); if (element === null) return false;
  const style = getComputedStyle(element);
  const rect = element.getBoundingClientRect();
  return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);"""
    return await page.evaluate(f'() => {{{_}}}')

async def console_run(js, page):
    _ = f'() => {js}'
    print(_)
    return await page.evaluate(_)

--->
            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/ffreemt/deepl-scraper-pyppeteer2",
    "name": "deepl-scraper-pp2",
    "maintainer": "",
    "docs_url": null,
    "requires_python": ">=3.8.3,<4.0.0",
    "maintainer_email": "",
    "keywords": "",
    "author": "freemt",
    "author_email": "",
    "download_url": "https://files.pythonhosted.org/packages/a6/06/140076a4226e7f75a4db45310a4ca0aa6e800d5957315dc49c5465fbba1d/deepl-scraper-pp2-0.1.0a1.tar.gz",
    "platform": null,
    "description": "# deepl-scraper-pp2\n[![tests](https://github.com/ffreemt/deepl-scraper-pyppeteer2/actions/workflows/routine-tests.yml/badge.svg)][![python](https://img.shields.io/static/v1?label=python+&message=3.8.3%2B&color=blue)](https://img.shields.io/static/v1?label=python+&message=3.8.3%2B&color=blue)[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)[![PyPI version](https://badge.fury.io/py/deepl-scraper-pp2.svg)](https://badge.fury.io/py/deepl-scraper-pp2)\n\nscrape deepl using pyppeteer2 with para info, cross platform (Windows/MacOS/Linux)\n\n## Intro\n`deepl-scraper-pp2` is more or less deepl-scraper-pp. `deepl-scraper-pp2` however preserves newlines in the translated text. Hence, it will make life easier when trying to process large chunks of text. `deepl-scraper-pp2` is originally intended for `deepl-tr-webui` but can be used elsewhere as well.\n\n## Installation\n\n```bash\npip install deepl-scraper-pp2\n# pip install deepl-scraper-pp2  # upgrade to the latest version\n```\nor\n```bash\npoetry add deepl-scraper-pp2\n# poetry add deepl-scraper-pp2@latest  # upgrade to the latest version\n```\n\nor clone the repo (``git clone https://github.com/ffreemt/deepl-scraper-pyppeteer2.git``) and install from it.\n\n## Usage\n\n### in `python`\n\n```python\nimport asyncio\nfrom deepl_scraper_pp2.deepl_tr import deepl_tr\n\nprint(asyncio.run(deepl_tr(\"test 1 \\n\\n test 2\"))\n\n# '\u6d4b\u8bd51 \\n\\n  \u6d4b\u8bd52'\n```\n\n### Or start a local server\n```bash\nuvicorn deepl_scraper_pp2.deepl_server:app\n\n# or\npython -m deepl_scraper_pp2.run_uvicorn\n```\n\n#### and consume the REST API\n```python\nres = requests.post(\n  \"http://127.0.0.1:8000/text\",\n  json={\n    \"text\": \"test 1\\n\\ntest2\",\n    \"to_lang\": \"zh\"},\n    headers={\"accept\": \"application/json\", 
\"Content-Type\": \"application/json\"}\n)\nprint(res.json())\n# {'q': {'text': 'test 1\\n\\ntest2', 'from_lang': None, 'to_lang': 'zh', 'description': None}, 'result': '\u6d4b\u8bd51\\n\\n\u6d4b\u8bd52'}\n```\n\nConsult [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) for details.\n\n## Disclaimer\n\nThe pypi is beta and will likely remain beta -- use it at your own peril.\n\n<!---\n\nIn [367]: doc0(\"div.lmt__textarea.lmt__textarea_dummydiv\").text()\nOut[367]: 'test you are me new lines \u8bd5\u63a2\u4f60\u662f\u6211 \u65b0\u884c'\n\n# doc0(\"div#target-dummydiv\").text()\nIn [371]: doc0(\"#target-dummydiv\").text()\nOut[371]: '\u8bd5\u63a2\u4f60\u662f\u6211 \u65b0\u884c'\n\nIn [394]: doc0(\"#target-dummydiv\").html()\nOut[394]: '\u8bd5\u63a2\u4f60\u662f\u6211\\n\u65b0\u884c\\n\\n'\n\n# doc0(\"button.lmt__translations_as_text__text_btn\").text()\nIn [369]: doc0(\".lmt__translations_as_text__text_btn\").text()\nOut[369]: '\u8bd5\u63a2\u4f60\u662f\u6211 \u65b0\u884c'\nIn [369]: doc0(\".lmt__translations_as_text__text_btn\").html()\n\n\nIn [388]: re.findall(r\"<button class=\\\"lmt__translations_as_text__text_btn[\\s\\S]*?>[\\s\\S]*?<\\/button>\", text0)\nOut[388]: ['<button class=\"lmt__translations_as_text__text_btn\">\u8bd5\u63a2\u4f60\u662f\u6211\\n\u65b0\u884c</button>']\n\nre.findall(r\"<div id=\\\"target-dummydiv[\\s\\S]*?>[\\s\\S]*?<\\/div>\", text0)\n['<div id=\"target-dummydiv\" class=\"lmt__textarea lmt__textarea_dummydiv\">\u8bd5\u63a2\u4f60\u662f\u6211\\n\u65b0\u884c\\n\\n</div>']\n\n\nextract format:  no need of html.escape\n\ntextarea = await page.wait_for_selector('//textarea', timeout=1 * 1000)\n\nre.findall(r'lmt__translations_as_text__text_btn\">([\\s\\S]+?)<\\/button>', doc.html())\n  re.findall(r'lmt__translations_as_text__text_btn\">([\\s\\S]+?)<\\/button>', await page.content())\n\n===\nfrom get_pwbrowser import get_pwbrowser\n\nbrowser = await get_pwbrowser(headless=False)\ncontext = await browser.new_context()\npage = await 
context.new_page()\n\nurl = 'https://translate.google.cn/?sl=auto&tl=zh-CN&op=translate'\nurl = 'https://www.deepl.com/translator'\nawait page.goto(url)  # 10 s\n\ntextarea = await page.wait_for_selector('//textarea', timeout=1 * 1000)\n\nsel_btn = \"button.lmt__clear_text_button\"\n\nwith CodeTimer():\n    for text in [' test 1 ' * 10, ' test 2 ' * 10, ' test 3' *10]:\n        # await textarea.fill('a')\n        # await textarea.fill('a')\n\n        # await page.evaluate(f'() => document.querySelectorAll(\"{sel_btn}\")')\n\n        _ = await is_visible(sel_btn, page)\n        if _:\n            clear_button = await page.wait_for_selector(f\"{sel_btn}\", timeout=1000)\n            await clear_button.click()\n        await textarea.fill(text)\n\n        idx = 0\n        flag = False\n        ulimit = 1 / 0.1\n        while not flag and idx < ulimit:\n            idx += 1\n            content = await page.content()\n            doc = pq(content)\n\n            flag = re.findall(r'lmt__translations_as_text__text_btn', doc.html())\n            logger.debug(flag)\n            if flag:\n                break\n            await asyncio.sleep(0.1)\n        logger.info(\"loop: %s\", idx)\n\n        res = re.findall(r'lmt__translations_as_text__text_btn\">([\\s\\S]+?)<\\/button>', await page.content())\n        print(res)\n        # does not work for long text!\n\n# https://stackoverflow.com/questions/47712679/how-can-i-check-that-an-element-is-visible-with-puppeteer-and-pure-javascript\nselector = 'button.lmt__clear_text_button'\n\nlet elem = document.querySelector(selector);\nconst style = getComputedStyle(elem);\nconst rect1 = elem.getBoundingClientRect();\nstyle.visibility !== 'hidden' && !!(rect1.bottom || rect1.top || rect1.height || rect1.width);\n\n# ==\nconst element_is_visible = await page.evaluate(() => {\n  const element = document.querySelector('button.lmt__clear_text_button');\n  const style = getComputedStyle(element);\n  const rect = 
element.getBoundingClientRect();\n\n  return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);\n});\n\nawait textarea.fill(text)\n\nstr_ = f\"\"\"const element = document.querySelector('{sel_btn}');\n  const style = getComputedStyle(element);\n  const rect = element.getBoundingClientRect();\n  return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);\"\"\"\n# visibility\nvisibility = await page.evaluate(f'() => {{{str_}}}')\nprint('visibility', visibility)\n\nif visibility:\n    cbtn= await page.wait_for_selector(f\"{sel_btn}\", timeout=1000)\n    await cbtn.click(timeout=1000, no_wait_after=True)\n\nasync def is_visible(selector, page):\n    _ = f\"\"\"const element = document.querySelector('{selector}'); if (element === null) return false;\n  const style = getComputedStyle(element);\n  const rect = element.getBoundingClientRect();\n  return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);\"\"\"\n    return await page.evaluate(f'() => {{{_}}}')\n\nasync def console_run(js, page):\n    _ = f'() => {js}'\n    print(_)\n    return await page.evaluate(_)\n\n--->",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "scrape deepl via pyppeteer2 with para info",
    "version": "0.1.0a1",
    "split_keywords": [],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "cd26bf10c6f720d3ca3a47b22a9fa24195e361b2350e1d19d33f31690bee2596",
                "md5": "40b6c0a18d5ee2fd28ea746dbca2ad3d",
                "sha256": "87a387723cfa334934dfc696bde8f3f5b747481a92d6d646615cfef53c435260"
            },
            "downloads": -1,
            "filename": "deepl_scraper_pp2-0.1.0a1-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "40b6c0a18d5ee2fd28ea746dbca2ad3d",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.8.3,<4.0.0",
            "size": 11422,
            "upload_time": "2023-01-27T06:38:46",
            "upload_time_iso_8601": "2023-01-27T06:38:46.450275Z",
            "url": "https://files.pythonhosted.org/packages/cd/26/bf10c6f720d3ca3a47b22a9fa24195e361b2350e1d19d33f31690bee2596/deepl_scraper_pp2-0.1.0a1-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "a606140076a4226e7f75a4db45310a4ca0aa6e800d5957315dc49c5465fbba1d",
                "md5": "1b30cdb0977334bf0e05f022d458b30b",
                "sha256": "3dc7fa104465262c9777b0aed3d7a07285da1b02b0e174a1d8cf63a39ae3b4a1"
            },
            "downloads": -1,
            "filename": "deepl-scraper-pp2-0.1.0a1.tar.gz",
            "has_sig": false,
            "md5_digest": "1b30cdb0977334bf0e05f022d458b30b",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.8.3,<4.0.0",
            "size": 10721,
            "upload_time": "2023-01-27T06:38:44",
            "upload_time_iso_8601": "2023-01-27T06:38:44.566995Z",
            "url": "https://files.pythonhosted.org/packages/a6/06/140076a4226e7f75a4db45310a4ca0aa6e800d5957315dc49c5465fbba1d/deepl-scraper-pp2-0.1.0a1.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-01-27 06:38:44",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "github_user": "ffreemt",
    "github_project": "deepl-scraper-pyppeteer2",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": true,
    "requirements": [],
    "lcname": "deepl-scraper-pp2"
}
        
Elapsed time: 0.06459s