# deepl-scraper-pp2
![python](https://img.shields.io/static/v1?label=python+&message=3.8.3%2B&color=blue) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![PyPI version](https://badge.fury.io/py/deepl-scraper-pp2.svg)](https://badge.fury.io/py/deepl-scraper-pp2)
Scrape DeepL using pyppeteer2, preserving paragraph info; cross-platform (Windows/macOS/Linux).
## Intro
`deepl-scraper-pp2` is essentially `deepl-scraper-pp`, except that it preserves newlines in the translated text, which makes it easier to process large chunks of text. `deepl-scraper-pp2` was originally written for `deepl-tr-webui` but can be used elsewhere as well.
## Installation
```bash
pip install deepl-scraper-pp2
# pip install -U deepl-scraper-pp2  # upgrade to the latest version
```
or
```bash
poetry add deepl-scraper-pp2
# poetry add deepl-scraper-pp2@latest # upgrade to the latest version
```
or clone the repo (``git clone https://github.com/ffreemt/deepl-scraper-pyppeteer2.git``) and install from it.
## Usage
### in `python`
```python
import asyncio
from deepl_scraper_pp2.deepl_tr import deepl_tr
print(asyncio.run(deepl_tr("test 1 \n\n test 2")))
# '测试1 \n\n 测试2'
```
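The REST payload shown below carries `from_lang`/`to_lang` fields, so `deepl_tr` presumably accepts a target-language argument as well. A minimal sketch, assuming the keyword is named `to_lang` (verify against the actual signature before relying on it):

```python
import asyncio

from deepl_scraper_pp2.deepl_tr import deepl_tr

# to_lang="de" is an assumed keyword argument mirroring the REST payload below;
# check the signature of deepl_tr before relying on it.
print(asyncio.run(deepl_tr("test 1\n\ntest 2", to_lang="de")))
```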
### Or start a local server
```bash
uvicorn deepl_scraper_pp2.deepl_server:app
# or
python -m deepl_scraper_pp2.run_uvicorn
```
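`run_uvicorn` presumably starts the same app programmatically; a minimal sketch using the standard `uvicorn.run` API, with host and port chosen purely for illustration:

```python
import uvicorn

# Host and port are illustrative defaults; adjust as needed.
uvicorn.run("deepl_scraper_pp2.deepl_server:app", host="127.0.0.1", port=8000)
```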
#### and consume the REST API
```python
import requests

res = requests.post(
    "http://127.0.0.1:8000/text",
    json={
        "text": "test 1\n\ntest2",
        "to_lang": "zh",
    },
    headers={"accept": "application/json", "Content-Type": "application/json"},
)
print(res.json())
# {'q': {'text': 'test 1\n\ntest2', 'from_lang': None, 'to_lang': 'zh', 'description': None}, 'result': '测试1\n\n测试2'}
```
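Because newlines survive the round trip, several paragraphs can be batched into one request and split apart again. A sketch building on the payload and response shape shown above:

```python
import requests

paragraphs = ["first paragraph", "second paragraph"]
res = requests.post(
    "http://127.0.0.1:8000/text",
    json={"text": "\n\n".join(paragraphs), "to_lang": "zh"},
)
# The example response above suggests the translation sits under "result".
translated = res.json()["result"].split("\n\n")
print(translated)
```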
Consult [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) for details.
## Disclaimer
This PyPI package is in beta and will likely remain in beta -- use it at your own peril.
<!---
In [367]: doc0("div.lmt__textarea.lmt__textarea_dummydiv").text()
Out[367]: 'test you are me new lines 试探你是我 新行'
# doc0("div#target-dummydiv").text()
In [371]: doc0("#target-dummydiv").text()
Out[371]: '试探你是我 新行'
In [394]: doc0("#target-dummydiv").html()
Out[394]: '试探你是我\n新行\n\n'
# doc0("button.lmt__translations_as_text__text_btn").text()
In [369]: doc0(".lmt__translations_as_text__text_btn").text()
Out[369]: '试探你是我 新行'
In [369]: doc0(".lmt__translations_as_text__text_btn").html()
In [388]: re.findall(r"<button class=\"lmt__translations_as_text__text_btn[\s\S]*?>[\s\S]*?<\/button>", text0)
Out[388]: ['<button class="lmt__translations_as_text__text_btn">试探你是我\n新行</button>']
re.findall(r"<div id=\"target-dummydiv[\s\S]*?>[\s\S]*?<\/div>", text0)
['<div id="target-dummydiv" class="lmt__textarea lmt__textarea_dummydiv">试探你是我\n新行\n\n</div>']
extract format: no need of html.escape
textarea = await page.wait_for_selector('//textarea', timeout=1 * 1000)
re.findall(r'lmt__translations_as_text__text_btn">([\s\S]+?)<\/button>', doc.html())
re.findall(r'lmt__translations_as_text__text_btn">([\s\S]+?)<\/button>', await page.content())
===
from get_pwbrowser import get_pwbrowser
browser = await get_pwbrowser(headless=False)
context = await browser.new_context()
page = await context.new_page()
url = 'https://translate.google.cn/?sl=auto&tl=zh-CN&op=translate'
url = 'https://www.deepl.com/translator'
await page.goto(url) # 10 s
textarea = await page.wait_for_selector('//textarea', timeout=1 * 1000)
sel_btn = "button.lmt__clear_text_button"
with CodeTimer():
    for text in [' test 1 ' * 10, ' test 2 ' * 10, ' test 3' * 10]:
        # await textarea.fill('a')
        # await textarea.fill('a')
        # await page.evaluate(f'() => document.querySelectorAll("{sel_btn}")')

        _ = await is_visible(sel_btn, page)
        if _:
            clear_button = await page.wait_for_selector(f"{sel_btn}", timeout=1000)
            await clear_button.click()
        await textarea.fill(text)

        idx = 0
        flag = False
        ulimit = 1 / 0.1
        while not flag and idx < ulimit:
            idx += 1
            content = await page.content()
            doc = pq(content)

            flag = re.findall(r'lmt__translations_as_text__text_btn', doc.html())
            logger.debug(flag)
            if flag:
                break
            await asyncio.sleep(0.1)
        logger.info("loop: %s", idx)

        res = re.findall(r'lmt__translations_as_text__text_btn">([\s\S]+?)<\/button>', await page.content())
        print(res)
        # does not work for long text!
# https://stackoverflow.com/questions/47712679/how-can-i-check-that-an-element-is-visible-with-puppeteer-and-pure-javascript
selector = 'button.lmt__clear_text_button'
let elem = document.querySelector(selector);
const style = getComputedStyle(elem);
const rect1 = elem.getBoundingClientRect();
style.visibility !== 'hidden' && !!(rect1.bottom || rect1.top || rect1.height || rect1.width);
# ==
const element_is_visible = await page.evaluate(() => {
    const element = document.querySelector('button.lmt__clear_text_button');
    const style = getComputedStyle(element);
    const rect = element.getBoundingClientRect();
    return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);
});
await textarea.fill(text)
str_ = f"""const element = document.querySelector('{sel_btn}');
const style = getComputedStyle(element);
const rect = element.getBoundingClientRect();
return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);"""
# visibility
visibility = await page.evaluate(f'() => {{{str_}}}')
print('visibility', visibility)
if visibility:
cbtn= await page.wait_for_selector(f"{sel_btn}", timeout=1000)
await cbtn.click(timeout=1000, no_wait_after=True)
async def is_visible(selector, page):
    _ = f"""const element = document.querySelector('{selector}'); if (element === null) return false;
    const style = getComputedStyle(element);
    const rect = element.getBoundingClientRect();
    return style.visibility !== 'hidden' && !!(rect.bottom || rect.top || rect.height || rect.width);"""
    return await page.evaluate(f'() => {{{_}}}')


async def console_run(js, page):
    _ = f'() => {js}'
    print(_)
    return await page.evaluate(_)
--->