# html/ocr parser using Cython/lxml/Tesseract/ImageMagick/Pandas
### Tested against Windows 10 / Python 3.11 / Anaconda
### pip install xmlhtml2pandas
### Cython and a C compiler must be installed!
```PY
import os

# Tesseract and ImageMagick must be installed!
# The thread limits are set ahead of the xmlhtml2pandas import; keep this
# ordering (presumably so the limits are in place when the libraries load).
os.environ["OMP_THREAD_LIMIT"] = "1"  # to limit the number of threads (tesseract)
os.environ["MAGICK_THREAD_LIMIT"] = "1"  # to limit the number of threads (ImageMagick)
from xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract
from cythondfprint import add_printer  # fast color printer for pandas df

add_printer(1)

# Parse every saved page and print the result together with its column dtypes.
for file2parse in [
    r"C:\Users\hansc\Downloads\Apostas Futebol _ Sportingbet.mhtml",
    r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online.mhtml",
    r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online2.mhtml",
]:
    # The file is opened in binary mode and the open handle is passed to the parser.
    with open(
        file2parse,
        "rb",
    ) as f:
        df_html = parse_xmlhtml(f, "html", ())
        print(df_html)
        print(df_html.dtypes)
# Preprocess images with ImageMagick, run Tesseract on them, and print each
# result that the call yields.
for picture in preprocess_images_and_run_tesseract(
    density=200,
    resize_percentage=100,
    tesser_cpus=1,
    image_magick_cpus=1,
    path_in=r"C:\Users\hansc\Desktop\testimg",  # for folders
    path_out=r"C:\Users\hansc\Desktop\testimg_outfiles",  # for folders
    # Alternative: a single input file, with a folder as output.
    # path_in=r"C:\Users\hansc\Downloads\apicture.png",# single file
    # path_out=r"C:\Users\hansc\Downloads\afolderforapicture", # single file - folder as output
    magick_options="""-colorspace LinearGray -normalize -auto-level -alpha deactivate -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%""",
    magick_path=r"C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe",
    tesseractpath=r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    tessdata_dir=r"C:\Program Files\Tesseract-OCR\tessdata",
    tesser_options_str="-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6",
    debug=False,
    subprocess_kwargs_tesser=None,
    subprocess_kwargs_magick=None,
    include_screenshots=True,
):
    print(picture)
# on android
import os
import subprocess

# Thread limits are set ahead of the xmlhtml2pandas import; keep this ordering.
# (The original example assigned OMP_THREAD_LIMIT twice; once is enough.)
os.environ["OMP_THREAD_LIMIT"] = "1"
os.environ["MAGICK_THREAD_LIMIT"] = "1"
os.environ["KMP_ALL_THREADS"] = "1"
os.environ["KMP_TEAMS_THREAD_LIMIT"] = "1"
os.environ["KMP_DEVICE_THREAD_LIMIT"] = "1"
from xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract

# Take a screenshot; shell=True is required because the command relies on
# the shell's `>` redirection to write the PNG file.
subprocess.run("screencap -p > /sdcard/shot.png", shell=True)

# Run the OCR pipeline on the screenshot and print each result that is yielded.
for picture in preprocess_images_and_run_tesseract(
    density=200,
    resize_percentage=100,
    tesser_cpus=1,
    image_magick_cpus=1,
    path_in=r"/sdcard/shot.png",
    path_out=r"/sdcard/Downloadsout",
    magick_options="""-colorspace LinearGray -normalize -auto-level -alpha deactivate -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%""",
    magick_path=r"/data/data/com.termux/files/usr/bin/magick",
    tesseractpath=r"/data/data/com.termux/files/usr/bin/tesseract",
    tessdata_dir=r"/data/data/com.termux/files/usr/share/tessdata_fast",
    tesser_options_str="-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6",
    debug=False,
    subprocess_kwargs_tesser=None,
    subprocess_kwargs_magick=None,
    include_screenshots=False,
):
    print(picture)
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/xmlhtml2pandas",
"name": "xmlhtml2pandas",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": "ocr, html",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/d4/da/93ae71fd08abc74d2b85bc7c4b48e99a0baa673628a1207cecda9afd3836/xmlhtml2pandas-0.14.tar.gz",
"platform": null,
"description": "\r\n# html/ocr parser using Cython/lxml/Tesseract/ImageMagick/Pandas\r\n\r\n### Tested against Windows 10 / Python 3.11 / Anaconda / Windows \r\n\r\n### pip install xmlhtml2pandas\r\n\r\n### Cython and a C compiler must be installed!\r\n\r\n```PY\r\nimport os\r\n# Tesseract and ImageMagick must be installed!\r\nos.environ[\"OMP_THREAD_LIMIT\"] = \"1\" # to limit the number of threads (tesseract)\r\nos.environ[\"MAGICK_THREAD_LIMIT\"] = \"1\" # to limit the number of threads (ImageMagick)\r\nfrom xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract\r\nfrom cythondfprint import add_printer # fast color printer for pandas df\r\n\r\nadd_printer(1)\r\nfor file2parse in [\r\n r\"C:\\Users\\hansc\\Downloads\\Apostas Futebol _ Sportingbet.mhtml\",\r\n r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas Online.mhtml\",\r\n r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas Online2.mhtml\",\r\n]:\r\n with open(\r\n file2parse,\r\n \"rb\",\r\n ) as f:\r\n df_html = parse_xmlhtml(f, \"html\", ())\r\n print(df_html)\r\n print(df_html.dtypes)\r\n\r\n\r\nfor picture in preprocess_images_and_run_tesseract(\r\n density=200,\r\n resize_percentage=100,\r\n tesser_cpus=1,\r\n image_magick_cpus=1,\r\n path_in=r\"C:\\Users\\hansc\\Desktop\\testimg\", # for folders\r\n path_out=r\"C:\\Users\\hansc\\Desktop\\testimg_outfiles\", # for folders\r\n # path_in=r\"C:\\Users\\hansc\\Downloads\\apicture.png\",# single file\r\n # path_out=r\"C:\\Users\\hansc\\Downloads\\afolderforapicture\", # single file - folder as output\r\n magick_options=\"\"\"-colorspace LinearGray -normalize -auto-level -alpha deactivate -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%\"\"\",\r\n magick_path=r\"C:\\Program Files\\ImageMagick-7.1.1-Q16-HDRI\\magick.exe\",\r\n tesseractpath=r\"C:\\Program 
Files\\Tesseract-OCR\\tesseract.exe\",\r\n tessdata_dir=r\"C:\\Program Files\\Tesseract-OCR\\tessdata\",\r\n tesser_options_str=\"-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6\",\r\n debug=False,\r\n subprocess_kwargs_tesser=None,\r\n subprocess_kwargs_magick=None,\r\n include_screenshots=True,\r\n):\r\n print(picture)\r\n\r\n# on android \r\n\r\nimport os \r\nimport subprocess \r\nos.environ[\"OMP_THREAD_LIMIT\"] = \"1\" \r\nos.environ[\"MAGICK_THREAD_LIMIT\"] = \"1\" \r\nos.environ[\"KMP_ALL_THREADS\"] = \"1\" \r\nos.environ[\"KMP_TEAMS_THREAD_LIMIT\"] = \"1\" \r\nos.environ[\"OMP_THREAD_LIMIT\"] = \"1\" \r\nos.environ[\"KMP_DEVICE_THREAD_LIMIT\"] = \"1\" \r\n\r\nfrom xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract\r\nsubprocess.run(\"screencap -p > /sdcard/shot.png\",shell=True)\r\nfor picture in preprocess_images_and_run_tesseract(\r\n density=200,\r\n resize_percentage=100,\r\n tesser_cpus=1,\r\n image_magick_cpus=1,\r\n path_in=r\"/sdcard/shot.png\", \r\n path_out=r\"/sdcard/Downloadsout\",\r\n magick_options=\"\"\"-colorspace LinearGray -normalize -auto-level -alpha deactivate -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%\"\"\",\r\n magick_path=r\"/data/data/com.termux/files/usr/bin/magick\",\r\n tesseractpath=r\"/data/data/com.termux/files/usr/bin/tesseract\",\r\n tessdata_dir=r\"/data/data/com.termux/files/usr/share/tessdata_fast\",\r\n tesser_options_str=\"-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6\",\r\n debug=False,\r\n subprocess_kwargs_tesser=None,\r\n subprocess_kwargs_magick=None,\r\n include_screenshots=False,\r\n):\r\n print(picture)\r\n```\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "html/ocr parser using Cython/lxml/Tesseract/ImageMagick/Pandas",
"version": "0.14",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/xmlhtml2pandas"
},
"split_keywords": [
"ocr",
" html"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "40a7671e769ebd4e1dafdb872f618ab3f6f625af8fe6f1470f0f185091ab5ea0",
"md5": "d6d6638837c47146475d88ba5fc6a2a9",
"sha256": "3a60b80fd1551256a4df4a1f4906b11ea13a120c17a69ad79a742296252c10da"
},
"downloads": -1,
"filename": "xmlhtml2pandas-0.14-py3-none-any.whl",
"has_sig": false,
"md5_digest": "d6d6638837c47146475d88ba5fc6a2a9",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 32608,
"upload_time": "2024-07-04T02:58:18",
"upload_time_iso_8601": "2024-07-04T02:58:18.524097Z",
"url": "https://files.pythonhosted.org/packages/40/a7/671e769ebd4e1dafdb872f618ab3f6f625af8fe6f1470f0f185091ab5ea0/xmlhtml2pandas-0.14-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "d4da93ae71fd08abc74d2b85bc7c4b48e99a0baa673628a1207cecda9afd3836",
"md5": "89ce7465fc41973939d6829778109e45",
"sha256": "2a886f61968e593475a0f6c9c9ec25cbf3519fb6102744f35d3f548739771d57"
},
"downloads": -1,
"filename": "xmlhtml2pandas-0.14.tar.gz",
"has_sig": false,
"md5_digest": "89ce7465fc41973939d6829778109e45",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 32217,
"upload_time": "2024-07-04T02:58:20",
"upload_time_iso_8601": "2024-07-04T02:58:20.074827Z",
"url": "https://files.pythonhosted.org/packages/d4/da/93ae71fd08abc74d2b85bc7c4b48e99a0baa673628a1207cecda9afd3836/xmlhtml2pandas-0.14.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-07-04 02:58:20",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "xmlhtml2pandas",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [],
"lcname": "xmlhtml2pandas"
}