# extracts structured data from text using user-defined delimiters (strings or regex)
## Tested against Windows / Python 3.11 / Anaconda
## pip install parifinder
parifinder extracts structured data from text using user-defined delimiters (strings or regex), making it versatile for data processing.
## Advantages
### Flexibility:
The `parse_pairs` function handles a wide range of scenarios, which makes it versatile for parsing text with various delimiters. It accepts either a single pair of delimiters or multiple pairs, and the delimiters can be simple strings or complex regular expressions. This flexibility makes it suitable for many different use cases.
### Scalability:
It can parse multiple pairs of delimiters within a given text, which is especially useful when dealing with documents or data containing nested elements.
### Pure Python:
It uses only Python's standard library.
```python
from parifinder import parse_pairs
from pprint import pprint
text_0 = """[[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]][[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]]"""
s1_0 = "["
s2_0 = "]"
r0 = parse_pairs(string=text_0, s1=s1_0, s2=s2_0, str_regex=False)
print("r0-----------------------------------------------------------------")
pprint(r0, indent=1, width=1)
text_1 = "<body><p>a</p><p>a</p><p>The HTML <code>button</code> tag defines a clickable button.</p><p>x</p><p>The CSS <code>background-color</code> property defines the background color of an element.</p></body></html>"
s1_1 = "<p>"
s2_1 = "</p>"
r1 = parse_pairs(string=text_1, s1=s1_1, s2=s2_1, str_regex=False)
print("r1-----------------------------------------------------------------")
pprint(r1, indent=1, width=1)
text_2 = "[1bla[2bla/2]/1]"
s1_2 = r"\[\d"
s2_2 = r"/\d]"
r2 = parse_pairs(string=text_2, s1=s1_2, s2=s2_2, str_regex=True)
print("r2-----------------------------------------------------------------")
pprint(r2, indent=1, width=1)
text_3 = "[1bla[2bla/2]/1]"
s1_3 = [("[1", "/1]"), ("[2", "/2]")]
s2_3 = None
r3 = parse_pairs(string=text_3, s1=s1_3, s2=s2_3, str_regex=False)
print("r3-----------------------------------------------------------------")
pprint(r3, indent=1, width=1)
text_4 = "[1bla[2bla/2]/1]"
s1_4 = ["[1", "[2"]
s2_4 = ["/1]", "/2]"]
r4 = parse_pairs(string=text_4, s1=s1_4, s2=s2_4, str_regex=False)
print("r4-----------------------------------------------------------------")
pprint(r4, indent=1, width=1)
# r0-----------------------------------------------------------------
# {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23): {'children': [(1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9),
# (17,
# 18,
# 19,
# 20,
# 21,
# 22),
# (12,
# 13,
# 14)],
# 'end': 23,
# 'parents': [],
# 'size': 23,
# 'start': 0,
# 'text': '[[1, '
# '2, '
# '2], '
# '[5], '
# '[2, '
# '3]]'},
# (1, 2, 3, 4, 5, 6, 7, 8, 9): {'children': [],
# 'end': 9,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 8,
# 'start': 1,
# 'text': '[1, '
# '2, '
# '2]'},
# (12, 13, 14): {'children': [],
# 'end': 14,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 2,
# 'start': 12,
# 'text': '[5]'},
# (17, 18, 19, 20, 21, 22): {'children': [],
# 'end': 22,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 5,
# 'start': 17,
# 'text': '[2, '
# '3]'},
# (30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57): {'children': [(31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39),
# (42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48),
# (51,
# 52,
# 53,
# 54,
# 55,
# 56)],
# 'end': 57,
# 'parents': [],
# 'size': 27,
# 'start': 30,
# 'text': '[[4, '
# '4, '
# '4], '
# '[12, '
# '0], '
# '[6, '
# '6]]'},
# (31, 32, 33, 34, 35, 36, 37, 38, 39): {'children': [],
# 'end': 39,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 8,
# 'start': 31,
# 'text': '[4, '
# '4, '
# '4]'},
# (42, 43, 44, 45, 46, 47, 48): {'children': [],
# 'end': 48,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 6,
# 'start': 42,
# 'text': '[12, '
# '0]'},
# (51, 52, 53, 54, 55, 56): {'children': [],
# 'end': 56,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 5,
# 'start': 51,
# 'text': '[6, '
# '6]'},
# (63, 64, 65, 66, 67, 68, 69, 70): {'children': [(64,
# 65,
# 66,
# 67,
# 68,
# 69)],
# 'end': 70,
# 'parents': [],
# 'size': 7,
# 'start': 63,
# 'text': '[[1, '
# '2]]'},
# (64, 65, 66, 67, 68, 69): {'children': [],
# 'end': 69,
# 'parents': [(63,
# 64,
# 65,
# 66,
# 67,
# 68,
# 69,
# 70)],
# 'size': 5,
# 'start': 64,
# 'text': '[1, '
# '2]'},
# (71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94): {'children': [(72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80),
# (88,
# 89,
# 90,
# 91,
# 92,
# 93),
# (83,
# 84,
# 85)],
# 'end': 94,
# 'parents': [],
# 'size': 23,
# 'start': 71,
# 'text': '[[1, '
# '2, '
# '2], '
# '[5], '
# '[2, '
# '3]]'},
# (72, 73, 74, 75, 76, 77, 78, 79, 80): {'children': [],
# 'end': 80,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 8,
# 'start': 72,
# 'text': '[1, '
# '2, '
# '2]'},
# (83, 84, 85): {'children': [],
# 'end': 85,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 2,
# 'start': 83,
# 'text': '[5]'},
# (88, 89, 90, 91, 92, 93): {'children': [],
# 'end': 93,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 5,
# 'start': 88,
# 'text': '[2, '
# '3]'},
# (101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128): {'children': [(102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110),
# (113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119),
# (122,
# 123,
# 124,
# 125,
# 126,
# 127)],
# 'end': 128,
# 'parents': [],
# 'size': 27,
# 'start': 101,
# 'text': '[[4, '
# '4, '
# '4], '
# '[12, '
# '0], '
# '[6, '
# '6]]'},
# (102, 103, 104, 105, 106, 107, 108, 109, 110): {'children': [],
# 'end': 110,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 8,
# 'start': 102,
# 'text': '[4, '
# '4, '
# '4]'},
# (113, 114, 115, 116, 117, 118, 119): {'children': [],
# 'end': 119,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 6,
# 'start': 113,
# 'text': '[12, '
# '0]'},
# (122, 123, 124, 125, 126, 127): {'children': [],
# 'end': 127,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 5,
# 'start': 122,
# 'text': '[6, '
# '6]'},
# (134, 135, 136, 137, 138, 139, 140, 141): {'children': [(135,
# 136,
# 137,
# 138,
# 139,
# 140)],
# 'end': 141,
# 'parents': [],
# 'size': 7,
# 'start': 134,
# 'text': '[[1, '
# '2]]'},
# (135, 136, 137, 138, 139, 140): {'children': [],
# 'end': 140,
# 'parents': [(134,
# 135,
# 136,
# 137,
# 138,
# 139,
# 140,
# 141)],
# 'size': 5,
# 'start': 135,
# 'text': '[1, '
# '2]'}}
# r1-----------------------------------------------------------------
# {(6, 7, 8, 9, 10, 11, 12, 13, 14): {'children': [],
# 'end': 14,
# 'parents': [],
# 'size': 9,
# 'start': 6,
# 'text': '<p>a</p>'},
# (14, 15, 16, 17, 18, 19, 20, 21, 22): {'children': [],
# 'end': 22,
# 'parents': [],
# 'size': 9,
# 'start': 14,
# 'text': '<p>a</p>'},
# (22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89): {'children': [],
# 'end': 89,
# 'parents': [],
# 'size': 68,
# 'start': 22,
# 'text': '<p>The '
# 'HTML '
# '<code>button</code> '
# 'tag '
# 'defines '
# 'a '
# 'clickable '
# 'button.</p>'},
# (89, 90, 91, 92, 93, 94, 95, 96, 97): {'children': [],
# 'end': 97,
# 'parents': [],
# 'size': 9,
# 'start': 89,
# 'text': '<p>x</p>'},
# (97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194): {'children': [],
# 'end': 194,
# 'parents': [],
# 'size': 98,
# 'start': 97,
# 'text': '<p>The '
# 'CSS '
# '<code>background-color</code> '
# 'property '
# 'defines '
# 'the '
# 'background '
# 'color '
# 'of '
# 'an '
# 'element.</p>'}}
# r2-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[1', '/2]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 14,
# 'start': 0,
# 'text': '[1bla[2bla/2]'}},
# ('[2', '/1]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 12,
# 'start': 5,
# 'text': '[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
# r3-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
# r4-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/parifinder",
"name": "parifinder",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "webscraping,html,parsing",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/51/b1/29731142c02192070af86a7c792d3eb7d2f5b90f6188a28df91d84b75a98/parifinder-0.10.tar.gz",
"platform": null,
"description": "\r\n# extracts structured data from text using user-defined delimiters (strings or regex)\r\n\r\n## Tested against Windows / Python 3.11 / Anaconda\r\n\r\n## pip install parifinder\r\n\r\nparifinder extracts structured data from text using user-defined delimiters (strings or regex), making it versatile for data processing.\r\n\r\n## Advantages\r\n\r\n### Flexibility: \r\n\r\nThe function can handle a wide range of scenarios, making it versatile for parsing text with various delimiters. It can handle both single and multiple pairs of delimiters, whether they are simple strings or complex regular expressions. This flexibility makes it suitable for different use cases.\r\n\r\n### Scalability: \r\n\r\nIt can parse multiple pairs of delimiters within a given text, which is especially useful when dealing with documents or data containing nested elements.\r\n\r\n### Pure Python: \r\n\r\nIt uses only Python's standard library\r\n\r\n\r\n```python\r\nfrom parifinder import parse_pairs\r\nfrom pprint import pprint\r\n\r\ntext_0 = \"\"\"[[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]][[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]]\"\"\"\r\ns1_0 = \"[\"\r\ns2_0 = \"]\"\r\nr0 = parse_pairs(string=text_0, s1=s1_0, s2=s2_0, str_regex=False)\r\nprint(\"r0-----------------------------------------------------------------\")\r\npprint(r0, indent=1, width=1)\r\n\r\ntext_1 = \"<body><p>a</p><p>a</p><p>The HTML <code>button</code> tag defines a clickable button.</p><p>x</p><p>The CSS <code>background-color</code> property defines the background color of an element.</p></body></html>\"\r\ns1_1 = \"<p>\"\r\ns2_1 = \"</p>\"\r\nr1 = parse_pairs(string=text_1, s1=s1_1, s2=s2_1, str_regex=False)\r\nprint(\"r1-----------------------------------------------------------------\")\r\npprint(r1, indent=1, width=1)\r\n\r\ntext_2 = \"[1bla[2bla/2]/1]\"\r\ns1_2 = r\"\\[\\d\"\r\ns2_2 = r\"/\\d]\"\r\nr2 = parse_pairs(string=text_2, s1=s1_2, 
s2=s2_2, str_regex=True)\r\nprint(\"r2-----------------------------------------------------------------\")\r\npprint(r2, indent=1, width=1)\r\n\r\ntext_3 = \"[1bla[2bla/2]/1]\"\r\ns1_3 = [(\"[1\", \"/1]\"), (\"[2\", \"/2]\")]\r\ns2_3 = None\r\nr3 = parse_pairs(string=text_3, s1=s1_3, s2=s2_3, str_regex=False)\r\nprint(\"r3-----------------------------------------------------------------\")\r\npprint(r3, indent=1, width=1)\r\n\r\ntext_4 = \"[1bla[2bla/2]/1]\"\r\ns1_4 = [\"[1\", \"[2\"]\r\ns2_4 = [\"/1]\", \"/2]\"]\r\nr4 = parse_pairs(string=text_4, s1=s1_4, s2=s2_4, str_regex=False)\r\nprint(\"r4-----------------------------------------------------------------\")\r\npprint(r4, indent=1, width=1)\r\n\r\n\r\n# r0-----------------------------------------------------------------\r\n# {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23): {'children': [(1,\r\n# 2,\r\n# 3,\r\n# 4,\r\n# 5,\r\n# 6,\r\n# 7,\r\n# 8,\r\n# 9),\r\n# (17,\r\n# 18,\r\n# 19,\r\n# 20,\r\n# 21,\r\n# 22),\r\n# (12,\r\n# 13,\r\n# 14)],\r\n# 'end': 23,\r\n# 'parents': [],\r\n# 'size': 23,\r\n# 'start': 0,\r\n# 'text': '[[1, '\r\n# '2, '\r\n# '2], '\r\n# '[5], '\r\n# '[2, '\r\n# '3]]'},\r\n# (1, 2, 3, 4, 5, 6, 7, 8, 9): {'children': [],\r\n# 'end': 9,\r\n# 'parents': [(0,\r\n# 1,\r\n# 2,\r\n# 3,\r\n# 4,\r\n# 5,\r\n# 6,\r\n# 7,\r\n# 8,\r\n# 9,\r\n# 10,\r\n# 11,\r\n# 12,\r\n# 13,\r\n# 14,\r\n# 15,\r\n# 16,\r\n# 17,\r\n# 18,\r\n# 19,\r\n# 20,\r\n# 21,\r\n# 22,\r\n# 23)],\r\n# 'size': 8,\r\n# 'start': 1,\r\n# 'text': '[1, '\r\n# '2, '\r\n# '2]'},\r\n# (12, 13, 14): {'children': [],\r\n# 'end': 14,\r\n# 'parents': [(0,\r\n# 1,\r\n# 2,\r\n# 3,\r\n# 4,\r\n# 5,\r\n# 6,\r\n# 7,\r\n# 8,\r\n# 9,\r\n# 10,\r\n# 11,\r\n# 12,\r\n# 13,\r\n# 14,\r\n# 15,\r\n# 16,\r\n# 17,\r\n# 18,\r\n# 19,\r\n# 20,\r\n# 21,\r\n# 22,\r\n# 23)],\r\n# 'size': 2,\r\n# 'start': 12,\r\n# 'text': '[5]'},\r\n# (17, 18, 19, 20, 21, 22): {'children': [],\r\n# 'end': 22,\r\n# 'parents': [(0,\r\n# 1,\r\n# 2,\r\n# 
3,\r\n# 4,\r\n# 5,\r\n# 6,\r\n# 7,\r\n# 8,\r\n# 9,\r\n# 10,\r\n# 11,\r\n# 12,\r\n# 13,\r\n# 14,\r\n# 15,\r\n# 16,\r\n# 17,\r\n# 18,\r\n# 19,\r\n# 20,\r\n# 21,\r\n# 22,\r\n# 23)],\r\n# 'size': 5,\r\n# 'start': 17,\r\n# 'text': '[2, '\r\n# '3]'},\r\n# (30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57): {'children': [(31,\r\n# 32,\r\n# 33,\r\n# 34,\r\n# 35,\r\n# 36,\r\n# 37,\r\n# 38,\r\n# 39),\r\n# (42,\r\n# 43,\r\n# 44,\r\n# 45,\r\n# 46,\r\n# 47,\r\n# 48),\r\n# (51,\r\n# 52,\r\n# 53,\r\n# 54,\r\n# 55,\r\n# 56)],\r\n# 'end': 57,\r\n# 'parents': [],\r\n# 'size': 27,\r\n# 'start': 30,\r\n# 'text': '[[4, '\r\n# '4, '\r\n# '4], '\r\n# '[12, '\r\n# '0], '\r\n# '[6, '\r\n# '6]]'},\r\n# (31, 32, 33, 34, 35, 36, 37, 38, 39): {'children': [],\r\n# 'end': 39,\r\n# 'parents': [(30,\r\n# 31,\r\n# 32,\r\n# 33,\r\n# 34,\r\n# 35,\r\n# 36,\r\n# 37,\r\n# 38,\r\n# 39,\r\n# 40,\r\n# 41,\r\n# 42,\r\n# 43,\r\n# 44,\r\n# 45,\r\n# 46,\r\n# 47,\r\n# 48,\r\n# 49,\r\n# 50,\r\n# 51,\r\n# 52,\r\n# 53,\r\n# 54,\r\n# 55,\r\n# 56,\r\n# 57)],\r\n# 'size': 8,\r\n# 'start': 31,\r\n# 'text': '[4, '\r\n# '4, '\r\n# '4]'},\r\n# (42, 43, 44, 45, 46, 47, 48): {'children': [],\r\n# 'end': 48,\r\n# 'parents': [(30,\r\n# 31,\r\n# 32,\r\n# 33,\r\n# 34,\r\n# 35,\r\n# 36,\r\n# 37,\r\n# 38,\r\n# 39,\r\n# 40,\r\n# 41,\r\n# 42,\r\n# 43,\r\n# 44,\r\n# 45,\r\n# 46,\r\n# 47,\r\n# 48,\r\n# 49,\r\n# 50,\r\n# 51,\r\n# 52,\r\n# 53,\r\n# 54,\r\n# 55,\r\n# 56,\r\n# 57)],\r\n# 'size': 6,\r\n# 'start': 42,\r\n# 'text': '[12, '\r\n# '0]'},\r\n# (51, 52, 53, 54, 55, 56): {'children': [],\r\n# 'end': 56,\r\n# 'parents': [(30,\r\n# 31,\r\n# 32,\r\n# 33,\r\n# 34,\r\n# 35,\r\n# 36,\r\n# 37,\r\n# 38,\r\n# 39,\r\n# 40,\r\n# 41,\r\n# 42,\r\n# 43,\r\n# 44,\r\n# 45,\r\n# 46,\r\n# 47,\r\n# 48,\r\n# 49,\r\n# 50,\r\n# 51,\r\n# 52,\r\n# 53,\r\n# 54,\r\n# 55,\r\n# 56,\r\n# 57)],\r\n# 'size': 5,\r\n# 'start': 51,\r\n# 'text': '[6, '\r\n# '6]'},\r\n# (63, 64, 65, 66, 67, 68, 69, 
70): {'children': [(64,\r\n# 65,\r\n# 66,\r\n# 67,\r\n# 68,\r\n# 69)],\r\n# 'end': 70,\r\n# 'parents': [],\r\n# 'size': 7,\r\n# 'start': 63,\r\n# 'text': '[[1, '\r\n# '2]]'},\r\n# (64, 65, 66, 67, 68, 69): {'children': [],\r\n# 'end': 69,\r\n# 'parents': [(63,\r\n# 64,\r\n# 65,\r\n# 66,\r\n# 67,\r\n# 68,\r\n# 69,\r\n# 70)],\r\n# 'size': 5,\r\n# 'start': 64,\r\n# 'text': '[1, '\r\n# '2]'},\r\n# (71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94): {'children': [(72,\r\n# 73,\r\n# 74,\r\n# 75,\r\n# 76,\r\n# 77,\r\n# 78,\r\n# 79,\r\n# 80),\r\n# (88,\r\n# 89,\r\n# 90,\r\n# 91,\r\n# 92,\r\n# 93),\r\n# (83,\r\n# 84,\r\n# 85)],\r\n# 'end': 94,\r\n# 'parents': [],\r\n# 'size': 23,\r\n# 'start': 71,\r\n# 'text': '[[1, '\r\n# '2, '\r\n# '2], '\r\n# '[5], '\r\n# '[2, '\r\n# '3]]'},\r\n# (72, 73, 74, 75, 76, 77, 78, 79, 80): {'children': [],\r\n# 'end': 80,\r\n# 'parents': [(71,\r\n# 72,\r\n# 73,\r\n# 74,\r\n# 75,\r\n# 76,\r\n# 77,\r\n# 78,\r\n# 79,\r\n# 80,\r\n# 81,\r\n# 82,\r\n# 83,\r\n# 84,\r\n# 85,\r\n# 86,\r\n# 87,\r\n# 88,\r\n# 89,\r\n# 90,\r\n# 91,\r\n# 92,\r\n# 93,\r\n# 94)],\r\n# 'size': 8,\r\n# 'start': 72,\r\n# 'text': '[1, '\r\n# '2, '\r\n# '2]'},\r\n# (83, 84, 85): {'children': [],\r\n# 'end': 85,\r\n# 'parents': [(71,\r\n# 72,\r\n# 73,\r\n# 74,\r\n# 75,\r\n# 76,\r\n# 77,\r\n# 78,\r\n# 79,\r\n# 80,\r\n# 81,\r\n# 82,\r\n# 83,\r\n# 84,\r\n# 85,\r\n# 86,\r\n# 87,\r\n# 88,\r\n# 89,\r\n# 90,\r\n# 91,\r\n# 92,\r\n# 93,\r\n# 94)],\r\n# 'size': 2,\r\n# 'start': 83,\r\n# 'text': '[5]'},\r\n# (88, 89, 90, 91, 92, 93): {'children': [],\r\n# 'end': 93,\r\n# 'parents': [(71,\r\n# 72,\r\n# 73,\r\n# 74,\r\n# 75,\r\n# 76,\r\n# 77,\r\n# 78,\r\n# 79,\r\n# 80,\r\n# 81,\r\n# 82,\r\n# 83,\r\n# 84,\r\n# 85,\r\n# 86,\r\n# 87,\r\n# 88,\r\n# 89,\r\n# 90,\r\n# 91,\r\n# 92,\r\n# 93,\r\n# 94)],\r\n# 'size': 5,\r\n# 'start': 88,\r\n# 'text': '[2, '\r\n# '3]'},\r\n# (101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 
118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128): {'children': [(102,\r\n# 103,\r\n# 104,\r\n# 105,\r\n# 106,\r\n# 107,\r\n# 108,\r\n# 109,\r\n# 110),\r\n# (113,\r\n# 114,\r\n# 115,\r\n# 116,\r\n# 117,\r\n# 118,\r\n# 119),\r\n# (122,\r\n# 123,\r\n# 124,\r\n# 125,\r\n# 126,\r\n# 127)],\r\n# 'end': 128,\r\n# 'parents': [],\r\n# 'size': 27,\r\n# 'start': 101,\r\n# 'text': '[[4, '\r\n# '4, '\r\n# '4], '\r\n# '[12, '\r\n# '0], '\r\n# '[6, '\r\n# '6]]'},\r\n# (102, 103, 104, 105, 106, 107, 108, 109, 110): {'children': [],\r\n# 'end': 110,\r\n# 'parents': [(101,\r\n# 102,\r\n# 103,\r\n# 104,\r\n# 105,\r\n# 106,\r\n# 107,\r\n# 108,\r\n# 109,\r\n# 110,\r\n# 111,\r\n# 112,\r\n# 113,\r\n# 114,\r\n# 115,\r\n# 116,\r\n# 117,\r\n# 118,\r\n# 119,\r\n# 120,\r\n# 121,\r\n# 122,\r\n# 123,\r\n# 124,\r\n# 125,\r\n# 126,\r\n# 127,\r\n# 128)],\r\n# 'size': 8,\r\n# 'start': 102,\r\n# 'text': '[4, '\r\n# '4, '\r\n# '4]'},\r\n# (113, 114, 115, 116, 117, 118, 119): {'children': [],\r\n# 'end': 119,\r\n# 'parents': [(101,\r\n# 102,\r\n# 103,\r\n# 104,\r\n# 105,\r\n# 106,\r\n# 107,\r\n# 108,\r\n# 109,\r\n# 110,\r\n# 111,\r\n# 112,\r\n# 113,\r\n# 114,\r\n# 115,\r\n# 116,\r\n# 117,\r\n# 118,\r\n# 119,\r\n# 120,\r\n# 121,\r\n# 122,\r\n# 123,\r\n# 124,\r\n# 125,\r\n# 126,\r\n# 127,\r\n# 128)],\r\n# 'size': 6,\r\n# 'start': 113,\r\n# 'text': '[12, '\r\n# '0]'},\r\n# (122, 123, 124, 125, 126, 127): {'children': [],\r\n# 'end': 127,\r\n# 'parents': [(101,\r\n# 102,\r\n# 103,\r\n# 104,\r\n# 105,\r\n# 106,\r\n# 107,\r\n# 108,\r\n# 109,\r\n# 110,\r\n# 111,\r\n# 112,\r\n# 113,\r\n# 114,\r\n# 115,\r\n# 116,\r\n# 117,\r\n# 118,\r\n# 119,\r\n# 120,\r\n# 121,\r\n# 122,\r\n# 123,\r\n# 124,\r\n# 125,\r\n# 126,\r\n# 127,\r\n# 128)],\r\n# 'size': 5,\r\n# 'start': 122,\r\n# 'text': '[6, '\r\n# '6]'},\r\n# (134, 135, 136, 137, 138, 139, 140, 141): {'children': [(135,\r\n# 136,\r\n# 137,\r\n# 138,\r\n# 139,\r\n# 140)],\r\n# 'end': 141,\r\n# 'parents': [],\r\n# 'size': 7,\r\n# 'start': 134,\r\n# 'text': '[[1, 
'\r\n# '2]]'},\r\n# (135, 136, 137, 138, 139, 140): {'children': [],\r\n# 'end': 140,\r\n# 'parents': [(134,\r\n# 135,\r\n# 136,\r\n# 137,\r\n# 138,\r\n# 139,\r\n# 140,\r\n# 141)],\r\n# 'size': 5,\r\n# 'start': 135,\r\n# 'text': '[1, '\r\n# '2]'}}\r\n# r1-----------------------------------------------------------------\r\n# {(6, 7, 8, 9, 10, 11, 12, 13, 14): {'children': [],\r\n# 'end': 14,\r\n# 'parents': [],\r\n# 'size': 9,\r\n# 'start': 6,\r\n# 'text': '<p>a</p>'},\r\n# (14, 15, 16, 17, 18, 19, 20, 21, 22): {'children': [],\r\n# 'end': 22,\r\n# 'parents': [],\r\n# 'size': 9,\r\n# 'start': 14,\r\n# 'text': '<p>a</p>'},\r\n# (22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89): {'children': [],\r\n# 'end': 89,\r\n# 'parents': [],\r\n# 'size': 68,\r\n# 'start': 22,\r\n# 'text': '<p>The '\r\n# 'HTML '\r\n# '<code>button</code> '\r\n# 'tag '\r\n# 'defines '\r\n# 'a '\r\n# 'clickable '\r\n# 'button.</p>'},\r\n# (89, 90, 91, 92, 93, 94, 95, 96, 97): {'children': [],\r\n# 'end': 97,\r\n# 'parents': [],\r\n# 'size': 9,\r\n# 'start': 89,\r\n# 'text': '<p>x</p>'},\r\n# (97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194): {'children': [],\r\n# 'end': 194,\r\n# 'parents': [],\r\n# 'size': 98,\r\n# 'start': 97,\r\n# 'text': '<p>The '\r\n# 'CSS '\r\n# '<code>background-color</code> '\r\n# 'property '\r\n# 'defines '\r\n# 'the '\r\n# 
'background '\r\n# 'color '\r\n# 'of '\r\n# 'an '\r\n# 'element.</p>'}}\r\n# r2-----------------------------------------------------------------\r\n# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],\r\n# 'end': 16,\r\n# 'parents': [],\r\n# 'size': 17,\r\n# 'start': 0,\r\n# 'text': '[1bla[2bla/2]/1]'}},\r\n# ('[1', '/2]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],\r\n# 'end': 13,\r\n# 'parents': [],\r\n# 'size': 14,\r\n# 'start': 0,\r\n# 'text': '[1bla[2bla/2]'}},\r\n# ('[2', '/1]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],\r\n# 'end': 16,\r\n# 'parents': [],\r\n# 'size': 12,\r\n# 'start': 5,\r\n# 'text': '[2bla/2]/1]'}},\r\n# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],\r\n# 'end': 13,\r\n# 'parents': [],\r\n# 'size': 9,\r\n# 'start': 5,\r\n# 'text': '[2bla/2]'}}}\r\n# r3-----------------------------------------------------------------\r\n# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],\r\n# 'end': 16,\r\n# 'parents': [],\r\n# 'size': 17,\r\n# 'start': 0,\r\n# 'text': '[1bla[2bla/2]/1]'}},\r\n# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],\r\n# 'end': 13,\r\n# 'parents': [],\r\n# 'size': 9,\r\n# 'start': 5,\r\n# 'text': '[2bla/2]'}}}\r\n# r4-----------------------------------------------------------------\r\n# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],\r\n# 'end': 16,\r\n# 'parents': [],\r\n# 'size': 17,\r\n# 'start': 0,\r\n# 'text': '[1bla[2bla/2]/1]'}},\r\n# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],\r\n# 'end': 13,\r\n# 'parents': [],\r\n# 'size': 9,\r\n# 'start': 5,\r\n# 'text': '[2bla/2]'}}}\r\n```\r\n\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "extracts structured data from text using user-defined delimiters (strings or regex)",
"version": "0.10",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/parifinder"
},
"split_keywords": [
"webscraping",
"html",
"parsing"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "9137989178c4f74ba41d23a6d99af5eabbaf1ddab92b6e8f9c29ad2485f950a4",
"md5": "a70d380d498974b173caf0ff6506db38",
"sha256": "8303552a9b79fa03f37b765dd3ba1f75e94faf5c18d1509e48b25a53904bfc9f"
},
"downloads": -1,
"filename": "parifinder-0.10-py3-none-any.whl",
"has_sig": false,
"md5_digest": "a70d380d498974b173caf0ff6506db38",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 22270,
"upload_time": "2023-10-14T22:38:12",
"upload_time_iso_8601": "2023-10-14T22:38:12.549680Z",
"url": "https://files.pythonhosted.org/packages/91/37/989178c4f74ba41d23a6d99af5eabbaf1ddab92b6e8f9c29ad2485f950a4/parifinder-0.10-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "51b129731142c02192070af86a7c792d3eb7d2f5b90f6188a28df91d84b75a98",
"md5": "1e67f1daa3046ced3830d334fa44d3f0",
"sha256": "1443ce07cef731f57e74c55744614fa1a52b6e1f8873f24be2ee1c82543f3906"
},
"downloads": -1,
"filename": "parifinder-0.10.tar.gz",
"has_sig": false,
"md5_digest": "1e67f1daa3046ced3830d334fa44d3f0",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 29865,
"upload_time": "2023-10-14T22:38:14",
"upload_time_iso_8601": "2023-10-14T22:38:14.828496Z",
"url": "https://files.pythonhosted.org/packages/51/b1/29731142c02192070af86a7c792d3eb7d2f5b90f6188a28df91d84b75a98/parifinder-0.10.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-10-14 22:38:14",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "parifinder",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [],
"lcname": "parifinder"
}