# SuffixAutomaton 后缀自动机
Find LCS (longest common substrings) using a suffix automaton
## usage
> pip install SuffixAutomaton
```python
from SuffixAutomaton import SuffixAutomaton, lcs1, lcs2, logger
raw = """
ASE : International Conference on Automated Software Engineering
ESEC/FSE : ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering
ICSE : International Conference on Software Engineering
ISSTA : The International Symposium on Software Testing and Analysis
OOPSLA : Conference on Object-Oriented Programming Systems, Languages, and Applications
OSDI : Operating Systems Design and Implementation
PLDI : ACM SIGPLAN conference on Programming Language Design and Implementation
POPL : ACM SIGACT-SIGPLAN Symposium on Principles of Programming Languages
SOSP : ACM Symposium on Operating Systems Principles
"""
doc = raw.strip().splitlines()
doc = [x.split() for x in doc]
# for tokens
logger.info(lcs1(doc[1], doc[2], output_lcs=True)) # [(14, 2, 5, ['Software', 'Engineering'])]
logger.info(lcs2(doc[0], doc[1:4], output_lcs=True)) # [(1, 1, [':']), (4, 1, ['on']), (6, 1, ['Software'])]
logger.info(lcs1(doc[1], doc[2], 1, output_lcs=True)) # [(1, 1, 1, [':']), (7, 1, 3, ['Conference']), (10, 1, 4, ['on']), (14, 2, 5, ['Software', 'Engineering'])]
logger.info(lcs2(doc[0], doc[1:4], 1, output_lcs=True)) # [(1, 1, [':']), (4, 1, ['on']), (6, 1, ['Software'])]
logger.info(lcs2(doc[0], doc[1:4], 1, output_lcs=False)) # [(1, 1, None), (4, 1, None), (6, 1, None)]
# for chars
poet = "江天一色无纤尘皎皎空中孤月轮 江畔何人初见月江月何年初照人 人生代代无穷已江月年年望相似 不知江月待何人但见长江送流水"
doc = poet.split()
logger.info(lcs1(doc[1], doc[3], output_lcs=True)) # [(2, 2, 5, '何人'), (7, 2, 2, '江月')]
logger.info(lcs1(doc[1], doc[3], 1, output_lcs=True)) # [(0, 1, 10, '江'), (2, 2, 5, '何人'), (5, 1, 8, '见'), (7, 2, 2, '江月')]
# for lcs of doc
logger.info(lcs2(doc[2], doc[2:4], output_lcs=True)) # [(7, 2, '江月')]
logger.info(lcs2(doc[2], doc[2:4], 1, output_lcs=True)) # [(0, 1, '人'), (7, 2, '江月')]
# faster when iterating repeatedly
sam = SuffixAutomaton(doc[0])
for x in doc[1:]:
print((x, sam.lcs1(x, output_lcs=True)))
"""
('江畔何人初见月江月何年初照人', [(0, 1, 0, '江'), (12, 1, 6, '月')])
('人生代代无穷已江月年年望相似', [(0, 1, 7, '江'), (4, 1, 4, '无'), (12, 1, 8, '月')])
('不知江月待何人但见长江送流水', [(0, 1, 2, '江'), (12, 1, 3, '月')])
"""
# lcs() -> [(str, start, cand_start)], sorted by length descending. Results may overlap.
logger.info(lcs2("布架 拖把抹布悬挂沥水洁具架 ", ["抹布架"], 1, output_lcs=True)) # [(0, 2, '布架'), (5, 2, '抹布'), (13, 1, '架')]
```
## feature
* suffix automaton [in words] 可分词后缀自动机
* [Longest] Common Substring of two lines 两文[最长]共串
* [Longest] Common Substring of document 多文[最长]共串
## inspired by
参照:https://www.cnblogs.com/shld/p/10444808.html
讲解:https://www.cnblogs.com/zjp-shadow/p/9218214.html
详解:https://www.cnblogs.com/1625--H/p/12416198.html
证明:https://oi-wiki.org/string/sam/
题解:https://www.cnblogs.com/Lyush/archive/2013/08/25/3281546.html https://www.cnblogs.com/mollnn/p/13175736.html
Raw data
{
"_id": null,
"home_page": "https://github.com/laohur/SuffixAutomaton",
"name": "SuffixAutomaton",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.0",
"maintainer_email": null,
"keywords": "Suffix Automaton, sam",
"author": "laohur",
"author_email": "laohur@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/02/91/d2c1e28282cd598db5d40e6ce706e4c0737522e76e322cdb5e3780459c0e/SuffixAutomaton-0.1.6.tar.gz",
"platform": null,
"description": "\r\n# SuffixAutomaton \u540e\u7f00\u81ea\u52a8\u673a\r\nfind LCS (longest common substrings) by suffix automaton \r\n\r\n## usage\r\n> pip install SuffixAutomaton \r\n\r\n```python\r\nfrom SuffixAutomaton import SuffixAutomaton, lcs1, lcs2, logger\r\n\r\nraw = \"\"\"\r\nASE : International Conference on Automated Software Engineering\r\nESEC/FSE : ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering\r\nICSE : International Conference on Software Engineering\r\nISSTA : The International Symposium on Software Testing and Analysis\r\nOOPSLA : Conference on Object-Oriented Programming Systems, Languages, and Applications\r\nOSDI : Operating Systems Design and Implementation\r\nPLDI : ACM SIGPLAN conference on Programming Language Design and Implementation\r\nPOPL : ACM SIGACT-SIGPLAN Symposium on Principles of Programming Languages\r\nSOSP : ACM Symposium on Operating Systems Principles\r\n\"\"\"\r\ndoc = raw.strip().splitlines()\r\ndoc = [x.split() for x in doc]\r\n# for tokens\r\nlogger.info(lcs1(doc[1], doc[2], output_lcs=True)) # [(14, 2, 5, ['Software', 'Engineering'])]\r\nlogger.info(lcs2(doc[0], doc[1:4], output_lcs=True)) # [(1, 1, [':']), (4, 1, ['on']), (6, 1, ['Software'])]\r\nlogger.info(lcs1(doc[1], doc[2], 1, output_lcs=True)) # [(1, 1, 1, [':']), (7, 1, 3, ['Conference']), (10, 1, 4, ['on']), (14, 2, 5, ['Software', 'Engineering'])]\r\nlogger.info(lcs2(doc[0], doc[1:4], 1, output_lcs=True)) # [(1, 1, [':']), (4, 1, ['on']), (6, 1, ['Software'])]\r\nlogger.info(lcs2(doc[0], doc[1:4], 1, output_lcs=False)) # [(1, 1, None), (4, 1, None), (6, 1, None)]\r\n\r\n# for chars\r\npoet = \"\u6c5f\u5929\u4e00\u8272\u65e0\u7ea4\u5c18\u768e\u768e\u7a7a\u4e2d\u5b64\u6708\u8f6e \u6c5f\u7554\u4f55\u4eba\u521d\u89c1\u6708\u6c5f\u6708\u4f55\u5e74\u521d\u7167\u4eba \u4eba\u751f\u4ee3\u4ee3\u65e0\u7a77\u5df2\u6c5f\u6708\u5e74\u5e74\u671b\u76f8\u4f3c 
\u4e0d\u77e5\u6c5f\u6708\u5f85\u4f55\u4eba\u4f46\u89c1\u957f\u6c5f\u9001\u6d41\u6c34\"\r\ndoc = poet.split() \r\nlogger.info(lcs1(doc[1], doc[3], output_lcs=True)) # [(2, 2, 5, '\u4f55\u4eba'), (7, 2, 2, '\u6c5f\u6708')]\r\nlogger.info(lcs1(doc[1], doc[3], 1, output_lcs=True)) # [(0, 1, 10, '\u6c5f'), (2, 2, 5, '\u4f55\u4eba'), (5, 1, 8, '\u89c1'), (7, 2, 2, '\u6c5f\u6708')]\r\n# for lcs of doc\r\nlogger.info(lcs2(doc[2], doc[2:4], output_lcs=True)) # [(7, 2, '\u6c5f\u6708')]\r\nlogger.info(lcs2(doc[2], doc[2:4], 1 ,output_lcs=True)) # [(0, 1, '\u4eba'), (7, 2, '\u6c5f\u6708')]\r\n# faster when iterally\r\nsam = SuffixAutomaton(doc[0])\r\nfor x in doc[1:]:\r\n print((x, sam.lcs1(x, output_lcs=True)))\r\n\"\"\"\r\n('\u6c5f\u7554\u4f55\u4eba\u521d\u89c1\u6708\u6c5f\u6708\u4f55\u5e74\u521d\u7167\u4eba', [(0, 1, 0, '\u6c5f'), (12, 1, 6, '\u6708')])\r\n('\u4eba\u751f\u4ee3\u4ee3\u65e0\u7a77\u5df2\u6c5f\u6708\u5e74\u5e74\u671b\u76f8\u4f3c', [(0, 1, 7, '\u6c5f'), (4, 1, 4, '\u65e0'), (12, 1, 8, '\u6708')])\r\n('\u4e0d\u77e5\u6c5f\u6708\u5f85\u4f55\u4eba\u4f46\u89c1\u957f\u6c5f\u9001\u6d41\u6c34', [(0, 1, 2, '\u6c5f'), (12, 1, 3, '\u6708')])\r\n\"\"\"\r\n\r\n# lcs() -> [(str, start, cand_start)], sort in length decending. may overlap. 
\r\nlogger.info(lcs2(\"\u5e03\u67b6 \u62d6\u628a\u62b9\u5e03\u60ac\u6302\u6ca5\u6c34\u6d01\u5177\u67b6 \", [\"\u62b9\u5e03\u67b6\"], 1, output_lcs=True)) # [(0, 2, '\u5e03\u67b6'), (5, 2, '\u62b9\u5e03'), (13, 1, '\u67b6')]\r\n\r\n\r\n```\r\n\r\n## feature\r\n* suffix automaton [in words] \u53ef\u5206\u8bcd\u540e\u7f00\u81ea\u52a8\u673a\r\n* [Longest] Common Substring of two lines \u4e24\u6587[\u6700\u957f]\u5171\u4e32\r\n* [Longest] Common Substring of document \u591a\u6587[\u6700\u957f]\u5171\u4e32\r\n\r\n\r\n## inspired by \r\n \u53c2\u7167\uff1ahttps://www.cnblogs.com/shld/p/10444808.html\r\n \u8bb2\u89e3\uff1ahttps://www.cnblogs.com/zjp-shadow/p/9218214.html\r\n \u8be6\u89e3\uff1ahttps://www.cnblogs.com/1625--H/p/12416198.html\r\n \u8bc1\u660e\uff1ahttps://oi-wiki.org/string/sam/\r\n \u9898\u89e3\uff1ahttps://www.cnblogs.com/Lyush/archive/2013/08/25/3281546.html https://www.cnblogs.com/mollnn/p/13175736.html\r\n",
"bugtrack_url": null,
"license": "[Anti-996 License](https: // github.com/996icu/996.ICU/blob/master/LICENSE)",
"summary": "suffix automaton by words, to get text common substrings",
"version": "0.1.6",
"project_urls": {
"Homepage": "https://github.com/laohur/SuffixAutomaton"
},
"split_keywords": [
"suffix automaton",
" sam"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "7404f869f36a19b9fe167d5464059fde2866bda85afaa720d42cf0f95681909f",
"md5": "f27a023c0a1484c38b7ef19831ceaa7d",
"sha256": "267e9dcec2bf61652143bed81bddf721422e7e703429fc02df2693fefff07df9"
},
"downloads": -1,
"filename": "SuffixAutomaton-0.1.6-py3-none-any.whl",
"has_sig": false,
"md5_digest": "f27a023c0a1484c38b7ef19831ceaa7d",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.0",
"size": 6790,
"upload_time": "2024-07-29T06:27:03",
"upload_time_iso_8601": "2024-07-29T06:27:03.076150Z",
"url": "https://files.pythonhosted.org/packages/74/04/f869f36a19b9fe167d5464059fde2866bda85afaa720d42cf0f95681909f/SuffixAutomaton-0.1.6-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "0291d2c1e28282cd598db5d40e6ce706e4c0737522e76e322cdb5e3780459c0e",
"md5": "0c177657827caabfd6687041f9128e73",
"sha256": "7a02e109880e30e0f4aea51b5454991ac22e737dd3a46201f29a2712e8494115"
},
"downloads": -1,
"filename": "SuffixAutomaton-0.1.6.tar.gz",
"has_sig": false,
"md5_digest": "0c177657827caabfd6687041f9128e73",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.0",
"size": 6108,
"upload_time": "2024-07-29T06:27:04",
"upload_time_iso_8601": "2024-07-29T06:27:04.606627Z",
"url": "https://files.pythonhosted.org/packages/02/91/d2c1e28282cd598db5d40e6ce706e4c0737522e76e322cdb5e3780459c0e/SuffixAutomaton-0.1.6.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-07-29 06:27:04",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "laohur",
"github_project": "SuffixAutomaton",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"lcname": "suffixautomaton"
}