# youtool - Easily access YouTube Data API v3 in batches
Python library (and future command-line interface) to crawl the YouTube Data API v3 in batch operations and perform
other related tasks. It's easier to use than the alternatives: you don't need to spend time learning the YouTube API
and its caveats. With this library you can get:
- Channel ID from channel URL (scraping) or username (API)
- Channel information (title, subscribers etc.)
- List of playlists for a channel
- List of videos for a playlist
- Video search (many parameters)
- Video information (title, description, likes, comments etc.)
- Comments
- Livechat, including superchat (scraping using
[chat-downloader](https://chat-downloader.readthedocs.io/en/latest/))
- Automatic transcription (scraping using [yt-dlp](https://github.com/yt-dlp/yt-dlp))
The library will automatically:
- Try as many keys as you provide
- Use batches of 50 items on supported API endpoints (see the sketch after this list)
- Paginate when needed
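For example, batching means you can hand a long list of video IDs to a single call and let the library split it into
API requests of up to 50 items each. A minimal sketch (the API keys and video IDs below are placeholders):

```python
from youtool import YouTube

yt = YouTube(["key1", "key2"])  # Placeholder keys; the library will try all of them

# Any number of IDs works: youtool splits them into requests of up to 50 items,
# so 120 IDs become 3 API requests instead of 120.
video_ids = ["dQw4w9WgXcQ", "b1FjmUzgFB0"]  # placeholder IDs; could be hundreds
for video in yt.videos_infos(video_ids):
    print(video)  # each `video` is a `dict`
```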
## Installing
```shell
pip install youtool
```
You may also want some extras:
```shell
pip install youtool[livechat]
pip install youtool[transcription]
```
## Using as a library
Just follow the tutorial/examples below and check `help()` on the `YouTube` methods.
> Note: the examples below will use 135 units of your API key quota.
```python
from pprint import pprint
from pathlib import Path
from youtool import YouTube
api_keys = ["key1", "key2", ...] # Create one in Google Cloud Console
yt = YouTube(api_keys, disable_ipv6=True) # Will try all keys
channel_id_1 = yt.channel_id_from_url("https://youtube.com/c/PythonicCafe/")
print(f"Pythonic Café's channel ID (got from URL): {channel_id_1}")
channel_id_2 = yt.channel_id_from_username("turicas")
print(f"Turicas' channel ID (got from username): {channel_id_2}")
print("Playlists found on Turicas' channel (the \"uploads\" playlist is not here):")
# WARNING: this method won't return the main channel playlist ("uploads").
# If you need it, get channel info using `channels_infos` and the `playlist_id` key (or use the hack shown a few
# lines below), so you can pass it to `playlist_videos`.
for playlist in yt.channel_playlists(channel_id_2):
    # `playlist` is a `dict`
    print(f"Playlist: {playlist}")
    for video in yt.playlist_videos(playlist["id"]):
        # `video` is a `dict`, but this endpoint doesn't provide full video information (use `videos_infos` to get them)
        print(f" Video: {video}")
    print("-" * 80)
# Hack: replace `UC` with `UU` on channel ID to get main playlist ID ("uploads"):
assert channel_id_1[:2] == "UC"
print("Last 3 uploads for Pythonic Café:")
for index, video in enumerate(yt.playlist_videos("UU" + channel_id_1[2:])):
    # `video` is a `dict`, but this endpoint doesn't provide full video information (use `videos_infos` to get them)
    print(f" Video: {video}")
    if index == 2:  # First 3 results only
        break
print("-" * 80)
print("5 videos found on search:")
# `video_search` has many other parameters also!
# WARNING: each request made by this method will consume 100 units of your quota (out of 10k daily!)
for index, video in enumerate(yt.video_search(term="Álvaro Justen")): # Will paginate automatically
    # `video` is a `dict`, but this endpoint doesn't provide full video information (use `videos_infos` to get them)
    print(f" Video: {video}")
    if index == 4:  # First 5 results only
        break
print("-" * 80)
# The method below can be used to get information in batches (50 videos per request) - you can pass a list of video IDs
# (more than 50) and it'll get data in batches from the API.
last_video = list(yt.videos_infos([video["id"]]))[0]
print("Complete information for last video:")
pprint(last_video)
print("-" * 80)
print("Channel information (2 channels in one request):")
for channel in yt.channels_infos([channel_id_1, channel_id_2]):
    # `channel` is a `dict`
    print(channel)
print("-" * 80)
video_id = "b1FjmUzgFB0"
print(f"Comments for video {video_id}:")
for comment in yt.video_comments(video_id):
    # `comment` is a `dict`
    print(comment)
print("-" * 80)
live_video_id = "yyzIPQsa98A"
print(f"Live chat for video {live_video_id}:")
for chat_message in yt.video_livechat(live_video_id):
    # `chat_message` is a `dict`
    print(chat_message)  # It has the superchat information (`money_currency` and `money_amount` keys)
print("-" * 80)
download_path = Path("transcriptions")
if not download_path.exists():
    download_path.mkdir(parents=True)
print(f"Downloading Portuguese (pt) transcriptions for videos {video_id} and {live_video_id} - saving at {download_path.absolute()}")
for downloaded in yt.download_transcriptions([video_id, live_video_id], language_code="pt", path=download_path):
    vid, status, filename = downloaded["video_id"], downloaded["status"], downloaded["filename"]
    if status == "error":
        print(f" {vid}: error downloading!")
    elif status == "skipped":
        print(f" {vid}: skipped, file already exists ({filename}: {filename.stat().st_size / 1024:.1f} KiB)")
    elif status == "done":
        print(f" {vid}: done ({filename}: {filename.stat().st_size / 1024:.1f} KiB)")
print("-" * 80)
# You can also download audio and video, just replace `download_transcriptions` with `download_audios` or
# `download_videos`. As simple as it is. :)
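# A hedged sketch of the audio variant (parameters assumed to mirror `download_transcriptions`;
# check `help(yt.download_audios)` for the actual signature before relying on this):
# for downloaded in yt.download_audios([video_id], path=download_path):
#     print(downloaded)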
print("Categories in Brazilian YouTube:")
for category in yt.categories(region_code="BR"):
    # `category` is a `dict`
    print(category)
print("-" * 80)
print("Current most popular videos in Brazil:")
for video in yt.most_popular(region_code="BR"): # Will paginate automatically
    # `video` is a `dict`, but this endpoint doesn't provide full video information (use `videos_infos` to get them)
    print(f"{video['id']} {video['title']}")
print("-" * 80)
print("Total quota used during this session:")
total_used = 0
for method, units_used in yt.used_quota.items():
print(f"{method:20}: {units_used:05d} unit{'' if units_used == 1 else 's'}")
total_used += units_used
print(f"TOTAL : {total_used:05d} unit{'' if total_used == 1 else 's'}")
```
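Every method yields plain `dict`s, so you can persist results with the standard library alone. A minimal sketch that
writes channel information to a CSV file (it reuses the `yt`, `channel_id_1` and `channel_id_2` objects from the
example above; the column names simply mirror whatever keys the returned `dict`s contain):

```python
import csv

channels = list(yt.channels_infos([channel_id_1, channel_id_2]))

# Use the union of all keys, since rows are not guaranteed to share the exact same fields
fieldnames = sorted({key for row in channels for key in row})
with open("channels.csv", "w", newline="", encoding="utf-8") as fobj:
    writer = csv.DictWriter(fobj, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(channels)
```

The same pattern works for the other generators (`playlist_videos`, `video_comments` etc.), which is essentially what
the planned CSV-producing CLI subcommands listed below will do.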
## Tests
To run all tests, execute:
```shell
make test
```
## Future improvements
Pull requests are welcome! :)
- Command-line interface with the following subcommands:
  - channel-id: get channel IDs from a list of URLs (or CSV filename with URLs inside), generate CSV output (just the
    IDs)
  - channel-info: get channel info from a list of IDs (or CSV filename with IDs inside), generate CSV output (same
    schema for `channel` dicts)
  - video-info: get video info from a list of IDs or URLs (or CSV filename with URLs/IDs inside), generate CSV output
    (same schema for `video` dicts)
  - video-search: search videos from a list of search terms (or CSV filename with terms inside), generate CSV output
    (simplified `video` dict schema or option to get full video info after)
  - video-comments: get comments from a video ID, generate CSV output (same schema for `comment` dicts)
  - video-livechat: get live chat messages from a video ID, generate CSV output (same schema for `chat_message` dicts)
  - video-transcriptions: download video transcriptions based on language code, path and list of video IDs or URLs (or
    CSV filename with URLs/IDs inside), download files to destination and report results
- Replace `dict`s with dataclasses
- Create a website with docs/reference
- Deal with quotas (wait some time before using a key, for example)
## License
GNU Lesser General Public License (LGPL) version 3.
This project was developed in a partnership between [Pythonic Café](https://pythonic.cafe/) and [Novelo
Data](https://novelo.io/).