From 06a1f8c3a44f97d7196faea7511ba0693f987476 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Tue, 17 May 2022 04:03:31 +0000
Subject: [PATCH] Initial commit

---
 Dockerfile     |  8 +++++++
 README.md      | 12 +++++++++++
 telegram-dl.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 telegram-dl.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..32789e5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.7-alpine3.14
+RUN \
+    apk add --no-cache gcc libc-dev libffi-dev \
+    && pip install --no-cache-dir telethon 'cryptg<0.3'
+COPY telegram-dl.py /
+VOLUME ["/data/"]
+WORKDIR /data
+ENTRYPOINT ["python3", "/telegram-dl.py"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1002687
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+telegram-dl is a little script for retrieving content on Telegram.
+
+Usage: `telegram-dl.py URL [URL...]`
+
+All URLs must be messages in the same channel. The output consists of writing each message to a file `{channelName}_{messageId}.json`. If the message has any attachment (photo, video, file, etc.), it gets downloaded as well. The filename for that is chosen by Telethon; if it's a file attachment, the literal path as presented by Telegram is used, and if it's a photo or video, a name is made up.
+
+The only dependency of the script is Telethon (though you probably want cryptg for performance reasons). For authentication (always required), specify the `TELEGRAM_API_ID`, `TELEGRAM_API_HASH`, and `TELEGRAM_BOT_TOKEN` environment variables. The session is stored in a file `telegram-dl.session`; keeping this between executions is not critical but recommended.
+
+A `Dockerfile` is provided for running with Docker. Simply provide the URL(s) as arguments on running the image. Data is written to the `/data` volume, which you may want to mount from the host machine instead.
+
+    docker build -t telegram-dl:latest .
+    docker run --rm -v "$(pwd)"/data:/data -e TELEGRAM_API_ID=... -e TELEGRAM_API_HASH=... -e TELEGRAM_BOT_TOKEN=... telegram-dl https://t.me/telegram/178
diff --git a/telegram-dl.py b/telegram-dl.py
new file mode 100644
index 0000000..30bfe52
--- /dev/null
+++ b/telegram-dl.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""Save Telegram messages, and any media attached to them, given t.me message URLs."""
+import asyncio
+import base64
+import datetime
+import json
+import os
+import re
+import sys
+import telethon
+
+
+# Credentials are mandatory: a missing variable raises KeyError before anything else runs.
+API_ID = os.environ['TELEGRAM_API_ID']
+API_HASH = os.environ['TELEGRAM_API_HASH']
+BOT_TOKEN = os.environ['TELEGRAM_BOT_TOKEN']
+# Matches http(s)://t.me/CHANNEL/ID with an optional 's/' path prefix before the channel.
+URL_PATTERN = re.compile(r'^https?://t\.me/(?:s/)?(?P<channel>[^/]+)/(?P<message>\d+)$')
+
+
+def stuff_to_json(o):
+    """Fallback serialiser passed to json.dump as its ``default`` hook.
+
+    Returns the ISO 8601 string for a datetime and a
+    'binary data: ' string followed by the base64 encoding for bytes.
+
+    Raises:
+        TypeError: if o is of any other type.
+    """
+    if isinstance(o, datetime.datetime):
+        return o.isoformat()
+    if isinstance(o, bytes):
+        return f'binary data: {base64.b64encode(o).decode("ascii")}'
+    raise TypeError(f'Object of type {type(o)} is not JSON serializable')
+
+
+async def main():
+    """Save each message given on the command line, plus its media, to disk.
+
+    Every message is written to ``{channelName}_{id}.json`` in the current
+    directory; attached media is fetched with the client's download_media.
+    Exits with status 1 if a URL is malformed, no URL is given, or the URLs
+    refer to more than one channel.
+    """
+    # Parse URLs
+    targets = []
+    for url in sys.argv[1:]:
+        m = URL_PATTERN.match(url)
+        if not m:
+            print(f'Error: {url} is not a recognised Telegram URL', file = sys.stderr)
+            sys.exit(1)
+        targets.append((m['channel'], int(m['message'])))
+    if not targets:
+        print('Usage: telegram-dl.py URL [URL...]', file = sys.stderr)
+        sys.exit(1)
+    channelName = targets[0][0]
+    if not all(x[0] == channelName for x in targets[1:]):
+        print('Error: all URLs must be of the same channel', file = sys.stderr)
+        sys.exit(1)
+    ids = [x[1] for x in targets]
+
+    # Let's go...
+    client = telethon.TelegramClient('telegram-dl', API_ID, API_HASH)
+    try:
+        await client.start(bot_token = BOT_TOKEN)
+        messages = await client.get_messages(channelName, ids = ids)
+        for message in messages:
+            # Skip falsy entries (presumably IDs that could not be resolved; confirm against Telethon docs).
+            if not message:
+                continue
+            # Mode 'x' fails instead of overwriting a JSON file that already exists.
+            with open(f'{channelName}_{message.id}.json', 'x') as fp:
+                json.dump(message.to_dict(), fp, default = stuff_to_json)
+
+            await client.download_media(message)
+    finally:
+        # Always close the connection, even when a write or download above fails.
+        await client.disconnect()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())