#!/usr/bin/env python
# Provenance: recovered from git commit da8e7c830c24d37e1723ab50943dc509d118f96c
# (David Gay, Fri, 26 Jan 2024 23:40:56 -0500), subject "Original script by
# Anacreon", which added this file as subsync.py.
"""Retime Japanese subtitle files against sibling .mkv videos using alass.

On import the process changes directory to the parent of the folder holding
this script; every relative path below (videos, subtitles, and the bundled
tools under ``auto-sub-retimer/``) is resolved from there.

Run as a script, it pairs the ``.mkv`` files with the ``.srt``/``.ass`` files
in natural-sort order, then retimes each subtitle either against a subtitle
track extracted from the video or against the video's audio, and finally
renames the result to ``<video name>.ja.<ext>``.
"""

import json
import os
import os.path
import re
import subprocess
import sys
import time
from collections import Counter
from collections import defaultdict
from itertools import islice
from multiprocessing import Pool
from pprint import pprint

try:
    import chardet
except ImportError:
    # Encoding detection is optional; without it files are assumed UTF-8.
    chardet = None

os.chdir(os.path.dirname(os.path.realpath(__file__)))
os.chdir('..')

_FALLBACK_ENCODING = "utf-8"
_DIALOGUE_PREFIX = "Dialogue: "
_STYLE_PREFIX = "Style: "
_TEXT_SUB_CODECS = ("srt", "ssa", "ass", "idx")


def _detect_encoding(filename):
    """Return the lower-cased encoding guessed for *filename*.

    Falls back to UTF-8 when chardet is unavailable or cannot make a guess
    (chardet reports ``None`` for empty input, for example).
    """
    with open(filename, 'rb') as raw:
        data = raw.read()
    guess = chardet.detect(data)['encoding'] if chardet is not None else None
    return guess.lower() if guess else _FALLBACK_ENCODING


class mopen:
    """Open a text file using an automatically detected encoding.

    Usable as a context manager (yields the underlying file object) or as a
    thin proxy: unknown attributes are forwarded to the wrapped file.

    Extra keyword arguments (callers pass ``encoding=``) are accepted for
    call-site compatibility and ignored; the detected encoding always wins.
    """

    def __init__(self, filename, mode='r', **kwargs):
        self.real_encoding = _detect_encoding(filename)
        self.file_o = open(filename, mode, encoding=self.real_encoding)

    def __getattr__(self, name):
        # Only reached when normal lookup fails. Refuse to resolve `file_o`
        # through itself, otherwise a failed __init__ recurses forever.
        if name == 'file_o':
            raise AttributeError(name)
        return getattr(self.file_o, name)

    def __enter__(self):
        return self.file_o

    def __exit__(self, exc_type, exc_value, traceback):
        self.file_o.close()


def get_lib_folder():
    """Return the folder (relative to the working directory) holding the tools."""
    return "auto-sub-retimer"


def _with_suffix(mkv, suffix):
    """Return *mkv* with its file extension replaced by *suffix*."""
    return os.path.splitext(mkv)[0] + suffix


def _dialogue_style(line):
    """Return the 4th comma-separated field of a ``Dialogue: `` line.

    Returns ``None`` for lines that are not dialogue events or that have
    fewer than four fields.
    """
    if not line.startswith(_DIALOGUE_PREFIX):
        return None
    fields = line.split(',', 4)
    return fields[3] if len(fields) > 3 else None


def remove_styles(lines, styles):
    """Return *lines* without the dialogue events whose style is not in *styles*.

    Non-dialogue lines, and dialogue lines too short to carry a style field,
    are kept unchanged.
    """
    keep = frozenset(styles)
    res = []
    for line in lines:
        style = _dialogue_style(line)
        if style is not None and style not in keep:
            continue
        res.append(line)
    return res


def count_style_occurences(lines, style):
    """Return how many dialogue events in *lines* use *style*."""
    res = 0
    for line in lines:
        line_style = _dialogue_style(line)
        if line_style is not None and line_style == style:
            res += 1
    return res


def get_example_lines(lines, style, sample=5):
    """Return up to *sample* dialogue lines that use *style*, in file order.

    A non-positive *sample* yields an empty list.
    """
    if sample <= 0:
        return []
    matching = (line for line in lines if _dialogue_style(line) == style
                and line.startswith(_DIALOGUE_PREFIX))
    return list(islice(matching, sample))


def get_styles(lines):
    """Return the names declared by ``Style: `` lines, in file order."""
    return [
        line[len(_STYLE_PREFIX):].split(',', 1)[0]
        for line in lines
        if line.startswith(_STYLE_PREFIX)
    ]


def select_keep_styles(all_styles, lines):
    """Interactively ask which styles to keep and return their names.

    Styles are listed most-used first, with example lines for the top three.
    A blank answer returns *all_styles* unchanged. An answer that matches no
    listed number is rejected and asked again, because returning an empty
    selection would make the caller delete every dialogue line.
    """
    usage = Counter(
        style for style in map(_dialogue_style, lines) if style is not None
    )
    styles_occ = sorted(
        ((s, usage[s]) for s in all_styles), key=lambda x: x[1], reverse=True
    )
    if not styles_occ:
        return all_styles
    for i, s in enumerate(styles_occ):
        print("[{}] {} ({} lines use this style)".format(i, s[0], s[1]))
        if i < 3:
            print(''.join(get_example_lines(lines, s[0])))
    while True:
        keep_indices = set(input(
            "Which sub styles to keep? Enter numbers separated by spaces "
            "or leave blank for all: "
        ).split())
        if not keep_indices:
            return all_styles
        chosen = [s[0] for i, s in enumerate(styles_occ) if str(i) in keep_indices]
        if chosen:
            return chosen
        print("None of those numbers match a listed style, please try again.")


def _probe_subtitle_streams(mkv):
    """Return ffprobe's list of subtitle stream descriptions for *mkv*."""
    mkv_json = json.loads(subprocess.check_output([
        os.path.join(get_lib_folder(), 'ffmpeg', 'bin', 'ffprobe'),
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_streams",
        "-select_streams",
        "s",
        mkv
    ]))
    return mkv_json.get("streams") or []


def _ask_stream_index(all_streams):
    """List the streams with their tags and prompt until a listed id is given."""
    print("[id]: Tag Information")
    for s in all_streams.values():
        tags = str(s.get('tags', 'Unknown'))
        print(f"[{s['index']}]:\n{tags}\n")
    while True:
        answer = input("Pick the stream to retime against: ")
        try:
            chosen = int(answer)
        except ValueError:
            chosen = None
        if chosen in all_streams:
            return chosen
        print("Please enter one of the stream ids listed above.")


def extract_eng_subs(mkv, index=None):
    """Extract one subtitle track of *mkv* to ``<name>.EXTRACTED.<codec>``.

    *index* is the stream index to use; when it is ``None`` or not present in
    this file, the only stream is used, or the user is asked to pick one.

    Returns the stream index that was extracted, so the caller can reuse it
    for the next video. Raises ``Exception`` when the video has no subtitle
    streams, and exits the process with status 1 when the chosen stream's
    codec is not one of srt/ssa/ass/idx.
    """
    print("\nExtracting english subtitles, this may take a while...\n")
    streams = _probe_subtitle_streams(mkv)

    # Use the subtitle track as provided. If there is not one,
    # ask instead.
    all_streams = {s["index"]: s for s in streams}
    if index not in all_streams:
        index = None
    if index is None:
        if not streams:
            raise Exception(
                "No subtitle streams to extract? Can't do any syncing. {}".format(mkv)
            )
        if len(streams) == 1:
            index = streams[0]["index"]
        else:
            index = _ask_stream_index(all_streams)

    # Extract
    stream = all_streams[index]
    codec_name = stream.get('codec_name', 'unknown')
    if codec_name == "subrip":
        codec_name = "srt"
    if codec_name not in _TEXT_SUB_CODECS:
        print("\n\n")
        print("#"*20)
        print("Embedded subtitle stream is an image based subtitle file.")
        print("These are not supported by Alass and cannot be used.")
        print("#"*20)
        sys.exit(1)
    extracted = _with_suffix(mkv, f".EXTRACTED.{codec_name}")
    subprocess.run([
        os.path.join(get_lib_folder(), "mkvtoolnix", "mkvextract.exe"),
        "tracks",
        mkv,
        f"{index}:{extracted}",
    ])
    return index


def fix_styling():
    """Let the user drop unwanted styles from the extracted .ass tracks.

    Looks at every file in the working directory whose name contains
    ``.EXTRACTED.``. Does nothing (beyond printing a note) when there are no
    such files, when the first one is not ``.ass``, or when no styles are
    declared. Files are rewritten in place as UTF-8.
    """
    extracted_subs = [f for f in os.listdir() if '.EXTRACTED.' in f]
    if not extracted_subs:
        print("No extracted subs found. Skipping style removal.")
        return
    if extracted_subs[0].split('.')[-1] != 'ass':
        print("Extracted subs are not .ASS format. Skipping style removal.")
        return

    lines_by_file = {}
    for sub in extracted_subs:
        with mopen(sub, encoding="utf-8") as ass:
            lines_by_file[sub] = ass.readlines()
    all_lines = [line for lines in lines_by_file.values() for line in lines]

    all_styles = sorted(set(get_styles(all_lines)))
    if not all_styles:
        # An empty keep-list would remove every dialogue line.
        print("No styles declared in extracted subs. Skipping style removal.")
        return
    keep_styles = select_keep_styles(all_styles, all_lines)

    for sub, lines in lines_by_file.items():
        with open(sub, 'w', encoding="utf-8") as ass:
            # Lines from readlines() already end in a newline; joining them
            # with another separator would double every line ending.
            ass.writelines(remove_styles(lines, keep_styles))


def _run_alass(reference, mkv, srt, conf, echo=False):
    """Retime *srt* against *reference* and install it as ``<mkv name>.ja.<ext>``.

    The original *srt* is removed only after alass has produced its output
    file. Returns ``True`` on success, ``False`` when no output was produced
    (in which case *srt* is left untouched).
    """
    retimed = _with_suffix(mkv, ".ja.RETIMED.{}".format(conf['ext']))
    command = [
        os.path.join(get_lib_folder(), "alass", "alass.bat"),
        "--split-penalty",
        str(conf['split_pen']),
        reference,
        srt,
        retimed,
    ]
    if echo:
        print(command)
    subprocess.run(command)
    if not os.path.exists(retimed):
        print(f"ERROR: alass produced no output for {srt}; leaving it unchanged.")
        return False
    os.remove(srt)
    # os.replace overwrites an existing target on every platform.
    os.replace(retimed, _with_suffix(mkv, ".ja.{}".format(conf['ext'])))
    return True


def retime_based_on_audio(mkv, srt, conf):
    """Retime *srt* using the video file *mkv* itself as the reference.

    *conf* supplies ``'ext'`` (subtitle extension) and ``'split_pen'``.
    Returns ``True`` on success, ``False`` otherwise.
    """
    return _run_alass(mkv, mkv, srt, conf)


def retime(mkv, srt, conf):
    """Retime *srt* using the subtitle track previously extracted from *mkv*.

    *conf* supplies ``'ext'`` (subtitle extension) and ``'split_pen'``.
    The extracted track is deleted once retiming succeeded. Returns ``True``
    on success, ``False`` otherwise.
    """
    marker = _with_suffix(mkv, ".EXTRACTED.")
    candidates = [f for f in os.listdir() if marker in f]
    if not candidates:
        print(f"ERROR: no extracted subtitle found for {mkv}; leaving {srt} unchanged.")
        return False
    extracted = candidates[0]
    succeeded = _run_alass(extracted, mkv, srt, conf, echo=True)
    if succeeded:
        os.remove(extracted)
    return succeeded


def _drop_stray_blank_lines(lines):
    """Return *lines* without blank lines that are not followed by a number line.

    A blank line is kept when it is the last line or when the next line
    consists only of digits.
    """
    total = len(lines)
    return [
        line for i, line in enumerate(lines)
        if not (
            line.strip() == ""
            and i + 1 < total
            and not lines[i + 1].strip().isdigit()
        )
    ]


def fix_common_errors(srt):
    """Rewrite *srt* in place (as UTF-8) without stray blank lines."""
    with mopen(srt, encoding='utf-8') as subfile:
        lines = subfile.readlines()
    with open(srt, 'w', encoding="utf-8") as subfile:
        subfile.writelines(_drop_stray_blank_lines(lines))


def tryint(s):
    """Return ``int(s)`` when *s* is a valid integer literal, else *s* itself."""
    try:
        return int(s)
    except (TypeError, ValueError):
        return s


def alphanum_key(s):
    """Return a sort key that orders digit runs in *s* numerically."""
    return [tryint(c) for c in re.split(r'([0-9]+)', s)]


def _ask_subtitle_extension():
    """Prompt for the subtitle file type and return ``'srt'`` or ``'ass'``."""
    print('[1] .srt files (Default)')
    print('[2] .ass files')
    extensions = {1: "srt", 2: "ass"}
    while True:
        choice = input("What file type are your Japanese subs? ").strip()
        number = tryint(choice) if choice else 1
        if number in extensions:
            return extensions[number]
        print("Please enter 1 or 2.")


def main():
    """Pair videos with subtitles, ask for options and run the retiming."""
    if chardet is None:
        print("WARNING: chardet is not installed; subtitle files are assumed to be UTF-8.")

    # Guess the local mkv/srt pairs that need syncing
    files = os.listdir()
    conf = {'ext': _ask_subtitle_extension()}
    ext = conf['ext']
    srts = sorted((f for f in files if f.endswith(f".{ext}")), key=alphanum_key)
    mkvs = [f for f in files if f.endswith(".mkv")]
    while len(mkvs) > len(srts):
        print(f"Found more .mkv files than .{ext} files. This can occur when OP/ED/extras are in the same folder as episodes.")
        print("Please enter a pattern to filter out of mkv results. (e.g. OP)")
        pattern = input(">>> Pattern: ")
        if not pattern:
            # The empty string is contained in every name and would discard
            # all videos.
            print("Empty pattern ignored.")
            continue
        removed = [m for m in mkvs if pattern in m]
        mkvs = [m for m in mkvs if pattern not in m]
        print("Ignoring filtered mkv files:\n{}".format("\n".join(removed)))
    mkvs = sorted(mkvs, key=alphanum_key)
    if len(mkvs) == 0 or len(mkvs) != len(srts):
        print(f"ERROR: Some .{ext} files don't have a matching .mkv!")
        print(f"{len(srts)} {ext} files found. {len(mkvs)} mkv files found")
        print(f"Ensure auto-sub-retimer folder is in same folder as your .mkv and .{ext} files.")
        print(f"Ensure you have an equal number of .mkv and .{ext} files!")
        input("press enter to exit...")
        sys.exit(1)
    if len(sys.argv) > 1 and sys.argv[1] == "rename_only":
        for srt, mkv in zip(srts, mkvs):
            os.rename(srt, _with_suffix(mkv, ".ja.{}".format(ext)))
        input("Rename finished!")
        sys.exit(0)

    print("[1] Retime using embedded subs. (Default)")
    print("[2] Retime using audio.")
    retime_choice = input("Enter desired option number: ")
    split_pen = input("Split Penalty? Only change this if default resulted in mis-timed subs. (Default 7): ").strip()
    conf['split_pen'] = split_pen or "7"

    for srt in srts:
        fix_common_errors(srt)

    jobs = list(zip(mkvs, srts, [conf] * len(mkvs)))
    if "2" in retime_choice:
        with Pool() as pool:
            results = pool.starmap(retime_based_on_audio, jobs)
    else:
        # Extraction stays serial: it may prompt the user, and the stream
        # picked for one video is offered to the next.
        index = None
        for mkv in mkvs:
            index = extract_eng_subs(mkv, index)
        print("English Sub Extraction Complete.")
        fix_styling()
        with Pool() as pool:
            results = pool.starmap(retime, jobs)

    failed = [srt for srt, ok in zip(srts, results) if not ok]
    if failed:
        print("\nERROR: retiming failed for:\n{}\n".format("\n".join(failed)))
    else:
        print("\nSuccess!\n")
    time.sleep(3)


if __name__ == '__main__':
    main()