subsync.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245

#!/usr/bin/env python

import json
import os
import os.path
import re
import subprocess
import sys
import time
from multiprocessing import Pool

import chardet

# Run from the directory that contains this script's own folder: first enter
# the script's (symlink-resolved) folder, then step up one level. Every
# relative path and os.listdir() call below is resolved against that parent.
_script_dir = os.path.dirname(os.path.realpath(__file__))
os.chdir(_script_dir)
os.chdir('..')


class mopen:
    """Open a text file with the encoding chardet detects for its content.

    The wrapper forwards unknown attribute lookups to the underlying file
    object, so it can be used like a regular file (``mopen(p).readlines()``)
    or as a context manager (``with mopen(p) as f``), in which case the
    underlying file object itself is yielded and closed on exit.

    Attributes:
        real_encoding: lower-cased name of the encoding used to open the file.
        file_o: the underlying file object opened in ``mode``.
    """

    def __init__(self, filename, mode='r'):
        # Probe the raw bytes inside a ``with`` so the binary handle is closed
        # before the file is reopened in text mode.
        with open(filename, 'rb') as raw:
            detected = chardet.detect(raw.read())['encoding']
        # chardet may report None when it cannot guess (e.g. an empty file);
        # fall back to UTF-8 instead of failing on ``None.lower()``.
        self.real_encoding = (detected or 'utf-8').lower()
        self.file_o = open(filename, mode, encoding=self.real_encoding)

    def __getattr__(self, name):
        # __getattr__ only runs for attributes that normal lookup did not
        # find. If ``file_o`` itself is missing (construction failed or was
        # bypassed), delegating to it would recurse forever.
        if name == 'file_o':
            raise AttributeError(name)
        return getattr(self.file_o, name)

    def __enter__(self):
        return self.file_o

    def __exit__(self, exc_type, exc_value, traceback):
        self.file_o.close()


def remove_styles(lines, styles):
    """Return ``lines`` without the Dialogue lines whose style is unwanted.

    A line starting with ``"Dialogue: "`` is kept only when its fourth
    comma-separated field (the style name) is contained in ``styles``.
    Every other line is passed through unchanged and in order.
    """
    def _is_kept(entry):
        if not entry.startswith("Dialogue: "):
            return True
        return entry.split(',')[3] in styles

    return [entry for entry in lines if _is_kept(entry)]


def count_style_occurences(lines, style):
    """Count the Dialogue lines in ``lines`` that use ``style``.

    Only lines starting with ``"Dialogue: "`` are inspected; the style name
    is taken from the fourth comma-separated field of such a line.
    """
    return sum(
        1
        for entry in lines
        if entry.startswith("Dialogue: ") and style == entry.split(',')[3]
    )


def get_example_lines(lines, style, sample=5):
    """Collect the first Dialogue lines in ``lines`` that use ``style``.

    Collection stops as soon as ``sample`` matching lines were gathered.
    Because the countdown is only compared against exactly zero after a
    decrement, a ``sample`` of zero or less never stops early and every
    matching line is returned.
    """
    matches = []
    remaining = sample
    for entry in lines:
        if not entry.startswith("Dialogue: "):
            continue
        if not (style == entry.split(',')[3]):
            continue
        matches.append(entry)
        remaining -= 1
        if remaining == 0:
            break
    return matches


def get_styles(lines):
    """Return the style names declared by the ``Style: `` lines in ``lines``.

    The name is the text between the ``"Style: "`` prefix (7 characters) and
    the first comma. Names are returned in file order, duplicates included.
    """
    return [
        entry[7:].split(',')[0]
        for entry in lines
        if entry.startswith('Style: ')
    ]


def select_keep_styles(all_styles, lines):
    """Ask the user which subtitle styles should be kept.

    Styles are listed most-used first together with their usage count; the
    three most-used ones are additionally shown with a few example Dialogue
    lines. The user answers with the listed numbers separated by whitespace.

    Returns ``all_styles`` unchanged when the answer is blank, otherwise the
    names of the selected styles in listing order.
    """
    ranked = [(name, count_style_occurences(lines, name)) for name in all_styles]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for position, (name, uses) in enumerate(ranked):
        print("[{}] {} ({} lines use this style)".format(position, name, uses))
        if position < 3:
            print(''.join(get_example_lines(lines, name)))

    answer = input("Which sub styles to keep? Enter numbers separated by spaces or leave blank for all: ")
    chosen = answer.split()
    if not chosen:
        return all_styles
    return [name for position, (name, _) in enumerate(ranked) if str(position) in chosen]


def extract_eng_subs(mkv, index=None):
    """Extract one embedded subtitle stream of ``mkv`` next to the video.

    The subtitle streams are listed by an external probe command. When
    ``index`` names one of them it is used directly; otherwise the only
    stream is taken, or the user is asked to pick one. The stream is written
    with ``mkvextract`` to ``<mkv name>.EXTRACTED.<codec>``.

    Args:
        mkv: path of the video file.
        index: stream index to prefer (e.g. the one chosen for the previous
            file); ignored when the file has no stream with that index.

    Returns:
        The stream index that was extracted.

    Raises:
        Exception: the file has no subtitle streams.
        SystemExit: the selected stream's codec is not one of the supported
            text formats.
    """
    print("\nExtracting english subtitles, this may take a while...\n")
    # NOTE(review): -print_format / -show_streams / -select_streams are
    # documented as ffprobe options - confirm the "ffmpeg" executable on PATH
    # really accepts them.
    mkv_json = json.loads(subprocess.check_output([
        "ffmpeg",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_streams",
        "-select_streams",
        "s",
        mkv
    ]))

    # Use the subtitle track as provided. If there is not one,
    # ask instead.
    streams = mkv_json.get("streams") or []
    all_streams = {s["index"]: s for s in streams}
    if index not in all_streams:
        index = None
    if index is None:
        if not streams:
            raise Exception("No subtitle streams to extract? Can't do any syncing. {}".format(mkv))
        elif len(streams) == 1:
            index = streams[0]["index"]
        else:
            print("[id]: Tag Information")
            for s in streams:
                tags = str(s['tags']) if 'tags' in s else 'Unknown'
                print(f"[{s['index']}]:\n{tags}\n")
            # Keep asking until the answer is one of the listed ids, so a
            # typo does not abort the whole run.
            while index is None:
                answer = input("Pick the stream to retime against: ")
                try:
                    picked = int(answer)
                except ValueError:
                    picked = None
                if picked in all_streams:
                    index = picked
                else:
                    print("Please enter one of the stream ids listed above.")

    # Extract
    stream = all_streams[index]
    codec_name = stream['codec_name']
    if codec_name == "subrip":
        # The probe reports SRT streams as "subrip"; use the usual extension.
        codec_name = "srt"
    if codec_name not in ("srt", "ssa", "ass", "idx"):
        print("\n\n")
        print("#" * 20)
        print("Embedded subtitle stream is an image based subtitle file.")
        print("These are not supported by Alass and cannot be used.")
        print("#" * 20)
        sys.exit(1)
    extracted = mkv.replace(".mkv", f".EXTRACTED.{codec_name}")
    subprocess.run(["mkvextract", "tracks", mkv, f"{index}:{extracted}"])
    return index


def fix_styling():
    """Strip unwanted styles from the extracted .ass subtitles in the cwd.

    Every file in the current directory whose name contains ``.EXTRACTED.``
    is considered. If the first of them is not an ``.ass`` file nothing is
    done. Otherwise the user picks the styles to keep (across all files) and
    each file is rewritten as UTF-8 without the Dialogue lines of the other
    styles.
    """
    extracted_subs = [f for f in os.listdir() if '.EXTRACTED.' in f]
    if not extracted_subs:
        print("No extracted subs found. Skipping style removal.")
        return
    if extracted_subs[0].split('.')[-1] != 'ass':
        print("Extracted subs are not .ASS format. Skipping style removal.")
        return

    # Read every file once, closing each handle right away.
    lines_by_file = {}
    for sub in extracted_subs:
        with mopen(sub) as ass:
            lines_by_file[sub] = ass.readlines()

    all_lines = [line for sub in extracted_subs for line in lines_by_file[sub]]
    all_styles = list(set(get_styles(all_lines)))
    keep_styles = select_keep_styles(all_styles, all_lines)

    for sub in extracted_subs:
        lines = remove_styles(lines_by_file[sub], keep_styles)
        # The lines still carry their own '\n' from readlines(), so they are
        # written as-is; newline='\r\n' turns each of those into CRLF without
        # inserting an extra empty line between entries.
        with open(sub, 'w', encoding="utf-8", newline='\r\n') as ass:
            ass.writelines(lines)


def retime_based_on_audio(mkv, srt, conf):
    """Retime ``srt`` against the audio of ``mkv`` using alass.

    alass writes to a temporary ``.ja.RETIMED.<ext>`` file; on success the
    input subtitle is deleted and the result is renamed to
    ``<mkv name>.ja.<ext>``.

    Args:
        mkv: path of the video file used as timing reference.
        srt: path of the subtitle file to retime (removed on success).
        conf: dict with ``'ext'`` (subtitle extension) and ``'split_pen'``
            (value passed to alass ``--split-penalty``).

    Raises:
        FileNotFoundError: alass did not produce the retimed file; the input
            subtitle is left in place in that case.
    """
    retimed = mkv.replace(".mkv", ".ja.RETIMED.{}".format(conf['ext']))
    subprocess.run(["alass", "--split-penalty", conf['split_pen'], mkv, srt, retimed])
    # Only discard the user's subtitle once there is a result to replace it.
    if not os.path.exists(retimed):
        raise FileNotFoundError(
            "alass did not produce {}; keeping {}".format(retimed, srt))
    os.remove(srt)
    os.rename(retimed, mkv.replace(".mkv", ".ja.{}".format(conf['ext'])))


def retime(mkv, srt, conf):
    """Retime ``srt`` against the subtitle previously extracted from ``mkv``.

    The reference is the first file in the current directory whose name
    contains ``<mkv name>.EXTRACTED.``. alass writes to a temporary
    ``.ja.RETIMED.<ext>`` file; on success the input subtitle is deleted, the
    result is renamed to ``<mkv name>.ja.<ext>`` and the extracted reference
    is removed.

    Args:
        mkv: path of the video file the reference was extracted from.
        srt: path of the subtitle file to retime (removed on success).
        conf: dict with ``'ext'`` (subtitle extension) and ``'split_pen'``
            (value passed to alass ``--split-penalty``).

    Raises:
        IndexError: no extracted reference subtitle exists for ``mkv``.
        FileNotFoundError: alass did not produce the retimed file; the input
            subtitle is left in place in that case.
    """
    retimed = mkv.replace(".mkv", ".ja.RETIMED.{}".format(conf['ext']))
    marker = mkv.replace(".mkv", ".EXTRACTED.")
    candidates = [f for f in os.listdir() if marker in f]
    if not candidates:
        raise IndexError("No extracted subtitle file found for {}".format(mkv))
    extracted = candidates[0]

    command = ["alass", "--split-penalty", conf['split_pen'], extracted, srt, retimed]
    print(command)
    subprocess.run(command)
    # Only discard the user's subtitle once there is a result to replace it.
    if not os.path.exists(retimed):
        raise FileNotFoundError(
            "alass did not produce {}; keeping {}".format(retimed, srt))
    os.remove(srt)
    os.rename(retimed, mkv.replace(".mkv", ".ja.{}".format(conf['ext'])))
    os.remove(extracted)


def fix_common_errors(srt):
    """Remove stray blank lines from the subtitle file ``srt`` in place.

    A blank line is dropped when it is followed by a line that is not purely
    numeric (presumably: not the index line of the next cue). A blank line at
    the very end of the file is always kept. The file is read with its
    detected encoding and rewritten as UTF-8.
    """
    # Close the read handle before the same path is reopened for writing.
    with mopen(srt) as subfile:
        lines = subfile.readlines()

    def _is_stray_blank(i):
        return (lines[i].strip() == ""
                and i + 1 < len(lines)
                and not lines[i + 1].strip().isdigit())

    cleaned = [x for i, x in enumerate(lines) if not _is_stray_blank(i)]
    with open(srt, 'w', encoding="utf-8") as subfile:
        subfile.write(''.join(cleaned))


def tryint(s):
    """Return ``int(s)`` when ``s`` converts to an integer, else ``s`` itself.

    Only conversion failures (``ValueError`` / ``TypeError``) are absorbed;
    anything else, such as ``KeyboardInterrupt``, propagates to the caller.
    """
    try:
        return int(s)
    except (ValueError, TypeError):
        return s


def alphanum_key(s):
    """Build a sort key that orders the digit runs in ``s`` numerically.

    ``s`` is split into alternating non-digit and digit chunks; each chunk is
    passed through ``tryint`` so that digit runs become integers.
    """
    chunks = re.split('([0-9]+)', s)
    return list(map(tryint, chunks))


if __name__ == '__main__':
    # Interactive entry point: pair every .mkv in the working directory with
    # a Japanese subtitle file (both sorted naturally), then either just
    # rename the subtitles to match ("rename_only" argument) or retime them
    # with alass against the embedded subtitles or the audio.

    # Guess the local mkv/srt pairs that need syncing
    files = os.listdir()
    CONF = {}
    print('[1] .srt files (Default)')
    print('[2] .ass files')
    # Re-prompt until the answer maps to a known extension; a blank answer
    # selects the default. This avoids crashing on a typo and avoids silently
    # picking an extension for 0 or negative numbers.
    EXT_BY_CHOICE = {1: "srt", 2: "ass"}
    while 'ext' not in CONF:
        choice = input("What file type are your Japanese subs? ").strip() or "1"
        try:
            CONF['ext'] = EXT_BY_CHOICE[int(choice)]
        except (ValueError, KeyError):
            print("Please enter 1 or 2.")
    srts = sorted([f for f in files if f.endswith(f".{CONF['ext']}")], key=alphanum_key)
    mkvs = [f for f in files if f.endswith(".mkv")]
    # Let the user filter out extra videos until there are no more videos
    # than subtitle files.
    while len(mkvs) > len(srts):
        print(
            f"Found more .mkv files than .{CONF['ext']} files. This can occur when OP/ED/extras are in the same "
            f"folder as episodes.")
        print("Please enter a pattern to filter out of mkv results. (e.g. OP)")
        pattern = input(">>> Pattern: ")
        removed = [m for m in mkvs if pattern in m]
        mkvs = [m for m in mkvs if pattern not in m]
        print("Ignoring filtered mkv files:\n{}".format("\n".join(removed)))
    mkvs = sorted(mkvs, key=alphanum_key)
    if not mkvs or len(mkvs) != len(srts):
        print(f"ERROR: Some .{CONF['ext']} files don't have a matching .mkv!")
        print(f"{len(srts)} {CONF['ext']} files found. {len(mkvs)} mkv files found")
        print(f"Ensure auto-sub-retimer folder is in same folder as your .mkv and .{CONF['ext']} files.")
        print(f"Ensure you have an equal number of .mkv and .{CONF['ext']} files!")
        input("press enter to exit...")
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is not guaranteed to exist.
        sys.exit(1)
    if len(sys.argv) > 1 and sys.argv[1] == "rename_only":
        for srt, mkv in zip(srts, mkvs):
            os.rename(srt, mkv.replace('.mkv', ".ja.{}".format(CONF['ext'])))
        input("Rename finished!")
        sys.exit(0)
    print("[1] Retime using embedded subs. (Default)")
    print("[2] Retime using audio.")
    retime_choice = input("Enter desired option number: ")
    SPLIT_PEN = input("Split Penalty? Only change this if default resulted in mis-timed subs. (Default 7): ").strip()
    if not SPLIT_PEN:
        SPLIT_PEN = "7"
    CONF['split_pen'] = SPLIT_PEN
    for srt in srts:
        fix_common_errors(srt)
    if "2" in retime_choice:
        worker = retime_based_on_audio
    else:
        # Reuse the stream index chosen for one file as the preferred index
        # for the next, so the user is normally only asked once.
        index = None
        for mkv in mkvs:
            index = extract_eng_subs(mkv, index)
        print("English Sub Extraction Complete.")
        fix_styling()
        worker = retime
    # Create the worker pool only once the interactive part is done, and let
    # the context manager shut it down after all files were processed.
    with Pool() as pool:
        pool.starmap(worker, zip(mkvs, srts, [CONF] * len(mkvs)))
    print("\nSuccess!\n")
    time.sleep(3)