import av
import copy
import dcc.doom_base
import logging
import math
import wand.image


class Concat(dcc.doom_base.Wad):
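    """Concatenate per-map videos for a WAD into one file, adding fade
    transitions, an optional map-title overlay, and a text summary of
    each map's start time."""
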
    def get_parser(self, prog_name):
        parser = super().get_parser(prog_name)
        parser.add_argument("start_map")
        parser.add_argument("end_map")
        parser.add_argument("-n", "--nooverlay", action="store_true")
        return parser

    def take_action(self, parsed_args):
        logging.basicConfig()
        av.logging.set_level(av.logging.VERBOSE)
        av.logging.restore_default_callback()
        videos = (
            self.fabricate.joinpath(parsed_args.wad)
            .glob(f"{parsed_args.wad}_map*.mp4")
        )
        fn_base = (
            f"{parsed_args.wad}_maps{parsed_args.start_map}"
            + f"to{parsed_args.end_map}"
        )
        output = av.open(
            self.fabricate.joinpath(parsed_args.wad).joinpath(
                f"{fn_base}.mp4"), "w"
        )
        summary_file = open(
            self.fabricate.joinpath(parsed_args.wad).joinpath(
                f"{fn_base}.txt"), "w"
        )

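        # Running position in the final video, in microseconds (the units
        # av uses for container durations).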
        self._offset = 0
        summary = []
        # We'd like to use the concat filter here and connect everything into
        # a single filter graph... but it produces a "Resource temporarily
        # unavailable" error when switching to inputs after the first.
        # Presumably fixable, but it's easier to just make one graph per
        # video and mux everything together at the end.
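        # Chunks are named <wad>_mapNN.mp4 with NN zero-padded, so the
        # lexicographic comparison below selects the requested map range.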
        for v in sorted(videos):
            # TODO: Support UDoom in literally any way.
            if not (
                v.name >= f"{parsed_args.wad}_map{parsed_args.start_map}.mp4"
                and v.name <= f"{parsed_args.wad}_map{parsed_args.end_map}.mp4"
            ):
                continue
            start_time = self._offset / 1000000
            text = self._add_chunk(v, output, not parsed_args.nooverlay)
            summary.append(
                f"{text} {math.floor(start_time / 60):02}:"
                + f"{math.floor(start_time % 60):02}"
            )
        output.close()

        for line in summary:
            summary_file.write(f"{line}\n")
        summary_file.close()

    def _add_chunk(self, v, output, overlay):
        """Fade, optionally title-overlay, and re-encode one chunk into
        output; returns the chunk's map name ("" when not overlaying)."""
        chunk = av.open(v)
        if not (len(chunk.streams.video) == 1
                and len(chunk.streams.audio) == 1):
            raise ValueError(
                f"irregular chunk {v}: streams {chunk.streams} "
                "(expected 1 video & 1 audio)"
            )

        ograph = av.filter.Graph()
        sink = ograph.add("buffersink")
        asink = ograph.add("abuffersink")

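        # When overlaying, render "MAPxx: <name>" as an RGBA image with
        # Wand; it enters the graph as a second source and is composited
        # on top of the video.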
        text = ""
        if overlay:
            img = wand.image.Image(
                height=chunk.streams.video[0].height,
                width=chunk.streams.video[0].width
            )
            # e.g. "foo_map07.mp4" -> "07"
            mapstring = v.name[-6:-4]
            text = self._config["map_names"][f"map{mapstring}"]
            self.draw_text(
                img,
                f"MAP{mapstring}: {text}",
                font_size=120
            )
            img.trim(reset_coords=True)
            img.border("graya(25%, 25%)", 10, 10)
            img.border(self.thumbnail_text_stroke, 16, 16)
            # For reasons unknown, the overlay only works if the image width
            # is a multiple of 8, so pad it out and crop back down to the
            # nearest multiple.
            padfactor = 8
            img.border("transparent", padfactor, 0)
            img.crop(
                width=img.width - img.width % padfactor,
                height=img.height
            )

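        # First chunk only: create the output streams, copying encoding
        # parameters from the input. Later chunks are muxed into these same
        # streams, so all inputs are assumed to share these parameters.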
        if len(output.streams.get()) == 0:
            # We can't use the input stream as a template here; it doesn't
            # have everything needed to do encoding and will fail
            # mysteriously later.
            vs = chunk.streams.video[0]
            vr = int(vs.time_base.denominator / vs.time_base.numerator)
            ovs = output.add_stream("h264", rate=vr)
            ovs.extradata = copy.deepcopy(vs.extradata)
            ovs.height = vs.height
            ovs.width = vs.width
            ovs.qmax = vs.qmax
            ovs.qmin = vs.qmin
            ovs.codec_context.bit_rate = vs.codec_context.bit_rate
            ovs.codec_context.framerate = vs.base_rate
            ovs.codec_context.pix_fmt = vs.codec_context.pix_fmt
            # The following are only used for encoding and have no equivalent
            # on the input stream.
            ovs.profile = "High"
            ovs.codec_context.gop_size = 30
            ovs.codec_context.max_b_frames = 2

            astr = chunk.streams.audio[0]
            oas = output.add_stream("aac", rate=astr.rate)
            oas.extradata = copy.deepcopy(astr.extradata)
            oas.bit_rate = astr.bit_rate

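        # Per-chunk filter sources, templated on this chunk's own streams.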
        src = ograph.add_buffer(
            template=chunk.streams.video[0],
            time_base=chunk.streams.video[0].time_base
        )
        asrc = ograph.add_abuffer(
            template=chunk.streams.audio[0],
            time_base=chunk.streams.audio[0].time_base
        )
        # TODO: video fade starts are relative to the input video, while
        # audio fade starts must be offset by our position in the final
        # video. Clarify whether this asymmetry is really necessary.
        frame_rate = chunk.streams.video[0].base_rate
        sample_rate = chunk.streams.audio[0].rate
        ifade = ograph.add("fade", args=f"in:0:{frame_rate}")
        ofade_start = (chunk.duration * frame_rate / 1000000) - frame_rate
        ofade = ograph.add("fade", args=f"out:{ofade_start}:{frame_rate}")
        iafade_start = self._offset * sample_rate / 1000000
        iafade = ograph.add("afade", args=f"in:{iafade_start}:{sample_rate}")
        oafade_start = (
            (self._offset + chunk.duration) * sample_rate / 1000000
            - sample_rate
        )
        oafade = ograph.add("afade", args=f"out:{oafade_start}:{sample_rate}")

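        # Wiring: src [-> overlay composite] -> fade in -> fade out -> sink,
        # and asrc -> afade in -> afade out -> asink. All fades last one
        # second.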
        if overlay:
            overlay_src = ograph.add_buffer(
                width=img.width, height=img.height,
                format="rgba", time_base=chunk.streams.video[0].time_base
            )
            # Fade the title card out after four seconds on screen.
            overlay_fo = ograph.add(
                "fade", args=f"out:{4 * frame_rate}:{frame_rate}"
            )
            overlay_src.link_to(overlay_fo, 0, 0)
            composite = ograph.add("overlay", args="x=4:y=4")
            src.link_to(composite, 0, 0)
            overlay_fo.link_to(composite, 0, 1)
            composite.link_to(ifade, 0, 0)
        else:
            src.link_to(ifade, 0, 0)

        asrc.link_to(iafade, 0, 0)
        ifade.link_to(ofade, 0, 0)
        iafade.link_to(oafade, 0, 0)
        ofade.link_to(sink, 0, 0)
        oafade.link_to(asink, 0, 0)
        ograph.configure()

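        # Demux, shift timestamps from chunk-local time to final-video time,
        # push frames through the graph, and re-encode into the output.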
        for packet in chunk.demux():
            if packet.dts is None:
                continue
            # self._offset is in microseconds; convert it to this packet's
            # time base.
            pof = (
                (self._offset * packet.time_base.denominator)
                / (packet.time_base.numerator * 1000000)
            )
            packet.dts += pof
            packet.pts += pof
            if packet.stream == chunk.streams.video[0]:
                for ifr in packet.decode():
                    if overlay:
                        overlay_src.push(self._make_text_frame(img, ifr))
                    src.push(ifr)
                    ofr = sink.pull()
                    for p in output.streams[packet.stream_index].encode(ofr):
                        output.mux(p)
            else:
                for ifr in packet.decode():
                    asrc.push(ifr)
                    ofr = asink.pull()
                    for p in output.streams[packet.stream_index].encode(ofr):
                        output.mux(p)
        self._offset += chunk.duration
        chunk.close()

        return text

    def _make_text_frame(self, img, ifr):
        # Each overlay frame needs its own copy of the image data, stamped
        # with the timing of the video frame it will be composited onto.
        text_frame = av.video.frame.VideoFrame(
            img.width, img.height, format="rgba"
        )
        text_frame.planes[0].update(img.make_blob(format="rgba"))
        text_frame.pts = ifr.pts
        text_frame.dts = ifr.dts
        text_frame.time_base = ifr.time_base
        return text_frame