RtpH263Reader.java

/*
 * Copyright 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package androidx.media3.exoplayer.rtsp.reader;

import static androidx.media3.common.util.Assertions.checkNotNull;
import static androidx.media3.common.util.Assertions.checkState;
import static androidx.media3.common.util.Assertions.checkStateNotNull;
import static androidx.media3.exoplayer.rtsp.reader.RtpReaderUtils.toSampleTimeUs;

import androidx.media3.common.C;
import androidx.media3.common.util.Log;
import androidx.media3.common.util.ParsableByteArray;
import androidx.media3.common.util.Util;
import androidx.media3.exoplayer.rtsp.RtpPacket;
import androidx.media3.exoplayer.rtsp.RtpPayloadFormat;
import androidx.media3.extractor.ExtractorOutput;
import androidx.media3.extractor.TrackOutput;
import org.checkerframework.checker.nullness.qual.MonotonicNonNull;

/**
 * Parses a H263 byte stream carried on RTP packets, and extracts H263 frames as defined in RFC4629.
 *
 * <p>Only packets whose payload header has V, PLEN and PEBIT all set to zero are consumed; any
 * other packet is logged and dropped. A frame starts with a packet whose P bit is set and is
 * completed either by a packet carrying the RTP marker or by the arrival of the next P-bit packet.
 */
/* package */ final class RtpH263Reader implements RtpPayloadReader {
  private static final String TAG = "RtpH263Reader";

  private static final int MEDIA_CLOCK_FREQUENCY = 90_000;

  /** I-frame VOP unit type. */
  private static final int I_VOP = 0;

  /** Picture start code, P=1, V=0, PLEN=0. Refer to RFC4629 Section 6.1. */
  private static final int PICTURE_START_CODE = 128;

  /** Mask of the P bit in the 16-bit payload header. Refer to RFC4629 Section 5.1. */
  private static final int PAYLOAD_HEADER_P_BIT_MASK = 0x400;

  /**
   * Mask of the V (0x200), PLEN (0x1F8) and PEBIT (0x7) fields in the 16-bit payload header. None
   * of these fields is supported by this reader.
   */
  private static final int PAYLOAD_HEADER_UNSUPPORTED_FIELDS_MASK = 0x3FF;

  /**
   * Number of bytes {@link #parseVopHeader} inspects: the 4 bytes holding the start marker plus the
   * following byte holding the source format and picture coding type.
   */
  private static final int PICTURE_HEADER_PREFIX_SIZE_BYTES = 5;

  /** Source format value for which the resolution is fixed to 128x96. */
  private static final int SOURCE_FORMAT_SUB_QCIF = 1;

  /** Smallest source format value whose resolution is derived by shifting the 176x144 base. */
  private static final int SOURCE_FORMAT_QCIF = 2;

  /** Largest source format value whose resolution is derived by shifting the 176x144 base. */
  private static final int SOURCE_FORMAT_16CIF = 5;

  private final RtpPayloadFormat payloadFormat;

  private @MonotonicNonNull TrackOutput trackOutput;

  /**
   * First received RTP timestamp. All RTP timestamps are dimension-less, the time base is defined
   * by {@link #MEDIA_CLOCK_FREQUENCY}.
   */
  private long firstReceivedTimestamp;

  /** The combined size of a sample that is fragmented into multiple RTP packets. */
  private int fragmentedSampleSizeBytes;

  /** Sequence number of the last packet whose payload was written to the track output. */
  private int previousSequenceNumber;

  /** Picture width, as last derived by {@link #parseVopHeader}. */
  private int width;

  /** Picture height, as last derived by {@link #parseVopHeader}. */
  private int height;

  /** Whether the sample currently being assembled was identified as an I-frame. */
  private boolean isKeyFrame;

  /** Whether the resolution of a key frame has already been compared against the track format. */
  private boolean isOutputFormatSet;

  /** Sample time offset in microseconds, updated by {@link #seek}. */
  private long startTimeOffsetUs;

  /** Sample time, in microseconds, computed for the most recently consumed packet. */
  private long fragmentedSampleTimeUs;

  /**
   * Whether the first packet of a H263 frame is received, it marks the start of a H263 partition. A
   * H263 frame can be split into multiple RTP packets.
   */
  private boolean gotFirstPacketOfH263Frame;

  /** Creates an instance. */
  public RtpH263Reader(RtpPayloadFormat payloadFormat) {
    this.payloadFormat = payloadFormat;
    firstReceivedTimestamp = C.TIME_UNSET;
    previousSequenceNumber = C.INDEX_UNSET;
  }

  @Override
  public void createTracks(ExtractorOutput extractorOutput, int trackId) {
    trackOutput = extractorOutput.track(trackId, C.TRACK_TYPE_VIDEO);
    trackOutput.format(payloadFormat.format);
  }

  @Override
  public void onReceivingFirstPacket(long timestamp, int sequenceNumber) {
    checkState(firstReceivedTimestamp == C.TIME_UNSET);
    firstReceivedTimestamp = timestamp;
  }

  @Override
  public void consume(
      ParsableByteArray data, long timestamp, int sequenceNumber, boolean rtpMarker) {
    checkStateNotNull(trackOutput);

    // H263 Header Payload Header, RFC4629 Section 5.1.
    //    0                   1
    //    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
    //    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    //    |   RR    |P|V|   PLEN    |PEBIT|
    //    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    int currentPosition = data.getPosition();
    int header = data.readUnsignedShort();
    boolean pBitIsSet = (header & PAYLOAD_HEADER_P_BIT_MASK) > 0;

    // Check if optional V (Video Redundancy Coding), PLEN or PEBIT is present, RFC4629 Section 5.1.
    if ((header & PAYLOAD_HEADER_UNSUPPORTED_FIELDS_MASK) != 0) {
      Log.w(
          TAG,
          "Dropping packet: video reduncancy coding is not supported, packet header VRC, or PLEN or"
              + " PEBIT is non-zero");
      return;
    }

    if (pBitIsSet) {
      if (gotFirstPacketOfH263Frame && fragmentedSampleSizeBytes > 0) {
        // Received new H263 fragment, output data of previous fragment to decoder.
        outputSampleMetadataForFragmentedPackets();
      }

      // Packets that begin with a Picture Start Code(100000). Refer RFC4629 Section 6.1.
      // NOTE(review): the comparison only rejects a first octet whose top bit is clear, so other
      // start codes with the top bit set are accepted as well - confirm this is intended.
      if (data.bytesLeft() == 0
          || (data.peekUnsignedByte() & 0xFC) < PICTURE_START_CODE) {
        Log.w(TAG, "Picture start Code (PSC) missing, dropping packet.");
        // The start of this frame was rejected, so its continuation packets must be rejected too.
        gotFirstPacketOfH263Frame = false;
        return;
      }
      gotFirstPacketOfH263Frame = true;

      // Setting first two bytes of the start code. Refer RFC4629 Section 6.1.1.
      // The two payload header bytes are overwritten in place and the position is rewound so that
      // they are emitted as part of the sample.
      byte[] packetBytes = data.getData();
      packetBytes[currentPosition] = 0;
      packetBytes[currentPosition + 1] = 0;
      data.setPosition(currentPosition);
    } else if (gotFirstPacketOfH263Frame) {
      // Check that this packet is in the sequence of the previous packet.
      int expectedSequenceNumber = RtpPacket.getNextSequenceNumber(previousSequenceNumber);
      if (sequenceNumber < expectedSequenceNumber) {
        Log.w(
            TAG,
            Util.formatInvariant(
                "Received RTP packet with unexpected sequence number. Expected: %d; received: %d."
                    + " Dropping packet.",
                expectedSequenceNumber, sequenceNumber));
        return;
      }
    } else {
      Log.w(
          TAG,
          "First payload octet of the H263 packet is not the beginning of a new H263 partition,"
              + " Dropping current packet.");
      return;
    }

    if (fragmentedSampleSizeBytes == 0) {
      // First packet of a sample: detect the frame type and, once, the resolution.
      parseVopHeader(data, isOutputFormatSet);
      if (!isOutputFormatSet && isKeyFrame) {
        if (width != payloadFormat.format.width || height != payloadFormat.format.height) {
          trackOutput.format(
              payloadFormat.format.buildUpon().setWidth(width).setHeight(height).build());
        }
        isOutputFormatSet = true;
      }
    }
    int fragmentSize = data.bytesLeft();
    // Write the video sample.
    trackOutput.sampleData(data, fragmentSize);
    fragmentedSampleSizeBytes += fragmentSize;
    fragmentedSampleTimeUs =
        toSampleTimeUs(startTimeOffsetUs, timestamp, firstReceivedTimestamp, MEDIA_CLOCK_FREQUENCY);

    if (rtpMarker) {
      outputSampleMetadataForFragmentedPackets();
    }
    previousSequenceNumber = sequenceNumber;
  }

  @Override
  public void seek(long nextRtpTimestamp, long timeUs) {
    firstReceivedTimestamp = nextRtpTimestamp;
    fragmentedSampleSizeBytes = 0;
    startTimeOffsetUs = timeUs;
    // Forget any partially received frame, so that continuation packets arriving after the seek
    // are dropped until the start of a new frame is seen.
    gotFirstPacketOfH263Frame = false;
    fragmentedSampleTimeUs = C.TIME_UNSET;
    isKeyFrame = false;
  }

  /**
   * Parses and set VOP Coding type and resolution. The {@linkplain ParsableByteArray#getPosition()
   * position} is preserved.
   *
   * <p>Sets {@link #isKeyFrame} to whether an I-frame header was found. When {@code gotResolution}
   * is {@code false} and an I-frame is found, {@link #width} and {@link #height} are updated too.
   * If fewer than {@link #PICTURE_HEADER_PREFIX_SIZE_BYTES} bytes are left, or the start marker is
   * absent, the frame is treated as a non-key frame.
   *
   * @param data The packet data, positioned at the first byte of the start code.
   * @param gotResolution Whether the resolution is already known and need not be parsed again.
   */
  private void parseVopHeader(ParsableByteArray data, boolean gotResolution) {
    isKeyFrame = false;
    if (data.bytesLeft() < PICTURE_HEADER_PREFIX_SIZE_BYTES) {
      // Too short to hold the inspected header prefix; do not read past the end of the packet.
      return;
    }

    // Picture Segment Packets (RFC4629 Section 6.1).
    // Search for SHORT_VIDEO_START_MARKER (0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0).
    int currentPosition = data.getPosition();

    /*
     * Parse short video header.
     *
     * These values are taken from <a
     * href=https://cs.android.com/android/platform/superproject/+/master:frameworks/av/media/codecs/m4v_h263/dec/src/mp4def.h;l=115
     * >Android's software H263 decoder</a>.
     */
    long shortVideoHeader = data.readUnsignedInt();
    if (((shortVideoHeader >> 10) & 0x3F) == 0x20) {
      int header = data.peekUnsignedByte();
      int vopType = (header >> 1) & 0x1;
      isKeyFrame = vopType == I_VOP;
      if (!gotResolution && isKeyFrame) {
        /*
         * Parse resolution from source format.
         *
         * These values are taken from <a
         * href=https://cs.android.com/android/platform/superproject/+/master:frameworks/av/media/codecs/m4v_h263/dec/src/vop.cpp;l=1126
         * >Android's software H263 decoder</a>.
         */
        int sourceFormat = (header >> 2) & 0x07;
        if (sourceFormat == SOURCE_FORMAT_SUB_QCIF) {
          width = 128;
          height = 96;
        } else if (sourceFormat >= SOURCE_FORMAT_QCIF && sourceFormat <= SOURCE_FORMAT_16CIF) {
          int scaleShift = sourceFormat - SOURCE_FORMAT_QCIF;
          width = 176 << scaleShift;
          height = 144 << scaleShift;
        } else {
          // Any other value would produce a negative or out-of-range shift and hence a bogus
          // resolution, so keep the dimensions already declared by the payload format.
          Log.w(
              TAG,
              "Unsupported H263 source format "
                  + sourceFormat
                  + ", keeping the resolution of the payload format.");
          width = payloadFormat.format.width;
          height = payloadFormat.format.height;
        }
      }
    }
    data.setPosition(currentPosition);
  }

  /**
   * Outputs sample metadata of the received fragmented packets.
   *
   * <p>Call this method only after receiving an end of a H263 partition.
   *
   * <p>Resets the per-sample state afterwards, so that the next sample has to start with a packet
   * that marks the beginning of a frame.
   */
  private void outputSampleMetadataForFragmentedPackets() {
    checkNotNull(trackOutput)
        .sampleMetadata(
            fragmentedSampleTimeUs,
            isKeyFrame ? C.BUFFER_FLAG_KEY_FRAME : 0,
            fragmentedSampleSizeBytes,
            /* offset= */ 0,
            /* cryptoData= */ null);
    fragmentedSampleSizeBytes = 0;
    fragmentedSampleTimeUs = C.TIME_UNSET;
    isKeyFrame = false;
    gotFirstPacketOfH263Frame = false;
  }
}