Do audio/video pts sync in a hopefully much more sensible way.

[dcpomatic.git] / src / lib / ffmpeg_decoder.cc
diff --git a/src/lib/ffmpeg_decoder.cc b/src/lib/ffmpeg_decoder.cc

index e014051918235c87fa9a10e5848dfe036129bacf..c74fee008458a53914daaf38ea4244f4166d5d31 100644 (file)
--- a/src/lib/ffmpeg_decoder.cc
+++ b/src/lib/ffmpeg_decoder.cc
@@ -66,6 +66,8 @@ FFmpegDecoder::FFmpegDecoder (boost::shared_ptr<const FilmState> s, boost::share
         , _audio_codec (0)
         , _subtitle_codec_context (0)
         , _subtitle_codec (0)
+       , _first_video_pts (-1)
+       , _first_audio_pts (-1)
  {
         setup_general ();
         setup_video ();
@@ -106,16 +108,35 @@ FFmpegDecoder::setup_general ()
                 throw DecodeError ("could not find stream information");
         }
  
+       /* Find video, audio and subtitle streams and choose the first of each */
+
         for (uint32_t i = 0; i < _format_context->nb_streams; ++i) {
-               if (_format_context->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO) {
+               AVStream* s = _format_context->streams[i];
+               if (s->codec->codec_type == AVMEDIA_TYPE_VIDEO) {
                         _video_stream = i;
-               } else if (_format_context->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO) {
-                       _audio_stream = i;
-               } else if (_format_context->streams[i]->codec->codec_type == AVMEDIA_TYPE_SUBTITLE) {
-                       _subtitle_stream = i;
+               } else if (s->codec->codec_type == AVMEDIA_TYPE_AUDIO) {
+                       if (_audio_stream == -1) {
+                               _audio_stream = i;
+                       }
+                       _audio_streams.push_back (AudioStream (stream_name (s), i, s->codec->channels));
+               } else if (s->codec->codec_type == AVMEDIA_TYPE_SUBTITLE) {
+                       if (_subtitle_stream == -1) {
+                               _subtitle_stream = i;
+                       }
+                       _subtitle_streams.push_back (SubtitleStream (stream_name (s), i));
                 }
         }
  
+       /* Now override audio and subtitle streams with those from the Film, if it has any */
+
+       if (_fs->audio_stream_index() != -1) {
+               _audio_stream = _fs->audio_stream().id();
+       }
+
+       if (_fs->subtitle_stream_index() != -1) {
+               _subtitle_stream = _fs->subtitle_stream().id ();
+       }
+
         if (_video_stream < 0) {
                 throw DecodeError ("could not find video stream");
         }
@@ -202,6 +223,8 @@ FFmpegDecoder::do_pass ()
                 _packet.data = 0;
                 _packet.size = 0;
  
+               /* XXX: should we reset _packet.data and size after each *_decode_* call? */
+
                 int frame_finished;
  
                 while (avcodec_decode_video2 (_video_codec_context, _frame, &frame_finished, &_packet) >= 0 && frame_finished) {
@@ -213,8 +236,8 @@ FFmpegDecoder::do_pass ()
                                 int const data_size = av_samples_get_buffer_size (
                                         0, _audio_codec_context->channels, _frame->nb_samples, audio_sample_format (), 1
                                         );
-                               
-                               assert (_audio_codec_context->channels == _fs->audio_channels);
+
+                               assert (_audio_codec_context->channels == _fs->audio_channels());
                                 process_audio (_frame->data[0], data_size);
                         }
                 }
@@ -224,13 +247,56 @@ FFmpegDecoder::do_pass ()
  
         if (_packet.stream_index == _video_stream) {
  
+               if (_first_video_pts == -1) {
+                       _first_video_pts = _packet.pts;
+               }
+               
                 int frame_finished;
                 if (avcodec_decode_video2 (_video_codec_context, _frame, &frame_finished, &_packet) >= 0 && frame_finished) {
                         process_video (_frame);
                 }
  
-       } else if (_audio_stream >= 0 && _packet.stream_index == _audio_stream && _opt->decode_audio) {
-               
+       } else if (_audio_stream >= 0 && _packet.stream_index == _audio_stream && _opt->decode_audio && (_first_video_pts != -1 && _packet.pts > _first_video_pts)) {
+
+               /* Note: We only decode audio if we've had our first video packet through, and if this
+                  packet comes after it.  Until then it is thrown away.
+               */
+
+               if (_first_audio_pts == -1) {
+                       _first_audio_pts = _packet.pts;
+
+                       /* This is our first audio packet, and if we've arrived here we must have had our
+                          first video packet.  Push some silence to make up the gap between our first
+                          video packet and our first audio.
+                       */
+                       
+                       AVStream* v = _format_context->streams[_video_stream];
+                       AVStream* a = _format_context->streams[_audio_stream];
+                       
+                       assert (v->time_base.num == a->time_base.num);
+                       assert (v->time_base.den == a->time_base.den);
+
+                       /* samples of silence that we must push */
+                       int const s = rint (av_q2d (v->time_base) * (_first_audio_pts - _first_video_pts) * audio_sample_rate ());
+
+                       _log->log (
+                               String::compose (
+                                       "First video at %1, first audio at %2, pushing %3 samples of silence",
+                                       _first_video_pts, _first_audio_pts, s
+                                       )
+                               );
+
+                       /* hence bytes */
+                       int const b = s * audio_channels() * bytes_per_audio_sample();
+
+                       /* XXX: this assumes that it won't be too much, and there are shaky assumptions
+                          that all sound representations are silent with memset()ed zero data.
+                       */
+                       uint8_t silence[b];
+                       memset (silence, 0, b);
+                       process_audio (silence, b);
+               }
+
                 avcodec_get_frame_defaults (_frame);
                 
                 int frame_finished;
@@ -239,7 +305,7 @@ FFmpegDecoder::do_pass ()
                                 0, _audio_codec_context->channels, _frame->nb_samples, audio_sample_format (), 1
                                 );
  
-                       assert (_audio_codec_context->channels == _fs->audio_channels);
+                       assert (_audio_codec_context->channels == _fs->audio_channels());
                         process_audio (_frame->data[0], data_size);
                 }
  
@@ -248,7 +314,12 @@ FFmpegDecoder::do_pass ()
                 int got_subtitle;
                 AVSubtitle sub;
                 if (avcodec_decode_subtitle2 (_subtitle_codec_context, &sub, &got_subtitle, &_packet) && got_subtitle) {
-                       process_subtitle (shared_ptr<Subtitle> (new Subtitle (sub)));
+                       /* I'm not entirely sure why, but sometimes we get an AVSubtitle with
+                          no AVSubtitleRects.
+                       */
+                       if (sub.num_rects > 0) {
+                               process_subtitle (shared_ptr<TimedSubtitle> (new TimedSubtitle (sub)));
+                       }
                         avsubtitle_free (&sub);
                 }
         }
@@ -356,3 +427,41 @@ FFmpegDecoder::has_subtitles () const
  {
         return (_subtitle_stream != -1);
  }
+
+/* Return a copy of the list of audio streams; setup_general() adds one
+   entry for each audio stream that it finds in the format context.
+*/
+vector<AudioStream>
+FFmpegDecoder::audio_streams () const
+{
+       return this->_audio_streams;
+}
+
+/* Return a copy of the list of subtitle streams; setup_general() adds one
+   entry for each subtitle stream that it finds in the format context.
+*/
+vector<SubtitleStream>
+FFmpegDecoder::subtitle_streams () const
+{
+       return this->_subtitle_streams;
+}
+
+/* Build a display name for a stream from its metadata dictionary.
+
+   The name is the "language" entry followed by the "title" entry, with a
+   single space between them when both are present.  If neither entry
+   yields any text the name is "unknown".
+*/
+string
+FFmpegDecoder::stream_name (AVStream* s) const
+{
+       stringstream label;
+
+       /* The two lookups are independent reads of the same dictionary */
+       AVDictionaryEntry const * const language = av_dict_get (s->metadata, "language", 0, 0);
+       AVDictionaryEntry const * const title = av_dict_get (s->metadata, "title", 0, 0);
+
+       if (language) {
+               label << language->value;
+       }
+
+       if (title) {
+               /* Only separate with a space if the language has already been written */
+               bool const need_separator = !label.str().empty ();
+               if (need_separator) {
+                       label << " ";
+               }
+               label << title->value;
+       }
+
+       /* Nothing useful in the metadata: fall back to a placeholder */
+       if (label.str().empty ()) {
+               label << "unknown";
+       }
+
+       return label.str ();
+}
+