src/remux/mp4-remuxer.ts

import AAC from './aac-helper';
import MP4 from './mp4-generator';
import type { HlsEventEmitter } from '../events';
import { Events } from '../events';
import { ErrorTypes, ErrorDetails } from '../errors';
import { logger } from '../utils/logger';
import {
  InitSegmentData,
  Remuxer,
  RemuxerResult,
  RemuxedMetadata,
  RemuxedTrack,
  RemuxedUserdata,
} from '../types/remuxer';
import { PlaylistLevelType } from '../types/loader';
import { toMsFromMpegTsClock } from '../utils/timescale-conversion';
import type {
  AudioSample,
  AvcSample,
  DemuxedAudioTrack,
  DemuxedAvcTrack,
  DemuxedMetadataTrack,
  DemuxedUserdataTrack,
} from '../types/demuxer';
import type { TrackSet } from '../types/track';
import type { SourceBufferName } from '../types/buffer';
import type { Fragment } from '../loader/fragment';
import type { HlsConfig } from '../config';

const MAX_SILENT_FRAME_DURATION = 10 * 1000; // 10 seconds
const AAC_SAMPLES_PER_FRAME = 1024;
const MPEG_AUDIO_SAMPLE_PER_FRAME = 1152;

let chromeVersion: number | null = null;
let safariWebkitVersion: number | null = null;

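/**
 * MP4Remuxer converts demuxed elementary-stream samples (MPEG-TS timestamps,
 * 90 kHz input clock) into fragmented MP4 (moof + mdat) segments suitable for
 * appending to MSE SourceBuffers.
 */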
export default class MP4Remuxer implements Remuxer {
  private observer: HlsEventEmitter;
  private config: HlsConfig;
  private typeSupported: any;
  private ISGenerated: boolean = false;
  private _initPTS!: number;
  private _initDTS!: number;
  private nextAvcDts: number | null = null;
  private nextAudioPts: number | null = null;
  private videoSampleDuration: number | null = null;
  private isAudioContiguous: boolean = false;
  private isVideoContiguous: boolean = false;

  constructor(
    observer: HlsEventEmitter,
    config: HlsConfig,
    typeSupported,
    vendor = ''
  ) {
    this.observer = observer;
    this.config = config;
    this.typeSupported = typeSupported;
    this.ISGenerated = false;

    if (chromeVersion === null) {
      const userAgent = navigator.userAgent || '';
      const result = userAgent.match(/Chrome\/(\d+)/i);
      chromeVersion = result ? parseInt(result[1]) : 0;
    }
    if (safariWebkitVersion === null) {
      const result = navigator.userAgent.match(/Safari\/(\d+)/i);
      safariWebkitVersion = result ? parseInt(result[1]) : 0;
    }
  }

  destroy() {}

  resetTimeStamp(defaultTimeStamp) {
    logger.log('[mp4-remuxer]: initPTS & initDTS reset');
    this._initPTS = this._initDTS = defaultTimeStamp;
  }

  resetNextTimestamp() {
    logger.log('[mp4-remuxer]: reset next timestamp');
    this.isVideoContiguous = false;
    this.isAudioContiguous = false;
  }

  resetInitSegment() {
    logger.log('[mp4-remuxer]: ISGenerated flag reset');
    this.ISGenerated = false;
  }

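  /**
   * Returns the earliest video PTS, compensating for a 33-bit PTS rollover
   * within the sample run. Illustrative (hypothetical values): with samples
   * at pts 8589930000 (pre-wrap) and 1000 (post-wrap), the reduce sees a
   * delta below -2^32 and normalizes the pre-wrap value to -4592, so the
   * minimum is taken on one continuous timeline instead of jumping to 1000.
   */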
  getVideoStartPts(videoSamples) {
    let rolloverDetected = false;
    const startPTS = videoSamples.reduce((minPTS, sample) => {
      const delta = sample.pts - minPTS;
      if (delta < -4294967296) {
        // 2^32; see normalizePts for the reasoning. We're hitting a rollover
        // here, and we don't want it to impact the timeOffset calculation.
        rolloverDetected = true;
        return normalizePts(minPTS, sample.pts);
      } else if (delta > 0) {
        return minPTS;
      } else {
        return sample.pts;
      }
    }, videoSamples[0].pts);
    if (rolloverDetected) {
      logger.debug('PTS rollover detected');
    }
    return startPTS;
  }

  remux(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    id3Track: DemuxedMetadataTrack,
    textTrack: DemuxedUserdataTrack,
    timeOffset: number,
    accurateTimeOffset: boolean,
    flush: boolean,
    playlistType: PlaylistLevelType
  ): RemuxerResult {
    let video: RemuxedTrack | undefined;
    let audio: RemuxedTrack | undefined;
    let initSegment: InitSegmentData | undefined;
    let text: RemuxedUserdata | undefined;
    let id3: RemuxedMetadata | undefined;
    let independent: boolean | undefined;
    let audioTimeOffset = timeOffset;
    let videoTimeOffset = timeOffset;

    // If we're remuxing audio and video progressively, wait until we've
    // received enough samples for each track before proceeding. This is done
    // to synchronize the audio and video streams. We know the current segment
    // will have samples for a track if its "pid" is greater than -1 (the pid
    // is set when the PMT, which contains the track list, is parsed).
    // However, if the init segment has already been generated, or we've
    // reached the end of a segment (flush), then we can remux one track
    // without waiting for the other.
    const hasAudio = audioTrack.pid > -1;
    const hasVideo = videoTrack.pid > -1;
    const length = videoTrack.samples.length;
    const enoughAudioSamples = audioTrack.samples.length > 0;
    const enoughVideoSamples = (flush && length > 0) || length > 1;
    const canRemuxAvc =
      ((!hasAudio || enoughAudioSamples) &&
        (!hasVideo || enoughVideoSamples)) ||
      this.ISGenerated ||
      flush;

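    // For example (hypothetical): with hasAudio and hasVideo both true, one
    // audio sample and a single non-flush video sample, canRemuxAvc is false
    // and we wait for more data; once this.ISGenerated is true, a single
    // track can be remuxed on its own.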
    if (canRemuxAvc) {
      if (!this.ISGenerated) {
        initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
      }

      const isVideoContiguous = this.isVideoContiguous;
      let firstKeyFrameIndex = -1;
      let firstKeyFramePTS;

      if (enoughVideoSamples) {
        firstKeyFrameIndex = findKeyframeIndex(videoTrack.samples);
        if (!isVideoContiguous && this.config.forceKeyFrameOnDiscontinuity) {
          independent = true;
          if (firstKeyFrameIndex > 0) {
            logger.warn(
              `[mp4-remuxer]: Dropped ${firstKeyFrameIndex} out of ${length} video samples due to a missing keyframe`
            );
            const startPTS = this.getVideoStartPts(videoTrack.samples);
            videoTrack.samples = videoTrack.samples.slice(firstKeyFrameIndex);
            videoTrack.dropped += firstKeyFrameIndex;
            videoTimeOffset +=
              (videoTrack.samples[0].pts - startPTS) /
              videoTrack.inputTimeScale;
            firstKeyFramePTS = videoTimeOffset;
          } else if (firstKeyFrameIndex === -1) {
            logger.warn(
              `[mp4-remuxer]: No keyframe found out of ${length} video samples`
            );
            independent = false;
          }
        }
      }

      if (this.ISGenerated) {
        if (enoughAudioSamples && enoughVideoSamples) {
          // timeOffset is expected to be the offset of the first timestamp of
          // this fragment (first DTS). If the first audio DTS is not aligned
          // with the first video DTS, we need to take that into account when
          // providing timeOffset to remuxAudio / remuxVideo; otherwise there
          // may be a small permanent drift between the audio and video
          // streams.
          const startPTS = this.getVideoStartPts(videoTrack.samples);
          const tsDelta =
            normalizePts(audioTrack.samples[0].pts, startPTS) - startPTS;
          const audiovideoTimestampDelta = tsDelta / videoTrack.inputTimeScale;
          audioTimeOffset += Math.max(0, audiovideoTimestampDelta);
          videoTimeOffset += Math.max(0, -audiovideoTimestampDelta);
        }
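        // Illustrative (hypothetical 90 kHz values): audio starting at pts
        // 903600 with video starting at 900000 gives tsDelta = 3600, i.e.
        // 0.04 s; audioTimeOffset is advanced by 0.04 s and videoTimeOffset
        // is left unchanged, keeping the two streams aligned.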

        // Purposefully remuxing audio before video, so that remuxVideo can
        // use nextAudioPts, which is calculated in remuxAudio.
        if (enoughAudioSamples) {
          // if the init segment was generated without audio samples, regenerate it
          if (!audioTrack.samplerate) {
            logger.warn(
              '[mp4-remuxer]: regenerate InitSegment as audio detected'
            );
            initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
          }
          audio = this.remuxAudio(
            audioTrack,
            audioTimeOffset,
            this.isAudioContiguous,
            accurateTimeOffset,
            hasVideo ||
              enoughVideoSamples ||
              playlistType === PlaylistLevelType.AUDIO
              ? videoTimeOffset
              : undefined
          );
          if (enoughVideoSamples) {
            const audioTrackLength = audio ? audio.endPTS - audio.startPTS : 0;
            // if the init segment was generated without video samples, regenerate it
            if (!videoTrack.inputTimeScale) {
              logger.warn(
                '[mp4-remuxer]: regenerate InitSegment as video detected'
              );
              initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
            }
            video = this.remuxVideo(
              videoTrack,
              videoTimeOffset,
              isVideoContiguous,
              audioTrackLength
            );
          }
        } else if (enoughVideoSamples) {
          video = this.remuxVideo(
            videoTrack,
            videoTimeOffset,
            isVideoContiguous,
            0
          );
        }
        if (video) {
          video.firstKeyFrame = firstKeyFrameIndex;
          video.independent = firstKeyFrameIndex !== -1;
          video.firstKeyFramePTS = firstKeyFramePTS;
        }
      }
    }

    // Allow ID3 and text to remux, even if more audio/video samples are required
    if (this.ISGenerated) {
      if (id3Track.samples.length) {
        id3 = flushTextTrackMetadataCueSamples(
          id3Track,
          timeOffset,
          this._initPTS,
          this._initDTS
        );
      }

      if (textTrack.samples.length) {
        text = flushTextTrackUserdataCueSamples(
          textTrack,
          timeOffset,
          this._initPTS
        );
      }
    }

    return {
      audio,
      video,
      initSegment,
      independent,
      text,
      id3,
    };
  }

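  /**
   * Generates the MP4 initialization segment (ftyp + moov) for whichever
   * tracks are ready and, on the first call of a demuxing context, derives
   * initPTS/initDTS so that remuxed timestamps start at timeOffset.
   * Illustrative (hypothetical 90 kHz values): a first audio pts of 900900
   * with timeOffset = 10 gives initPTS = 900900 - 90000 * 10 = 900.
   */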
  generateIS(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    timeOffset
  ): InitSegmentData | undefined {
    const audioSamples = audioTrack.samples;
    const videoSamples = videoTrack.samples;
    const typeSupported = this.typeSupported;
    const tracks: TrackSet = {};
    const computePTSDTS = !Number.isFinite(this._initPTS);
    let container = 'audio/mp4';
    let initPTS: number | undefined;
    let initDTS: number | undefined;
    let timescale: number | undefined;

    if (computePTSDTS) {
      initPTS = initDTS = Infinity;
    }

    if (audioTrack.config && audioSamples.length) {
      // Let's use the audio sampling rate as the MP4 timescale. The
      // rationale is that there is an integer number of audio samples per
      // frame (1024 for AAC), so using the sampling rate gives an integer
      // MP4 frame duration, avoiding rounding and A/V sync issues.
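      // For example, at 44100 Hz each AAC frame spans exactly 1024 ticks of
      // a 44100-tick timescale, whereas on a 90 kHz timescale it would last
      // 90000 * 1024 / 44100 ≈ 2089.8 ticks, forcing lossy rounding.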
      audioTrack.timescale = audioTrack.samplerate;
      switch (audioTrack.segmentCodec) {
        case 'mp3':
          if (typeSupported.mpeg) {
            // Chrome and Safari
            container = 'audio/mpeg';
            audioTrack.codec = '';
          } else if (typeSupported.mp3) {
            // Firefox
            audioTrack.codec = 'mp3';
          }
          break;
      }
      tracks.audio = {
        id: 'audio',
        container: container,
        codec: audioTrack.codec,
        initSegment:
          audioTrack.segmentCodec === 'mp3' && typeSupported.mpeg
            ? new Uint8Array(0)
            : MP4.initSegment([audioTrack]),
        metadata: {
          channelCount: audioTrack.channelCount,
        },
      };
      if (computePTSDTS) {
        timescale = audioTrack.inputTimeScale;
        // remember the first PTS of this demuxing context; for audio, PTS = DTS
        initPTS = initDTS =
          audioSamples[0].pts - Math.round(timescale * timeOffset);
      }
    }

    if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
      // Use the input timescale as the MP4 video timescale straight away,
      // to avoid rounding issues in frame duration / cts computation.
      videoTrack.timescale = videoTrack.inputTimeScale;
      tracks.video = {
        id: 'main',
        container: 'video/mp4',
        codec: videoTrack.codec,
        initSegment: MP4.initSegment([videoTrack]),
        metadata: {
          width: videoTrack.width,
          height: videoTrack.height,
        },
      };
      if (computePTSDTS) {
        timescale = videoTrack.inputTimeScale;
        const startPTS = this.getVideoStartPts(videoSamples);
        const startOffset = Math.round(timescale * timeOffset);
        initDTS = Math.min(
          initDTS as number,
          normalizePts(videoSamples[0].dts, startPTS) - startOffset
        );
        initPTS = Math.min(initPTS as number, startPTS - startOffset);
      }
    }

    if (Object.keys(tracks).length) {
      this.ISGenerated = true;
      if (computePTSDTS) {
        this._initPTS = initPTS as number;
        this._initDTS = initDTS as number;
      }

      return {
        tracks,
        initPTS,
        timescale,
      };
    }
  }

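  /**
   * Remuxes AVC samples into a moof + mdat pair. NAL units are rewritten in
   * AVCC form (4-byte length prefix), DTS is made monotonic, inter-fragment
   * holes/overlaps are healed, and per-sample durations are derived from
   * consecutive DTS deltas.
   */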
  remuxVideo(
    track: DemuxedAvcTrack,
    timeOffset: number,
    contiguous: boolean,
    audioTrackLength: number
  ): RemuxedTrack | undefined {
    const timeScale: number = track.inputTimeScale;
    const inputSamples: Array<AvcSample> = track.samples;
    const outputSamples: Array<Mp4Sample> = [];
    const nbSamples: number = inputSamples.length;
    const initPTS: number = this._initPTS;
    let nextAvcDts = this.nextAvcDts;
    let offset = 8;
    let mp4SampleDuration = this.videoSampleDuration;
    let firstDTS;
    let lastDTS;
    let minPTS: number = Number.POSITIVE_INFINITY;
    let maxPTS: number = Number.NEGATIVE_INFINITY;
    let sortSamples = false;

    // if the parsed fragment is contiguous with the last one, use the last
    // DTS value as reference
    if (!contiguous || nextAvcDts === null) {
      const pts = timeOffset * timeScale;
      const cts =
        inputSamples[0].pts -
        normalizePts(inputSamples[0].dts, inputSamples[0].pts);
      // if not contiguous, let's use target timeOffset
      nextAvcDts = pts - cts;
    }

    // PTS is coded on 33 bits and can wrap around; normalizePts makes
    // PTS/DTS values monotonic, using the last known DTS as the reference.
    for (let i = 0; i < nbSamples; i++) {
      const sample = inputSamples[i];
      sample.pts = normalizePts(sample.pts - initPTS, nextAvcDts);
      sample.dts = normalizePts(sample.dts - initPTS, nextAvcDts);
      if (sample.dts < inputSamples[i > 0 ? i - 1 : i].dts) {
        sortSamples = true;
      }
    }

    // sort video samples by DTS then PTS then demux id order
    if (sortSamples) {
      inputSamples.sort(function (a, b) {
        const deltadts = a.dts - b.dts;
        const deltapts = a.pts - b.pts;
        return deltadts || deltapts;
      });
    }

    // Get first/last DTS
    firstDTS = inputSamples[0].dts;
    lastDTS = inputSamples[inputSamples.length - 1].dts;

    // Sample duration (as expected by trun MP4 boxes) should be the delta
    // between sample DTS; set this constant duration as the average delta
    // between consecutive DTS.
    const inputDuration = lastDTS - firstDTS;
    const averageSampleDuration = inputDuration
      ? Math.round(inputDuration / (nbSamples - 1))
      : mp4SampleDuration || track.inputTimeScale / 30;

    // if fragments are contiguous, detect holes/overlaps between them
    if (contiguous) {
      // check timestamp continuity across consecutive fragments (this is to
      // remove inter-fragment gaps/holes)
      const delta = firstDTS - nextAvcDts;
      const foundHole = delta > averageSampleDuration;
      const foundOverlap = delta < -1;
      if (foundHole || foundOverlap) {
        if (foundHole) {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              delta,
              true
            )} ms (${delta}dts) hole between fragments detected, filling it`
          );
        } else {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              -delta,
              true
            )} ms (${delta}dts) overlapping between fragments detected`
          );
        }
        firstDTS = nextAvcDts;
        const firstPTS = inputSamples[0].pts - delta;
        inputSamples[0].dts = firstDTS;
        inputSamples[0].pts = firstPTS;
        logger.log(
          `Video: First PTS/DTS adjusted: ${toMsFromMpegTsClock(
            firstPTS,
            true
          )}/${toMsFromMpegTsClock(
            firstDTS,
            true
          )}, delta: ${toMsFromMpegTsClock(delta, true)} ms`
        );
      }
    }

    firstDTS = Math.max(0, firstDTS);

    let nbNalu = 0;
    let naluLen = 0;
    for (let i = 0; i < nbSamples; i++) {
      // compute total AVC sample length and number of NAL units
      const sample = inputSamples[i];
      const units = sample.units;
      const nbUnits = units.length;
      let sampleLen = 0;
      for (let j = 0; j < nbUnits; j++) {
        sampleLen += units[j].data.length;
      }

      naluLen += sampleLen;
      nbNalu += nbUnits;
      sample.length = sampleLen;

      // ensure sample DTS is monotonic
      sample.dts = Math.max(sample.dts, firstDTS);

      minPTS = Math.min(sample.pts, minPTS);
      maxPTS = Math.max(sample.pts, maxPTS);
    }
    lastDTS = inputSamples[nbSamples - 1].dts;

    /* concatenate the video data and construct the mdat in place
       (we need 8 more bytes for the mdat length and type fields) */
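    // AVCC framing: each NAL unit is written as a 4-byte big-endian length
    // followed by its payload, hence the extra 4 * nbNalu bytes below.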
    const mdatSize = naluLen + 4 * nbNalu + 8;
    let mdat;
    try {
      mdat = new Uint8Array(mdatSize);
    } catch (err) {
      this.observer.emit(Events.ERROR, Events.ERROR, {
        type: ErrorTypes.MUX_ERROR,
        details: ErrorDetails.REMUX_ALLOC_ERROR,
        fatal: false,
        bytes: mdatSize,
        reason: `fail allocating video mdat ${mdatSize}`,
      });
      return;
    }
    const view = new DataView(mdat.buffer);
    view.setUint32(0, mdatSize);
    mdat.set(MP4.types.mdat, 4);

    let stretchedLastFrame = false;
    let minDtsDelta = Number.POSITIVE_INFINITY;
    let minPtsDelta = Number.POSITIVE_INFINITY;
    let maxDtsDelta = Number.NEGATIVE_INFINITY;
    let maxPtsDelta = Number.NEGATIVE_INFINITY;
    for (let i = 0; i < nbSamples; i++) {
      const avcSample = inputSamples[i];
      const avcSampleUnits = avcSample.units;
      let mp4SampleLength = 0;
      // convert NALU bitstream to MP4 format (prepend NALU with size field)
      for (let j = 0, nbUnits = avcSampleUnits.length; j < nbUnits; j++) {
        const unit = avcSampleUnits[j];
        const unitData = unit.data;
        const unitDataLen = unit.data.byteLength;
        view.setUint32(offset, unitDataLen);
        offset += 4;
        mdat.set(unitData, offset);
        offset += unitDataLen;
        mp4SampleLength += 4 + unitDataLen;
      }

      // expected sample duration is the DTS diff of consecutive samples
      let ptsDelta;
      if (i < nbSamples - 1) {
        mp4SampleDuration = inputSamples[i + 1].dts - avcSample.dts;
        ptsDelta = inputSamples[i + 1].pts - avcSample.pts;
      } else {
        const config = this.config;
        const lastFrameDuration =
          i > 0
            ? avcSample.dts - inputSamples[i - 1].dts
            : averageSampleDuration;
        ptsDelta =
          i > 0
            ? avcSample.pts - inputSamples[i - 1].pts
            : averageSampleDuration;
        if (config.stretchShortVideoTrack && this.nextAudioPts !== null) {
          // In some cases, a segment's audio track duration may exceed the
          // video track duration. Since we've already remuxed audio, and we
          // know how long the audio track is, we look to see if the delta to
          // the next segment is longer than maxBufferHole. If so, playback
          // would potentially get stuck, so we artificially inflate the
          // duration of the last frame to minimize any potential gap between
          // segments.
          const gapTolerance = Math.floor(config.maxBufferHole * timeScale);
          const deltaToFrameEnd =
            (audioTrackLength
              ? minPTS + audioTrackLength * timeScale
              : this.nextAudioPts) - avcSample.pts;
          if (deltaToFrameEnd > gapTolerance) {
            // We subtract lastFrameDuration from deltaToFrameEnd to try to
            // prevent any video frame overlap. maxBufferHole should be >>
            // lastFrameDuration anyway.
            mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
            if (mp4SampleDuration < 0) {
              mp4SampleDuration = lastFrameDuration;
            } else {
              stretchedLastFrame = true;
            }
            logger.log(
              `[mp4-remuxer]: It is approximately ${
                deltaToFrameEnd / 90
              } ms to the next segment; using duration ${
                mp4SampleDuration / 90
              } ms for the last video frame.`
            );
          } else {
            mp4SampleDuration = lastFrameDuration;
          }
        } else {
          mp4SampleDuration = lastFrameDuration;
        }
      }
      const compositionTimeOffset = Math.round(avcSample.pts - avcSample.dts);
      minDtsDelta = Math.min(minDtsDelta, mp4SampleDuration);
      maxDtsDelta = Math.max(maxDtsDelta, mp4SampleDuration);
      minPtsDelta = Math.min(minPtsDelta, ptsDelta);
      maxPtsDelta = Math.max(maxPtsDelta, ptsDelta);

      outputSamples.push(
        new Mp4Sample(
          avcSample.key,
          mp4SampleDuration,
          mp4SampleLength,
          compositionTimeOffset
        )
      );
    }

    if (outputSamples.length) {
      if (chromeVersion) {
        if (chromeVersion < 70) {
          // Chrome workaround: mark the first sample as a Random Access
          // Point (keyframe) to avoid a sourcebuffer append issue, see
          // https://code.google.com/p/chromium/issues/detail?id=229412
          const flags = outputSamples[0].flags;
          flags.dependsOn = 2;
          flags.isNonSync = 0;
        }
      } else if (safariWebkitVersion) {
        // Fix for "CNN special report, with CC" in test-streams (Safari
        // browser only). Ignore DTS when frame durations are irregular;
        // Safari MSE does not handle this, leading to gaps.
        if (
          maxPtsDelta - minPtsDelta < maxDtsDelta - minDtsDelta &&
          averageSampleDuration / maxDtsDelta < 0.025 &&
          outputSamples[0].cts === 0
        ) {
          logger.warn(
            'Found irregular gaps in sample duration. Using PTS instead of DTS to determine MP4 sample duration.'
          );
          let dts = firstDTS;
          for (let i = 0, len = outputSamples.length; i < len; i++) {
            const nextDts = dts + outputSamples[i].duration;
            const pts = dts + outputSamples[i].cts;
            if (i < len - 1) {
              const nextPts = nextDts + outputSamples[i + 1].cts;
              outputSamples[i].duration = nextPts - pts;
            } else {
              outputSamples[i].duration = i
                ? outputSamples[i - 1].duration
                : averageSampleDuration;
            }
            outputSamples[i].cts = 0;
            dts = nextDts;
          }
        }
      }
    }

    console.assert(
      mp4SampleDuration !== null,
      'mp4SampleDuration must be computed'
    );
    // next AVC sample DTS should be equal to last sample DTS + last sample
    // duration (in PES timescale)
    mp4SampleDuration =
      stretchedLastFrame || !mp4SampleDuration
        ? averageSampleDuration
        : mp4SampleDuration;
    this.nextAvcDts = nextAvcDts = lastDTS + mp4SampleDuration;
    this.videoSampleDuration = mp4SampleDuration;
    this.isVideoContiguous = true;
    const moof = MP4.moof(
      track.sequenceNumber++,
      firstDTS,
      Object.assign({}, track, {
        samples: outputSamples,
      })
    );
    const type: SourceBufferName = 'video';
    const data = {
      data1: moof,
      data2: mdat,
      startPTS: minPTS / timeScale,
      endPTS: (maxPTS + mp4SampleDuration) / timeScale,
      startDTS: firstDTS / timeScale,
      endDTS: (nextAvcDts as number) / timeScale,
      type,
      hasAudio: false,
      hasVideo: true,
      nb: outputSamples.length,
      dropped: track.dropped,
    };

    track.samples = [];
    track.dropped = 0;

    console.assert(mdat.length, 'MDAT length must not be zero');

    return data;
  }

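  /**
   * Remuxes audio samples into a moof + mdat pair (or raw MPEG audio when
   * the platform plays it directly). Contiguity with the previous fragment
   * is detected, frames with negative PTS are dropped, and for AAC, gaps
   * are filled with silent (or duplicated) frames before packaging.
   */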
  remuxAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    accurateTimeOffset: boolean,
    videoTimeOffset?: number
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const mp4SampleDuration: number =
      track.segmentCodec === 'aac'
        ? AAC_SAMPLES_PER_FRAME
        : MPEG_AUDIO_SAMPLE_PER_FRAME;
    const inputSampleDuration: number = mp4SampleDuration * scaleFactor;
    const initPTS: number = this._initPTS;
    const rawMPEG: boolean =
      track.segmentCodec === 'mp3' && this.typeSupported.mpeg;
    const outputSamples: Array<Mp4Sample> = [];
    const alignedWithVideo = videoTimeOffset !== undefined;

    let inputSamples: Array<AudioSample> = track.samples;
    let offset: number = rawMPEG ? 0 : 8;
    let nextAudioPts: number = this.nextAudioPts || -1;

    // window.audioSamples ? window.audioSamples.push(inputSamples.map(s => s.pts)) : (window.audioSamples = [inputSamples.map(s => s.pts)]);

    // For audio samples, also consider consecutive fragments as being
    // contiguous (even if a level switch occurs). For the sake of clarity:
    // consecutive fragments are frags with
    //  - less than 100ms gap between the new time offset (if accurate) and
    //    the next expected PTS, OR
    //  - less than 20 audio frames distance;
    // contiguous fragments are consecutive fragments from the same quality
    // level (same level, new SN = old SN + 1).
    // This helps ensure audio continuity, and also avoids audio glitches or
    // cuts when switching quality, or reporting a wrong duration on the
    // first audio frame.
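    // With a 90 kHz input clock those thresholds are 9000 ticks (100 ms)
    // and, for 44.1 kHz AAC, 20 * 1024 * (90000 / 44100) ≈ 41796 ticks
    // (about 0.46 s).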
    const timeOffsetMpegTS = timeOffset * inputTimeScale;
    this.isAudioContiguous = contiguous =
      contiguous ||
      ((inputSamples.length &&
        nextAudioPts > 0 &&
        ((accurateTimeOffset &&
          Math.abs(timeOffsetMpegTS - nextAudioPts) < 9000) ||
          Math.abs(
            normalizePts(inputSamples[0].pts - initPTS, timeOffsetMpegTS) -
              nextAudioPts
          ) <
            20 * inputSampleDuration)) as boolean);

    // compute normalized PTS
    inputSamples.forEach(function (sample) {
      sample.pts = normalizePts(sample.pts - initPTS, timeOffsetMpegTS);
    });

    if (!contiguous || nextAudioPts < 0) {
      // Filter out samples with negative PTS, which are not playable anyway.
      // If we don't remove them, they will shift all audio samples forward,
      // leading to audio overlap between the current and next fragment.
      inputSamples = inputSamples.filter((sample) => sample.pts >= 0);

      // in case all samples have negative PTS and have been filtered out, return now
      if (!inputSamples.length) {
        return;
      }

      if (videoTimeOffset === 0) {
        // Set the start to 0 to match video, so that start gaps larger than
        // inputSampleDuration are filled with silence
        nextAudioPts = 0;
      } else if (accurateTimeOffset && !alignedWithVideo) {
        // When not seeking, not live, and LevelDetails.PTSKnown, use the
        // fragment start as the predicted next audio PTS
        nextAudioPts = Math.max(0, timeOffsetMpegTS);
      } else {
        // if frags are not contiguous and we can't trust the time offset,
        // use the first sample PTS as the next audio PTS
        nextAudioPts = inputSamples[0].pts;
      }
    }

    // If the audio track is missing samples, the frames seem to get
    // "left-shifted" within the resulting mp4 segment, causing sync issues
    // and leaving gaps at the end of the audio segment. In an effort to
    // prevent this from happening, we inject frames here where there are
    // gaps. When possible, we inject a silent frame; when that's not
    // possible, we duplicate the last frame.

    if (track.segmentCodec === 'aac') {
      const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
      for (let i = 0, nextPts = nextAudioPts; i < inputSamples.length; i++) {
        // First, let's see how far off this frame is from where we expect it to be
        const sample = inputSamples[i];
        const pts = sample.pts;
        const delta = pts - nextPts;
        const duration = Math.abs((1000 * delta) / inputTimeScale);

        // When remuxing with video, if we're overlapping by more than a
        // duration, drop this sample to stay in sync
        if (
          delta <= -maxAudioFramesDrift * inputSampleDuration &&
          alignedWithVideo
        ) {
          if (i === 0) {
            logger.warn(
              `Audio frame @ ${(pts / inputTimeScale).toFixed(
                3
              )}s overlaps nextAudioPts by ${Math.round(
                (1000 * delta) / inputTimeScale
              )} ms.`
            );
            this.nextAudioPts = nextAudioPts = nextPts = pts;
          }
        } // eslint-disable-line brace-style

        // Insert missing frames if:
        // 1: We're more than maxAudioFramesDrift frames away
        // 2: Not more than MAX_SILENT_FRAME_DURATION away
        // 3: currentTime (aka nextPtsNorm) is not 0
        // 4: remuxing with video (videoTimeOffset !== undefined)
        else if (
          delta >= maxAudioFramesDrift * inputSampleDuration &&
          duration < MAX_SILENT_FRAME_DURATION &&
          alignedWithVideo
        ) {
          let missing = Math.round(delta / inputSampleDuration);
          // Adjust nextPts so that silent samples are aligned with media
          // pts. This will prevent media samples from later being shifted
          // if nextPts is based on timeOffset and delta is not a multiple
          // of inputSampleDuration.
          nextPts = pts - missing * inputSampleDuration;
          if (nextPts < 0) {
            missing--;
            nextPts += inputSampleDuration;
          }
          if (i === 0) {
            this.nextAudioPts = nextAudioPts = nextPts;
          }
          logger.warn(
            `[mp4-remuxer]: Injecting ${missing} audio frame @ ${(
              nextPts / inputTimeScale
            ).toFixed(3)}s due to ${Math.round(
              (1000 * delta) / inputTimeScale
            )} ms gap.`
          );
          for (let j = 0; j < missing; j++) {
            const newStamp = Math.max(nextPts as number, 0);
            let fillFrame = AAC.getSilentFrame(
              track.manifestCodec || track.codec,
              track.channelCount
            );
            if (!fillFrame) {
              logger.log(
                '[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating last frame instead.'
              );
              fillFrame = sample.unit.subarray();
            }
            inputSamples.splice(i, 0, {
              unit: fillFrame,
              pts: newStamp,
            });
            nextPts += inputSampleDuration;
            i++;
          }
        }
        sample.pts = nextPts;
        nextPts += inputSampleDuration;
      }
    }
    let firstPTS: number | null = null;
    let lastPTS: number | null = null;
    let mdat: any;
    let mdatSize: number = 0;
    let sampleLength: number = inputSamples.length;
    while (sampleLength--) {
      mdatSize += inputSamples[sampleLength].unit.byteLength;
    }
    for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
      const audioSample = inputSamples[j];
      const unit = audioSample.unit;
      let pts = audioSample.pts;
      if (lastPTS !== null) {
        // If we have more than one sample, set the duration of the sample to
        // the "real" duration: the PTS diff with the previous sample
        const prevSample = outputSamples[j - 1];
        prevSample.duration = Math.round((pts - lastPTS) / scaleFactor);
      } else {
        if (contiguous && track.segmentCodec === 'aac') {
          // set PTS/DTS to expected PTS/DTS
          pts = nextAudioPts;
        }
        // remember first PTS of our audioSamples
        firstPTS = pts;
        if (mdatSize > 0) {
          /* concatenate the audio data and construct the mdat in place
             (we need 8 more bytes for the mdat length and type fields) */
          mdatSize += offset;
          try {
            mdat = new Uint8Array(mdatSize);
          } catch (err) {
            this.observer.emit(Events.ERROR, Events.ERROR, {
              type: ErrorTypes.MUX_ERROR,
              details: ErrorDetails.REMUX_ALLOC_ERROR,
              fatal: false,
              bytes: mdatSize,
              reason: `fail allocating audio mdat ${mdatSize}`,
            });
            return;
          }
          if (!rawMPEG) {
            const view = new DataView(mdat.buffer);
            view.setUint32(0, mdatSize);
            mdat.set(MP4.types.mdat, 4);
          }
        } else {
          // no audio samples
          return;
        }
      }
      mdat.set(unit, offset);
      const unitLen = unit.byteLength;
      offset += unitLen;
      // Default the sample's duration to the computed mp4SampleDuration,
      // which will be either 1024 for AAC or 1152 for MPEG audio. For a
      // single sample this is its final duration; with more than one sample,
      // the duration becomes the PTS diff with the previous sample.
      outputSamples.push(new Mp4Sample(true, mp4SampleDuration, unitLen, 0));
      lastPTS = pts;
    }

    // We could end up with no audio samples if all input samples were
    // overlapping with the previously remuxed ones
    const nbSamples = outputSamples.length;
    if (!nbSamples) {
      return;
    }

    // The next audio sample PTS should be equal to last sample PTS + duration
    const lastSample = outputSamples[outputSamples.length - 1];
    this.nextAudioPts = nextAudioPts =
      lastPTS! + scaleFactor * lastSample.duration;

    // Set the track samples from inputSamples to outputSamples before remuxing
    const moof = rawMPEG
      ? new Uint8Array(0)
      : MP4.moof(
          track.sequenceNumber++,
          firstPTS! / scaleFactor,
          Object.assign({}, track, { samples: outputSamples })
        );

    // Clear the track samples. This also clears the samples array in the
    // demuxer, since the reference is shared
    track.samples = [];
    const start = firstPTS! / inputTimeScale;
    const end = nextAudioPts / inputTimeScale;
    const type: SourceBufferName = 'audio';
    const audioData = {
      data1: moof,
      data2: mdat,
      startPTS: start,
      endPTS: end,
      startDTS: start,
      endDTS: end,
      type,
      hasAudio: true,
      hasVideo: false,
      nb: nbSamples,
    };

    this.isAudioContiguous = true;

    console.assert(mdat.length, 'MDAT length must not be zero');
    return audioData;
  }

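  /**
   * Synthesizes a silent AAC track spanning the video fragment, used when a
   * fragment declares audio but none was demuxed, then delegates to
   * remuxAudio. Returns undefined when no silent frame exists for the codec.
   */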
  remuxEmptyAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    videoData: Fragment
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const nextAudioPts: number | null = this.nextAudioPts;
    // sync with video's timestamp
    const startDTS: number =
      (nextAudioPts !== null
        ? nextAudioPts
        : videoData.startDTS * inputTimeScale) + this._initDTS;
    const endDTS: number = videoData.endDTS * inputTimeScale + this._initDTS;
    // one sample's duration value
    const frameDuration: number = scaleFactor * AAC_SAMPLES_PER_FRAME;
    // sample count for this segment's duration
    const nbSamples: number = Math.ceil((endDTS - startDTS) / frameDuration);
    // silent frame
    const silentFrame: Uint8Array | undefined = AAC.getSilentFrame(
      track.manifestCodec || track.codec,
      track.channelCount
    );

    logger.warn('[mp4-remuxer]: remux empty Audio');
    // Can't remux if we can't generate a silent frame...
    if (!silentFrame) {
      logger.trace(
        '[mp4-remuxer]: Unable to remuxEmptyAudio since we were unable to get a silent frame for given audio codec'
      );
      return;
    }

    const samples: Array<any> = [];
    for (let i = 0; i < nbSamples; i++) {
      const stamp = startDTS + i * frameDuration;
      samples.push({ unit: silentFrame, pts: stamp, dts: stamp });
    }
    track.samples = samples;

    return this.remuxAudio(track, timeOffset, contiguous, false);
  }
}

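/**
 * Normalizes a 33-bit MPEG-TS timestamp against a reference, compensating
 * for PTS wrap-around. Illustrative (hypothetical values): a pre-wrap pts of
 * 8589930000 normalized against a post-wrap reference of 1000 becomes
 * 8589930000 - 2^33 = -4592, placing it just before the reference on one
 * continuous timeline.
 */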
export function normalizePts(value: number, reference: number | null): number {
  let offset;
  if (reference === null) {
    return value;
  }

  if (reference < value) {
    // - 2^33
    offset = -8589934592;
  } else {
    // + 2^33
    offset = 8589934592;
  }
  /* PTS is 33 bits (from 0 to 2^33 - 1). If the diff between value and
     reference is bigger than half of the amplitude (2^32), it means that
     PTS looping occurred; fill the gap */
  while (Math.abs(value - reference) > 4294967296) {
    value += offset;
  }

  return value;
}

function findKeyframeIndex(samples: Array<AvcSample>): number {
  for (let i = 0; i < samples.length; i++) {
    if (samples[i].key) {
      return i;
    }
  }
  return -1;
}

export function flushTextTrackMetadataCueSamples(
  track: DemuxedMetadataTrack,
  timeOffset: number,
  initPTS: number,
  initDTS: number
): RemuxedMetadata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }
  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // set id3 pts/dts to relative time, using initPTS and initDTS to
    // calculate the relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
    sample.dts =
      normalizePts(sample.dts - initDTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

export function flushTextTrackUserdataCueSamples(
  track: DemuxedUserdataTrack,
  timeOffset: number,
  initPTS: number
): RemuxedUserdata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }

  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // set text pts to relative time, using initPTS to calculate the
    // relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  track.samples.sort((a, b) => a.pts - b.pts);
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

class Mp4Sample {
  public size: number;
  public duration: number;
  public cts: number;
  public flags: Mp4SampleFlags;

  constructor(
    isKeyframe: boolean,
    duration: number,
    size: number,
    cts: number
  ) {
    this.duration = duration;
    this.size = size;
    this.cts = cts;
    this.flags = new Mp4SampleFlags(isKeyframe);
  }
}

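/**
 * Per-sample trun flags as defined by ISO/IEC 14496-12. For a keyframe,
 * dependsOn = 2 (does not depend on other samples) and isNonSync = 0 (it is
 * a sync sample); for all other frames, dependsOn = 1 and isNonSync = 1.
 */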
class Mp4SampleFlags {
  public isLeading: 0 = 0;
  public isDependedOn: 0 = 0;
  public hasRedundancy: 0 = 0;
  public degradPrio: 0 = 0;
  public dependsOn: 1 | 2 = 1;
  public isNonSync: 0 | 1 = 1;

  constructor(isKeyframe: boolean) {
    this.dependsOn = isKeyframe ? 2 : 1;
    this.isNonSync = isKeyframe ? 0 : 1;
  }
}