# Source code for openl3.core

import os
import tensorflow as tf
import resampy
import traceback
import soundfile as sf
import numpy as np
from numbers import Real
from math import ceil
import warnings
from .models import load_audio_embedding_model, load_image_embedding_model, _validate_audio_frontend
from .openl3_exceptions import OpenL3Error
from .openl3_warnings import OpenL3Warning

# Sample rate (in Hz) that all input audio is resampled to before framing.
# It is also used as the analysis window length in samples, i.e. windows are
# one second long.
TARGET_SR = 48000


def _center_audio(audio, frame_len):
    """Prepend half a frame of zeros so the first sample lands in the middle of the first frame."""
    lead_in = int(frame_len / 2.0)
    pad_widths = (lead_in, 0)
    return np.pad(audio, pad_widths, mode='constant', constant_values=0)


def _pad_audio(audio, frame_len, hop_len):
    """Zero-pad the end of ``audio`` so that framing covers every sample.

    Audio shorter than one frame is extended to exactly ``frame_len``
    samples. Otherwise the tail is extended until the span following the
    first frame is a whole number of hops. Audio that already fits is
    returned unchanged.
    """
    n_samples = audio.size
    if n_samples < frame_len:
        tail = frame_len - n_samples
    else:
        overhang = n_samples - frame_len
        n_hops = int(np.ceil(overhang / float(hop_len)))
        tail = n_hops * hop_len - overhang

    if tail <= 0:
        return audio
    return np.pad(audio, (0, tail), mode='constant', constant_values=0)


def _get_num_windows(audio_len, frame_len, hop_len, center):
    """Return the number of analysis windows needed to cover ``audio_len`` samples.

    When ``center`` is truthy, half a frame is added to the length first to
    account for the leading padding applied when centering.
    """
    total_len = audio_len + int(frame_len / 2.0) if center else audio_len

    if total_len > frame_len:
        extra_hops = int(np.ceil((total_len - frame_len) / float(hop_len)))
        return 1 + extra_hops
    return 1


def _preprocess_audio_batch(audio, sr, center=True, hop_size=0.1):
    """
    Process audio into batch format suitable for input to embedding model.

    Parameters
    ----------
    audio : np.ndarray [shape=(N,) or (N,C)]
        Audio samples of a single clip. 2D input is downmixed to mono by
        averaging over axis 1.
    sr : numbers.Real
        Sample rate of `audio`. The audio is resampled if this differs from
        `TARGET_SR`.
    center : boolean
        If True, pads the beginning of the signal with half a window of
        zeros before framing.
    hop_size : numbers.Real
        Hop size in seconds. Must correspond to at least one sample at
        `TARGET_SR`.

    Returns
    -------
    x : np.ndarray [shape=(n_frames, 1, TARGET_SR)]
        One-second frames with a singleton channel axis. The frames are
        produced with `as_strided`, so overlapping frames may share memory.

    Raises
    ------
    OpenL3Error
        If the audio is empty or has more than two dimensions, or if `sr`,
        `hop_size` or `center` is invalid.
    """
    if audio.size == 0:
        raise OpenL3Error('Got empty audio')

    # Warn user if audio is all zero
    if np.all(audio == 0):
        warnings.warn('Provided audio is all zeros', OpenL3Warning)

    # Check audio array dimension
    if audio.ndim > 2:
        raise OpenL3Error('Audio array can only be be 1D or 2D')
    elif audio.ndim == 2:
        # Downmix if multichannel
        audio = np.mean(audio, axis=1)

    if not isinstance(sr, Real) or sr <= 0:
        raise OpenL3Error('Invalid sample rate {}'.format(sr))

    if not isinstance(hop_size, Real) or hop_size <= 0:
        raise OpenL3Error('Invalid hop size {}'.format(hop_size))

    frame_len = TARGET_SR
    hop_len = int(hop_size * TARGET_SR)
    if hop_len < 1:
        # A positive hop shorter than one sample truncates to zero, which
        # would otherwise surface later as a ZeroDivisionError.
        raise OpenL3Error('Invalid hop size {}'.format(hop_size))

    if center not in (True, False):
        raise OpenL3Error('Invalid center value {}'.format(center))

    # Resample if necessary
    if sr != TARGET_SR:
        audio = resampy.resample(audio, sr_orig=sr, sr_new=TARGET_SR, filter='kaiser_best')

    if audio.size < frame_len:
        warnings.warn('Duration of provided audio is shorter than window size (1 second). Audio will be padded.',
                      OpenL3Warning)

    if center:
        # Center audio
        audio = _center_audio(audio, frame_len)

    # Pad if necessary to ensure that we process all samples
    audio = _pad_audio(audio, frame_len, hop_len)

    # The strides below are expressed in multiples of itemsize, which is only
    # valid for a contiguous buffer. When no resampling, centering or padding
    # took place, `audio` can still be the caller's (possibly strided) array.
    audio = np.ascontiguousarray(audio)

    # Split audio into frames, copied from librosa.util.frame.
    # After padding, the span following the first frame is a whole number of
    # hops, so integer division is exact.
    n_frames = 1 + (audio.shape[0] - frame_len) // hop_len
    x = np.lib.stride_tricks.as_strided(audio, shape=(frame_len, n_frames),
                                        strides=(audio.itemsize, hop_len * audio.itemsize)).T

    # Add a channel dimension
    x = x.reshape((x.shape[0], 1, x.shape[-1]))
    return x


def _librosa_linear_frontend(audio, n_fft=512, hop_length=242, db_amin=1e-10,
                             db_ref=1.0, dynamic_range=80.0):
    """Librosa linear-frequency frontend designed to match original Kapre (0.1.4).

    Takes the magnitude of a non-centered STFT, passes it through
    ``librosa.power_to_db`` and shifts the result so that its maximum is 0.
    """
    import librosa
    magnitude = np.abs(
        librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length, center=False))
    spec_db = librosa.power_to_db(
        S=magnitude, ref=db_ref, amin=db_amin, top_db=dynamic_range)
    return spec_db - spec_db.max()


def _librosa_mel_frontend(audio, sr, n_mels=128, n_fft=2048, hop_length=242,
                          db_amin=1e-10, db_ref=1.0, dynamic_range=80.0):
    """Librosa mel frontend designed to match original Kapre (0.1.4).

    Computes a centered mel spectrogram with ``power=1.0``, passes it through
    ``librosa.power_to_db`` and shifts the result so that its maximum is 0.
    """
    import librosa
    mel_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
        center=True,
        power=1.0)
    mel_db = librosa.power_to_db(
        S=mel_spec, ref=db_ref, amin=db_amin, top_db=dynamic_range)
    return mel_db - mel_db.max()


def preprocess_audio(audio, sr, hop_size=0.1, input_repr=None, center=True, **kw):
    """
    Preprocess the audio into a format compatible with the model.

    Parameters
    ----------
    audio : np.ndarray [shape=(N,) or (N,C)]
        Audio data for a single clip. Multichannel audio is downmixed to
        mono.
    sr : numbers.Real
        Sampling rate. If not 48kHz audio will be resampled.
    hop_size : float
        Hop size in seconds.
    input_repr : "linear", "mel128", "mel256" or None
        Spectrogram representation used for model.
        If input_repr is None (or otherwise falsy), then no spectrogram is
        computed and it is assumed that the model contains the details about
        the input representation.
    center : boolean
        If True, pads beginning of signal so timestamps correspond
        to center of window.
    **kw
        Extra keyword arguments forwarded to the librosa frontend function.
        Only used when `input_repr` is given.

    Returns
    -------
    input_data : np.ndarray
        The preprocessed audio. If no `input_repr` is provided this is the
        framed audio with shape (batch, 1, samples). Otherwise it holds one
        spectrogram per frame, stacked along the first axis, with a trailing
        channel axis of size 1.

    Raises
    ------
    OpenL3Error
        If the audio, sample rate, hop size or center value is invalid, or
        if `input_repr` is not one of the supported representations.
    """
    x = _preprocess_audio_batch(audio, sr, hop_size=hop_size, center=center)  # this resamples to 48k
    if not input_repr:
        return x

    mel_bands = {'mel128': 128, 'mel256': 256}
    if input_repr == 'linear':
        specs = [_librosa_linear_frontend(xi[0], **kw) for xi in x]
    elif input_repr in mel_bands:
        n_mels = mel_bands[input_repr]
        specs = [_librosa_mel_frontend(xi[0], TARGET_SR, n_mels=n_mels, **kw) for xi in x]
    else:
        raise OpenL3Error('Invalid input representation "{}"'.format(input_repr))

    # Stack per-frame spectrograms and append the channel axis
    return np.stack(specs)[..., None]



def get_audio_embedding(audio, sr, model=None, input_repr=None,
                        content_type="music", embedding_size=6144,
                        center=True, hop_size=0.1, batch_size=32,
                        frontend="kapre", verbose=True):
    """
    Computes and returns L3 embedding for given audio data.

    Embeddings are computed for 1-second windows of audio.

    Parameters
    ----------
    audio : np.ndarray [shape=(N,) or (N,C)] or list[np.ndarray]
        1D numpy array of audio data or list of audio arrays for multiple
        inputs.
    sr : int or list[int]
        Sampling rate, or list of sampling rates. If not 48kHz audio will
        be resampled.
    model : tf.keras.Model or None
        Loaded model object. If a model is provided, then `input_repr`,
        `content_type`, and `embedding_size` will be ignored.
        If None is provided, the model will be loaded using
        the provided values of `input_repr`, `content_type` and
        `embedding_size`.
    input_repr : "linear", "mel128", or "mel256"
        Spectrogram representation used for model. Ignored if `model` is
        a valid Keras model.
    content_type : "music" or "env"
        Type of content used to train the embedding model. Ignored if `model` is
        a valid Keras model.
    embedding_size : 6144 or 512
        Embedding dimensionality. Ignored if `model` is a valid
        Keras model.
    center : boolean
        If True, pads beginning of signal so timestamps correspond
        to center of window.
    hop_size : float
        Hop size in seconds.
    batch_size : int
        Batch size used for input to embedding model
    frontend : "kapre" or "librosa"
        The audio frontend to use. By default, it will use "kapre".
    verbose : bool
        If True, prints verbose messages.

    Returns
    -------
    embedding : np.ndarray [shape=(T, D)] or list[np.ndarray]
        Array of embeddings for each window or list of such arrays for
        multiple audio clips.
    timestamps : np.ndarray [shape=(T,)] or list[np.ndarray]
        Array of timestamps corresponding to each embedding in the output or
        list of such arrays for multiple audio clips.

    Raises
    ------
    OpenL3Error
        If any argument is invalid, or if the number of audio inputs does
        not match the number of sample rates.
    """
    if model is not None and not isinstance(model, tf.keras.Model):
        raise OpenL3Error('Invalid model provided. Must be of type tf.keras.Model'
                          ' but got {}'.format(str(type(model))))

    frontend, input_repr = _validate_audio_frontend(frontend, input_repr, model)

    if str(content_type) not in ("music", "env"):
        raise OpenL3Error('Invalid content type "{}"'.format(content_type))

    if embedding_size not in (6144, 512):
        raise OpenL3Error('Invalid embedding size "{}"'.format(embedding_size))

    if verbose not in (0, 1):
        raise OpenL3Error('Invalid verbosity level {}'.format(verbose))

    if isinstance(audio, np.ndarray):
        audio_list = [audio]
        list_input = False
    elif isinstance(audio, list):
        audio_list = audio
        list_input = True
    else:
        err_msg = 'audio must be type list[np.ndarray] or np.ndarray. Got {}'
        raise OpenL3Error(err_msg.format(type(audio)))

    if isinstance(sr, Real):
        sr_list = [sr] * len(audio_list)
    elif isinstance(sr, list):
        sr_list = sr
    else:
        err_msg = 'sr must be type list[numbers.Real] or numbers.Real. Got {}'
        raise OpenL3Error(err_msg.format(type(sr)))

    if len(audio_list) != len(sr_list):
        err_msg = ('Mismatch between number of audio inputs ({}) and number of'
                   ' sample rates ({})')
        raise OpenL3Error(err_msg.format(len(audio_list), len(sr_list)))

    if not audio_list:
        # Only reachable with an empty input list; there is nothing to embed
        # and np.vstack would fail on an empty sequence.
        return [], []

    # Get embedding model
    if model is None:
        model = load_audio_embedding_model(
            input_repr, content_type, embedding_size,
            frontend=frontend)

    # With the librosa frontend the spectrogram is computed here; otherwise
    # the raw framed audio is passed to the model as-is.
    preprocess_repr = input_repr if frontend == 'librosa' else None

    # Collect all audio arrays in a single array
    batch = [
        preprocess_audio(clip, clip_sr, hop_size=hop_size, center=center,
                         input_repr=preprocess_repr)
        for clip, clip_sr in zip(audio_list, sr_list)
    ]
    file_batch_size_list = [x.shape[0] for x in batch]
    batch = np.vstack(batch)

    # Compute embeddings
    batch_embedding = model.predict(batch, verbose=1 if verbose else 0,
                                    batch_size=batch_size)

    # Split the stacked embeddings back into one array per input clip
    embedding_list = []
    start_idx = 0
    for file_batch_size in file_batch_size_list:
        end_idx = start_idx + file_batch_size
        embedding_list.append(batch_embedding[start_idx:end_idx, ...])
        start_idx = end_idx

    ts_list = [np.arange(z.shape[0]) * hop_size for z in embedding_list]

    if not list_input:
        return embedding_list[0], ts_list[0]
    return embedding_list, ts_list


def process_audio_file(filepath, output_dir=None, suffix=None, model=None,
                       input_repr=None, content_type="music",
                       embedding_size=6144, center=True, hop_size=0.1,
                       batch_size=32, overwrite=False, frontend="kapre", verbose=True):
    """
    Computes and saves L3 embedding for a given audio file

    Files are read one at a time and queued until the estimated number of
    analysis windows reaches `batch_size` (or the last file has been seen),
    at which point the queued files are embedded together and each result is
    written to its own `.npz` file with `embedding` and `timestamps` arrays.

    Parameters
    ----------
    filepath : str or list[str]
        Path or list of paths to WAV file(s) to be processed.
    output_dir : str or None
        Path to directory for saving output files. If None, output files will
        be saved to the directory containing the input file.
    suffix : str or None
        String to be appended to the output filename, i.e. <base filename>_<suffix>.npz.
        If None, then no suffix will be added, i.e. <base filename>.npz.
    model : tf.keras.Model or None
        Loaded model object. If a model is provided, then `input_repr`,
        `content_type`, and `embedding_size` will be ignored.
        If None is provided, the model will be loaded using
        the provided values of `input_repr`, `content_type` and
        `embedding_size`.
    input_repr : "linear", "mel128", or "mel256"
        Spectrogram representation used as model input. Ignored if `model` is
        a valid Keras model with a Kapre frontend. This is required with a
        Librosa frontend.
    content_type : "music" or "env"
        Type of content used to train the embedding model. Ignored if `model` is
        a valid Keras model.
    embedding_size : 6144 or 512
        Embedding dimensionality. Ignored if `model` is a valid
        Keras model.
    center : boolean
        If True, pads beginning of signal so timestamps correspond
        to center of window.
    hop_size : float
        Hop size in seconds.
    batch_size : int
        Batch size used for input to embedding model
    overwrite : bool
        If True, overwrites existing output files
    frontend : "kapre" or "librosa"
        The audio frontend to use. By default, it will use "kapre".
    verbose : bool
        If True, prints verbose messages.

    Returns
    -------
    None

    Raises
    ------
    OpenL3Error
        If `filepath` has the wrong type, a file does not exist, or a file
        cannot be read.
    """
    if isinstance(filepath, str):
        filepath_list = [filepath]
    elif isinstance(filepath, list):
        filepath_list = filepath
    else:
        err_msg = 'filepath should be type str or list[str], but got {}.'
        raise OpenL3Error(err_msg.format(filepath))

    if not suffix:
        suffix = ""

    # Load model
    frontend, input_repr = _validate_audio_frontend(frontend, input_repr, model)
    if model is None:
        model = load_audio_embedding_model(input_repr, content_type,
                                           embedding_size, frontend=frontend)

    audio_list = []
    sr_list = []
    batch_filepath_list = []

    total_batch_size = 0

    num_files = len(filepath_list)
    for file_idx, path in enumerate(filepath_list):
        if not os.path.exists(path):
            raise OpenL3Error('File "{}" could not be found.'.format(path))

        if verbose:
            print("openl3: Processing {} ({}/{})".format(path,
                                                         file_idx+1,
                                                         num_files))

        output_path = get_output_path(path, suffix + ".npz",
                                      output_dir=output_dir)
        if os.path.exists(output_path) and not overwrite:
            # Skip if overwriting isn't enabled and output file exists. Do
            # not `continue` here: files queued earlier must still be
            # flushed below when this happens to be the last file.
            err_msg = "openl3: {} exists and overwriting not enabled, skipping."
            print(err_msg.format(output_path))
        else:
            try:
                audio, sr = sf.read(path)
            except Exception:
                err_msg = 'Could not open file "{}":\n{}'
                raise OpenL3Error(err_msg.format(path, traceback.format_exc()))

            audio_list.append(audio)
            sr_list.append(sr)
            batch_filepath_list.append(path)

            # Estimate the clip length after resampling from sr to TARGET_SR
            # so the number of windows it contributes can be counted.
            audio_length = ceil(audio.shape[0] * TARGET_SR / float(sr))
            frame_length = TARGET_SR
            hop_length = int(hop_size * TARGET_SR)
            num_windows = _get_num_windows(audio_length, frame_length,
                                           hop_length, center)
            total_batch_size += num_windows

        is_last_file = file_idx == (num_files - 1)
        if audio_list and (total_batch_size >= batch_size or is_last_file):
            embedding_list, ts_list = get_audio_embedding(
                audio_list, sr_list, model=model,
                input_repr=input_repr,
                content_type=content_type,
                embedding_size=embedding_size,
                center=center,
                hop_size=hop_size,
                batch_size=batch_size,
                frontend=frontend,
                verbose=verbose)
            for fpath, embedding, ts in zip(batch_filepath_list,
                                            embedding_list,
                                            ts_list):
                output_path = get_output_path(fpath, suffix + ".npz",
                                              output_dir=output_dir)

                np.savez(output_path, embedding=embedding, timestamps=ts)
                assert os.path.exists(output_path)

                if verbose:
                    print("openl3: Saved {}".format(output_path))

            # Start a fresh batch
            audio_list = []
            sr_list = []
            batch_filepath_list = []
            total_batch_size = 0


def _preprocess_image_batch(image):
    """
    Preprocesses an image array so that they are rescaled and cropped to the
    appropriate dimensions required by the embedding model.

    Parameters
    ----------
    image : np.ndarray [shape=(H, W, C) or (N, H, W, C)]
        3D or 4D numpy array of image data. If the images are not 224x224,
        frames whose smaller side exceeds 256 are rescaled by the factor
        256 / min(H, W), and then the center 224x224 patch is extracted from
        every frame. Any type is accepted, and will be converted to
        np.float32 in the range [-1,1].
        Signed data-types are assumed to take on negative values.

    Returns
    -------
    batch : np.ndarray [shape=(N, H, W, C)]
        4d numpy array of image data.

    Raises
    ------
    OpenL3Error
        If the image is empty, is not 3D or 4D, does not have 3 channels in
        its last axis, or is smaller than 224 px along height or width.

    Warns
    -----
    OpenL3Warning
        If the image is all zeros.
    """
    import skimage
    import skimage.transform
    if image.size == 0:
        raise OpenL3Error('Got empty image')

    # Warn user if image is all zero
    if np.all(image == 0):
        warnings.warn('Provided image is all zeros', OpenL3Warning)

    # Check image array dimension
    if image.ndim not in (3, 4):
        raise OpenL3Error('RGB image array can only be 3D or 4D (sequence of videos)')

    if image.shape[-1] != 3:
        raise OpenL3Error('Need 3 channel images corresponding to RGB.')

    if image.ndim == 3:
        # Add a batch dimension
        image = image[np.newaxis, ...]

    if min(image.shape[1], image.shape[2]) < 224:
        err_msg = ('Image(s) must be at at least as large as 224x224 px. '
                   'Got image(s) of size {}x{} px')
        raise OpenL3Error(err_msg.format(image.shape[1], image.shape[2]))

    if image.shape[1] != 224 or image.shape[2] != 224:
        # If image is not 224x224, rescale to 256x256, and take center
        # 224x224 image patch, corresponding to what was done in L3
        scaling = 256.0 / min(image.shape[1], image.shape[2])
        # NOTE(review): np.zeros defaults to float64, so on this path `batch`
        # is floating point regardless of the input dtype. The unsigned-int
        # branch of the dtype dispatch below therefore looks reachable only
        # for inputs that are already 224x224 -- confirm this is intended.
        batch = np.zeros((image.shape[0], 224, 224, 3))
        for idx, frame in enumerate(image):
            # Only rescale when the smaller side exceeds 256; frames with a
            # smaller side between 224 and 256 are cropped as-is
            if min(frame.shape[0], frame.shape[1]) > 256:
                # Try the `channel_axis` keyword first and fall back to
                # `multichannel` on TypeError (presumably for scikit-image
                # versions that do not accept `channel_axis` -- TODO confirm)
                try:
                    frame = skimage.transform.rescale(frame, scaling,
                                                      mode='constant',
                                                      cval=0,
                                                      clip=True,
                                                      preserve_range=False,
                                                      channel_axis=-1,
                                                      anti_aliasing=False,
                                                      anti_aliasing_sigma=None)
                except TypeError:
                    frame = skimage.transform.rescale(frame, scaling,
                                                      mode='constant',
                                                      cval=0,
                                                      clip=True,
                                                      preserve_range=False,
                                                      multichannel=True,
                                                      anti_aliasing=False,
                                                      anti_aliasing_sigma=None)
            # Extract the 224x224 patch around the center of the frame
            x1, x2 = frame.shape[:-1]
            startx1 = x1//2-(224//2)
            startx2 = x2//2-(224//2)
            batch[idx] = frame[startx1:startx1+224,startx2:startx2+224]
    else:
        # Already 224x224: use the input unchanged (original dtype preserved)
        batch = image

    # Make sure image correct type
    if batch.dtype in (np.float16, np.float32, np.float64, np.int8,
                       np.int16, np.int32, np.int64):
        batch = skimage.img_as_float32(batch)
    elif batch.dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
        # If unsigned int, convert to range [-1, 1]
        batch = 2 * skimage.img_as_float32(batch) - 1

    # Make sure maximum magnitude is in the range [-1, 1]
    if np.max(np.abs(batch)) > 1:
        batch /= np.max(np.abs(batch))

    return batch


def get_image_embedding(image, frame_rate=None, model=None,
                        input_repr="mel256", content_type="music",
                        embedding_size=8192, batch_size=32, verbose=True):
    """
    Computes and returns L3 embedding for given video frame (image) data.

    Embeddings are computed for every image in the input.

    Parameters
    ----------
    image : np.ndarray [shape=(H, W, C) or (N, H, W, C)] or list[np.ndarray]
        3D or 4D numpy array of image data. If the images are not 224x224,
        the images are resized so that the smallest size is 256 and then
        the center 224x224 patch is extracted from the images. Any type
        is accepted, and will be converted to np.float32 in the range [-1,1].
        Signed data-types are assumed to take on negative values. A list of
        image arrays can also be provided.
    frame_rate : int or list[int] or None
        Video frame rate (if applicable), which if provided results in
        a timestamp array being returned. A list of frame rates can also be
        provided, in which case each entry applies to the image array at the
        same position. If None, no timestamp array is returned.
    model : tf.keras.Model or None
        Loaded model object. If a model is provided, then `input_repr`,
        `content_type`, and `embedding_size` will be ignored.
        If None is provided, the model will be loaded using
        the provided values of `input_repr`, `content_type` and
        `embedding_size`.
    input_repr : "linear", "mel128", or "mel256"
        Spectrogram representation used for to train audio part of embedding
        model. Ignored if `model` is a valid Keras model.
    content_type : "music" or "env"
        Type of content used to train the embedding model. Ignored if `model` is
        a valid Keras model.
    embedding_size : 8192 or 512
        Embedding dimensionality. Ignored if `model` is a valid
        Keras model.
    batch_size : int
        Batch size used for input to embedding model
    verbose : bool
        If True, prints verbose messages.

    Returns
    -------
    embedding : np.ndarray [shape=(N, D)] or list[np.ndarray]
        Array of embeddings for each frame, or a list of such arrays if a
        list of image arrays was provided.
    timestamps : np.ndarray [shape=(N,)] or list[np.ndarray]
        Array of timestamps for each frame, or a list of such arrays. If no
        frame rate is given, this is not returned. When a list of frame
        rates contains None entries, the corresponding list items are None.

    Raises
    ------
    OpenL3Error
        If any argument is invalid, or if the number of image inputs does
        not match the number of frame rates.
    """
    if model is not None and not isinstance(model, tf.keras.Model):
        raise OpenL3Error('Invalid model provided. Must be of type tf.keras.Model'
                          ' but got {}'.format(str(type(model))))

    if str(input_repr) not in ("linear", "mel128", "mel256"):
        raise OpenL3Error('Invalid input representation "{}"'.format(input_repr))

    if str(content_type) not in ("music", "env"):
        raise OpenL3Error('Invalid content type "{}"'.format(content_type))

    if embedding_size not in (8192, 512):
        raise OpenL3Error('Invalid embedding size "{}"'.format(embedding_size))

    if verbose not in (0, 1):
        raise OpenL3Error('Invalid verbosity level {}'.format(verbose))

    if isinstance(image, np.ndarray):
        image_list = [image]
        list_input = False
    elif isinstance(image, list):
        image_list = image
        list_input = True
    else:
        err_msg = 'image must be type list[np.ndarray] or np.ndarray. Got {}'
        raise OpenL3Error(err_msg.format(type(image)))

    if frame_rate is None or isinstance(frame_rate, Real):
        frame_rate_list = [frame_rate] * len(image_list)
    elif isinstance(frame_rate, list):
        frame_rate_list = frame_rate
    else:
        err_msg = 'frame rate must be type list[numbers.Real] or numbers.Real. Got {}'
        raise OpenL3Error(err_msg.format(type(frame_rate)))

    if len(image_list) != len(frame_rate_list):
        err_msg = ('Mismatch between number of image inputs ({}) and number of'
                   ' frame rates ({})')
        raise OpenL3Error(err_msg.format(len(image_list), len(frame_rate_list)))

    if not image_list:
        # Only reachable with an empty input list; there is nothing to embed
        # and np.vstack would fail on an empty sequence.
        return [] if frame_rate is None else ([], [])

    # Get embedding model
    if model is None:
        model = load_image_embedding_model(input_repr, content_type, embedding_size)

    batch = []
    file_batch_size_list = []
    for frames, rate in zip(image_list, frame_rate_list):
        if (rate is not None) and (not isinstance(rate, Real) or rate <= 0):
            raise OpenL3Error('Invalid frame rate {}'.format(rate))

        # Preprocess image to scale appropriate scale
        x = _preprocess_image_batch(frames)
        batch.append(x)
        file_batch_size_list.append(x.shape[0])

    batch = np.vstack(batch)
    # Compute embeddings
    batch_embedding = model.predict(batch, verbose=1 if verbose else 0,
                                    batch_size=batch_size)

    # Split the stacked embeddings back into one array per input and build
    # timestamps from each input's own frame rate.
    embedding_list = []
    ts_list = []
    start_idx = 0
    for file_batch_size, rate in zip(file_batch_size_list, frame_rate_list):
        end_idx = start_idx + file_batch_size
        embedding = batch_embedding[start_idx:end_idx, ...]
        embedding_list.append(embedding)
        if rate is None:
            ts_list.append(None)
        else:
            ts_list.append(np.arange(embedding.shape[0]) / float(rate))
        start_idx = end_idx

    return_timestamps = any(rate is not None for rate in frame_rate_list)

    if return_timestamps:
        if not list_input:
            return embedding_list[0], ts_list[0]
        return embedding_list, ts_list

    if not list_input:
        return embedding_list[0]
    return embedding_list


def process_image_file(filepath, output_dir=None, suffix=None, model=None,
                       input_repr="mel256", content_type="music",
                       embedding_size=8192, batch_size=32,
                       overwrite=False, verbose=True):
    """
    Computes and saves L3 embedding for a given image file

    Images are read one at a time and queued until `batch_size` images have
    been collected (or the last file has been seen), at which point the
    queued images are embedded together and each result is written to its
    own `.npz` file with an `embedding` array.

    Parameters
    ----------
    filepath : str or list[str]
        Path or list of paths to image file(s) to be processed.
    output_dir : str or None
        Path to directory for saving output files. If None, output files will
        be saved to the directory containing the input file.
    suffix : str or None
        String to be appended to the output filename, i.e. <base filename>_<suffix>.npz.
        If None, then no suffix will be added, i.e. <base filename>.npz.
    model : tf.keras.Model or None
        Loaded model object. If a model is provided, then `input_repr`,
        `content_type`, and `embedding_size` will be ignored.
        If None is provided, the model will be loaded using
        the provided values of `input_repr`, `content_type` and
        `embedding_size`.
    input_repr : "linear", "mel128", or "mel256"
        Spectrogram representation used for model. Ignored if `model` is
        a valid Keras model.
    content_type : "music" or "env"
        Type of content used to train the embedding model. Ignored if `model` is
        a valid Keras model.
    embedding_size : 8192 or 512
        Embedding dimensionality. Ignored if `model` is a valid
        Keras model.
    batch_size : int
        Number of images queued before embeddings are computed; also passed
        on as the batch size used for input to the embedding model.
    overwrite : bool
        If True, overwrites existing output files
    verbose : bool
        If True, prints verbose messages.

    Returns
    -------
    None

    Raises
    ------
    OpenL3Error
        If `filepath` has the wrong type, a file does not exist, or a file
        cannot be read.
    """
    import skimage.io
    if isinstance(filepath, str):
        filepath_list = [filepath]
    elif isinstance(filepath, list):
        filepath_list = filepath
    else:
        err_msg = 'filepath should be type str or list[str], but got {}.'
        raise OpenL3Error(err_msg.format(filepath))

    # Load model
    if model is None:
        model = load_image_embedding_model(input_repr, content_type,
                                           embedding_size)

    if not suffix:
        suffix = ""

    image_list = []
    batch_filepath_list = []

    num_files = len(filepath_list)
    for file_idx, path in enumerate(filepath_list):
        if not os.path.exists(path):
            raise OpenL3Error('File "{}" could not be found.'.format(path))

        if verbose:
            print("openl3: Processing {} ({}/{})".format(path,
                                                         file_idx+1,
                                                         num_files))

        output_path = get_output_path(path, suffix + ".npz",
                                      output_dir=output_dir)
        if os.path.exists(output_path) and not overwrite:
            # Skip if overwriting isn't enabled and output file exists. Do
            # not `continue` here: images queued earlier must still be
            # flushed below when this happens to be the last file.
            print("openl3: {} exists, skipping.".format(output_path))
        else:
            try:
                image = skimage.io.imread(path)
                # Get rid of alpha dimension
                if image.shape[-1] == 4:
                    image = image[..., :3]
            except Exception:
                raise OpenL3Error('Could not open file "{}":\n{}'.format(path, traceback.format_exc()))

            # Each image becomes its own single-frame batch
            image_list.append(image[np.newaxis, ...])
            batch_filepath_list.append(path)

        is_last_file = file_idx == (num_files - 1)
        if image_list and (len(image_list) >= batch_size or is_last_file):
            embedding_list = get_image_embedding(
                image_list, model=model,
                input_repr=input_repr,
                content_type=content_type,
                embedding_size=embedding_size,
                batch_size=batch_size,
                verbose=verbose)
            for fpath, embedding in zip(batch_filepath_list, embedding_list):
                output_path = get_output_path(fpath, suffix + ".npz",
                                              output_dir=output_dir)

                np.savez(output_path, embedding=embedding)
                assert os.path.exists(output_path)

                if verbose:
                    print("openl3: Saved {}".format(output_path))

            # Start a fresh batch
            image_list = []
            batch_filepath_list = []


def _save_embedding_batch(filepath_list, embedding_list, ts_list, suffix,
                          output_dir=None, verbose=True):
    """
    Saves one ``.npz`` file per input file for a batch of computed embeddings.

    Parameters
    ----------
    filepath_list : list[str]
        Paths of the input files the embeddings were computed from.
    embedding_list : list
        Embeddings, one entry per path in `filepath_list`.
    ts_list : list
        Timestamps, one entry per path in `filepath_list`.
    suffix : str
        Output filename suffix without extension (".npz" is appended here).
    output_dir : str or None
        Directory passed through to `get_output_path`.
    verbose : bool
        If True, prints the path of every file that is written.

    Returns
    -------
    None

    """
    for fpath, embedding, ts in zip(filepath_list, embedding_list, ts_list):
        output_path = get_output_path(fpath, suffix + ".npz",
                                      output_dir=output_dir)

        np.savez(output_path, embedding=embedding, timestamps=ts)
        assert os.path.exists(output_path)

        if verbose:
            print("openl3: Saved {}".format(output_path))


def process_video_file(filepath, output_dir=None, suffix=None,
                       audio_model=None, image_model=None,
                       input_repr=None, content_type="music",
                       audio_embedding_size=6144, audio_center=True,
                       audio_hop_size=0.1, image_embedding_size=8192,
                       audio_batch_size=32, image_batch_size=32,
                       audio_frontend="kapre",
                       overwrite=False, verbose=True):
    """
    Computes and saves L3 audio and video frame embeddings for a given video file

    Note that image embeddings are computed for every frame of the video. Also
    note that embeddings for the audio and images are not temporally aligned.
    Please refer to the timestamps in the output files for the corresponding
    timestamps for each set of embeddings.

    Files are accumulated into batches before the embedding models are run.
    Any batch that is still pending once all files have been visited is
    processed after the loop, so trailing files that are skipped (because
    their outputs already exist) cannot cause earlier files to be dropped.

    Parameters
    ----------
    filepath : str or list[str]
        Path or list of paths to video file(s) to be processed.
    output_dir : str or None
        Path to directory for saving output files. If None, output files will
        be saved to the directory containing the input file.
    suffix : str or None
        String to be appended to the output filename,
        i.e. <base filename>_<modality>_<suffix>.npz.
        If None, then no suffix will be added,
        i.e. <base filename>_<modality>.npz.
    audio_model : tf.keras.Model or None
        Loaded audio model object. If a model is provided, then `input_repr`,
        `content_type`, and `audio_embedding_size` will be ignored.
        If None is provided, the model will be loaded using
        the provided values of `input_repr`, `content_type` and
        `audio_embedding_size`.
    image_model : tf.keras.Model or None
        Loaded image model object. If a model is provided, then `input_repr`,
        `content_type`, and `image_embedding_size` will be ignored.
        If None is provided, the model will be loaded using
        the provided values of `input_repr`, `content_type` and
        `image_embedding_size`.
    input_repr : "linear", "mel128", or "mel256"
        Spectrogram representation used for audio model. Ignored if
        `audio_model` is a valid Keras model with a Kapre frontend. This is
        required with a Librosa frontend.
    content_type : "music" or "env"
        Type of content used to train the embedding model. Ignored if a valid
        Keras model is provided.
    audio_embedding_size : 6144 or 512
        Audio embedding dimensionality. Ignored if `audio_model` is a valid
        Keras model.
    audio_center : boolean
        If True, pads beginning of audio signal so timestamps correspond
        to center of window.
    audio_hop_size : float
        Hop size in seconds.
    image_embedding_size : 8192 or 512
        Video frame embedding dimensionality. Ignored if `image_model` is a
        valid Keras model.
    audio_batch_size : int
        Batch size used for input to audio embedding model
    image_batch_size : int
        Batch size used for input to image embedding model
    audio_frontend : "kapre" or "librosa"
        The audio frontend to use. By default, it will use "kapre".
    overwrite : bool
        If True, overwrites existing output files
    verbose : bool
        If True, prints verbose messages.

    Returns
    -------
    None
        Embeddings are written to disk as
        <base filename>_audio[_<suffix>].npz and
        <base filename>_image[_<suffix>].npz.

    Raises
    ------
    OpenL3Error
        If `filepath` is neither a str nor a list, if an input file does not
        exist, or if a video file cannot be opened/decoded.

    """
    # Imported lazily so that moviepy is only needed for video processing.
    from moviepy.video.io.VideoFileClip import VideoFileClip
    if isinstance(filepath, str):
        filepath_list = [filepath]
    elif isinstance(filepath, list):
        filepath_list = filepath
    else:
        err_msg = 'filepath should be type str or list[str], but got {}.'
        raise OpenL3Error(err_msg.format(filepath))

    audio_frontend, input_repr = _validate_audio_frontend(audio_frontend, input_repr, audio_model)

    # Load models
    if not audio_model:
        audio_model = load_audio_embedding_model(input_repr, content_type,
                                                 audio_embedding_size,
                                                 frontend=audio_frontend)
    if not image_model:
        image_model = load_image_embedding_model(input_repr, content_type,
                                                 image_embedding_size)

    audio_suffix, image_suffix = "audio", "image"
    if suffix:
        audio_suffix += "_" + suffix
        image_suffix += "_" + suffix

    # Pending audio batch: parallel lists with one entry per queued file.
    audio_list = []
    sr_list = []
    audio_batch_filepath_list = []
    audio_window_counts = []

    # Pending image batch: parallel lists with one entry per queued file.
    image_list = []
    frame_rate_list = []
    image_batch_filepath_list = []

    def _flush_audio():
        """Embed and save every queued audio track, then empty the queue."""
        if not audio_list:
            return
        embedding_list, ts_list = get_audio_embedding(
            audio_list, sr_list, model=audio_model,
            input_repr=input_repr,
            content_type=content_type,
            embedding_size=audio_embedding_size,
            center=audio_center,
            hop_size=audio_hop_size,
            batch_size=audio_batch_size,
            frontend=audio_frontend,
            verbose=verbose)
        _save_embedding_batch(audio_batch_filepath_list, embedding_list,
                              ts_list, audio_suffix,
                              output_dir=output_dir, verbose=verbose)
        # Cleared in place so the closure keeps referring to the same lists.
        del audio_list[:]
        del sr_list[:]
        del audio_batch_filepath_list[:]
        del audio_window_counts[:]

    def _flush_images():
        """Embed and save every queued frame stack, then empty the queue."""
        if not image_list:
            return
        embedding_list, ts_list = get_image_embedding(
            image_list, frame_rate_list,
            model=image_model, input_repr=input_repr,
            content_type=content_type,
            embedding_size=image_embedding_size,
            batch_size=image_batch_size,
            verbose=verbose)
        _save_embedding_batch(image_batch_filepath_list, embedding_list,
                              ts_list, image_suffix,
                              output_dir=output_dir, verbose=verbose)
        del image_list[:]
        del frame_rate_list[:]
        del image_batch_filepath_list[:]

    num_files = len(filepath_list)
    for file_idx, path in enumerate(filepath_list):

        if not os.path.exists(path):
            raise OpenL3Error('File "{}" could not be found.'.format(path))

        if verbose:
            print("openl3: Processing {} ({}/{})".format(path,
                                                         file_idx+1,
                                                         num_files))

        # Skip if overwriting isn't enabled and output file exists
        audio_output_path = get_output_path(path, audio_suffix + ".npz",
                                            output_dir=output_dir)
        image_output_path = get_output_path(path, image_suffix + ".npz",
                                            output_dir=output_dir)
        skip_audio = os.path.exists(audio_output_path) and not overwrite
        skip_image = os.path.exists(image_output_path) and not overwrite

        if skip_audio and skip_image:
            err_msg = "openl3: {} and {} exist, skipping."
            print(err_msg.format(audio_output_path, image_output_path))
            continue

        audio = None
        images = None
        clip_fps = None
        clip = None
        try:
            clip = VideoFileClip(path, target_resolution=(256, 256),
                                 audio_fps=TARGET_SR)
            # Only decode the modalities that will actually be embedded.
            if not skip_audio:
                audio = clip.audio.to_soundarray(fps=TARGET_SR)
            if not skip_image:
                images = np.array(list(clip.iter_frames()))
                clip_fps = clip.fps
        except Exception:
            err_msg = 'Could not open file "{}":\n{}'
            raise OpenL3Error(err_msg.format(path, traceback.format_exc()))
        finally:
            # Release the reader resources held by the clip; without this
            # they stay open for every processed file.
            if clip is not None:
                clip.close()

        if not skip_audio:
            audio_list.append(audio)
            sr_list.append(TARGET_SR)
            audio_batch_filepath_list.append(path)
            audio_hop_length = int(audio_hop_size * TARGET_SR)
            # Same window estimate as the rest of this module; it also
            # accounts for the padding added when `audio_center` is True.
            audio_window_counts.append(
                _get_num_windows(audio.shape[0], TARGET_SR,
                                 audio_hop_length, audio_center))
        else:
            err_msg = "openl3: {} exists, skipping audio embedding extraction."
            print(err_msg.format(audio_output_path))

        if not skip_image:
            image_list.append(images)
            frame_rate_list.append(int(clip_fps))
            image_batch_filepath_list.append(path)
        else:
            err_msg = "openl3: {} exists, skipping image embedding extraction."
            print(err_msg.format(image_output_path))

        if sum(audio_window_counts) >= audio_batch_size:
            _flush_audio()

        # NOTE: this threshold counts queued videos, not individual frames.
        if len(image_list) >= image_batch_size:
            _flush_images()

    # Process whatever is still queued. Doing this after the loop (instead of
    # on the last loop index) guarantees it also happens when the final
    # file(s) were skipped with `continue`.
    _flush_audio()
    _flush_images()


def get_output_path(filepath, suffix, output_dir=None):
    """
    Returns path to output file corresponding to the given input file.

    The input file's extension is stripped and replaced by `suffix`. If
    `suffix` starts with a period it is treated as a bare extension and
    appended directly (``<base><suffix>``); otherwise it is joined with an
    underscore (``<base>_<suffix>``). An empty `suffix` yields just the base
    filename.

    Parameters
    ----------
    filepath : str
        Path to audio file to be processed
    suffix : str
        String to append to filename (including extension)
    output_dir : str or None
        Path to directory where file will be saved. If None, will use directory of given filepath.

    Returns
    -------
    output_path : str
        Path to output file

    """
    base_filename = os.path.splitext(os.path.basename(filepath))[0]
    if not output_dir:
        output_dir = os.path.dirname(filepath)

    # startswith() is safe on an empty string, whereas indexing suffix[0]
    # raises IndexError; an empty suffix must not add a dangling underscore.
    if suffix and not suffix.startswith('.'):
        output_filename = "{}_{}".format(base_filename, suffix)
    else:
        output_filename = base_filename + suffix

    return os.path.join(output_dir, output_filename)