vision_utils

Functions

fastvideo.models.vision_utils.create_default_image

create_default_image(width: int = 512, height: int = 512, color: tuple[int, int, int] = (0, 0, 0)) -> Image

Create a solid-color PIL image (black by default).

Parameters:

Name Type Description Default
width int

Image width in pixels

512
height int

Image height in pixels

512
color tuple[int, int, int]

RGB color tuple

(0, 0, 0)

Returns:

Type Description
Image

PIL.Image.Image: A new PIL image with specified dimensions and color

Source code in fastvideo/models/vision_utils.py
def create_default_image(width: int = 512, height: int = 512, color: tuple[int, int, int] = (0, 0, 0)) -> PIL.Image.Image:
    """
    Create a solid-color PIL image (black by default).

    Args:
        width: Image width in pixels
        height: Image height in pixels
        color: RGB color tuple

    Returns:
        PIL.Image.Image: A new PIL image with specified dimensions and color
    """
    return PIL.Image.new("RGB", (width, height), color=color)
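
A minimal usage sketch (assuming fastvideo is importable; the sizes and color below are illustrative):

from fastvideo.models.vision_utils import create_default_image

# Default: a 512x512 black RGB image.
img = create_default_image()
print(img.size, img.mode)  # (512, 512) RGB

# A 256x128 solid-red placeholder.
red = create_default_image(width=256, height=128, color=(255, 0, 0))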

fastvideo.models.vision_utils.get_default_height_width

get_default_height_width(image: Image | ndarray | Tensor, vae_scale_factor: int, height: int | None = None, width: int | None = None) -> tuple[int, int]

Returns the height and width of the image, rounded down to the nearest integer multiple of vae_scale_factor.

Parameters:

Name Type Description Default
image `Union[PIL.Image.Image, np.ndarray, torch.Tensor]`

The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it should have shape [batch, height, width] or [batch, height, width, channels]. If it is a PyTorch tensor, it should have shape [batch, channels, height, width].

required
vae_scale_factor int

The VAE scale factor; the returned height and width are rounded down to a multiple of this value.

required
height `Optional[int]`, *optional*, defaults to `None`

The height of the preprocessed image. If None, the height of the image input will be used.

None
width `Optional[int]`, *optional*, defaults to `None`

The width of the preprocessed image. If None, the width of the image input will be used.

None

Returns:

Type Description
tuple[int, int]

Tuple[int, int]: A tuple containing the height and width, both rounded down to the nearest integer multiple of vae_scale_factor.

Source code in fastvideo/models/vision_utils.py
def get_default_height_width(
    image: PIL.Image.Image | np.ndarray | torch.Tensor,
    vae_scale_factor: int,
    height: int | None = None,
    width: int | None = None,
) -> tuple[int, int]:
    r"""
    Returns the height and width of the image, rounded down to the nearest integer multiple of `vae_scale_factor`.

    Args:
        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
            The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
            should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
            tensor, it should have shape `[batch, channels, height, width]`.
        vae_scale_factor (`int`):
            The VAE scale factor; the returned height and width are rounded down to a multiple of this value.
        height (`Optional[int]`, *optional*, defaults to `None`):
            The height of the preprocessed image. If `None`, the height of the `image` input will be used.
        width (`Optional[int]`, *optional*, defaults to `None`):
            The width of the preprocessed image. If `None`, the width of the `image` input will be used.

    Returns:
        `Tuple[int, int]`:
            A tuple containing the height and width, both rounded down to the nearest integer multiple of
            `vae_scale_factor`.
    """

    if height is None:
        if isinstance(image, PIL.Image.Image):
            height = image.height
        elif isinstance(image, torch.Tensor):
            height = image.shape[2]
        else:
            height = image.shape[1]

    if width is None:
        if isinstance(image, PIL.Image.Image):
            width = image.width
        elif isinstance(image, torch.Tensor):
            width = image.shape[3]
        else:
            width = image.shape[2]

    width, height = (x - x % vae_scale_factor for x in (width, height)
                     )  # resize to integer multiple of vae_scale_factor

    return height, width
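
A quick sketch of the rounding behavior (the tensor shapes here are illustrative):

import torch
from fastvideo.models.vision_utils import get_default_height_width

# A [batch, channels, height, width] tensor whose spatial dimensions
# are not multiples of the scale factor.
frames = torch.zeros(1, 3, 517, 771)

# Both dimensions are rounded down to the nearest multiple of 8.
print(get_default_height_width(frames, vae_scale_factor=8))  # (512, 768)

# Explicit height/width overrides are snapped to the same grid.
print(get_default_height_width(frames, 8, height=300, width=300))  # (296, 296)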

fastvideo.models.vision_utils.load_image

load_image(image: str | Image, convert_method: Callable[[Image], Image] | None = None) -> Image

Loads image to a PIL Image.

Parameters:

Name Type Description Default
image `str` or `PIL.Image.Image`

The image to convert to the PIL Image format.

required
convert_method Callable[[PIL.Image.Image], PIL.Image.Image], *optional*

A conversion method to apply to the image after loading it. When set to None, the image will be converted to "RGB".

None

Returns:

Type Description
Image

PIL.Image.Image: A PIL Image.

Source code in fastvideo/models/vision_utils.py
def load_image(
    image: str | PIL.Image.Image,
    convert_method: Callable[[PIL.Image.Image], PIL.Image.Image] | None = None
) -> PIL.Image.Image:
    """
    Loads `image` to a PIL Image.

    Args:
        image (`str` or `PIL.Image.Image`):
            The image to convert to the PIL Image format.
        convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], *optional*):
            A conversion method to apply to the image after loading it. When set to `None`, the image will be
            converted to "RGB".

    Returns:
        `PIL.Image.Image`:
            A PIL Image.
    """
    if isinstance(image, str):
        if image.startswith("http://") or image.startswith("https://"):
            image = PIL.Image.open(requests.get(image, stream=True).raw)
        elif os.path.isfile(image):
            image = PIL.Image.open(image)
        else:
            raise ValueError(
                f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {image} is not a valid path."
            )
    elif isinstance(image, PIL.Image.Image):
        image = image
    else:
        raise ValueError(
            "Incorrect format used for the image. Should be a URL linking to an image, a local path, or a PIL image."
        )

    image = PIL.ImageOps.exif_transpose(image)

    if convert_method is not None:
        image = convert_method(image)
    else:
        image = image.convert("RGB")

    return image
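
A minimal usage sketch; the file name and URL are placeholders:

from fastvideo.models.vision_utils import load_image

# Local file: EXIF orientation is applied, then the image is converted to RGB.
img = load_image("photo.jpg")

# URL input, with a custom conversion applied after loading
# (grayscale here is just an illustrative choice).
gray = load_image(
    "https://example.com/photo.png",
    convert_method=lambda im: im.convert("L"),
)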

fastvideo.models.vision_utils.load_video

load_video(video: str, convert_method: Callable[[list[Image]], list[Image]] | None = None, return_fps: bool = False) -> tuple[list[Image], float | Any] | list[Image]

Loads video to a list of PIL Image.

Parameters:

Name Type Description Default
video `str`

A URL or Path to a video to convert to a list of PIL Image format.

required
convert_method Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]], *optional*

A conversion method to apply to the video after loading it. When set to None, the images will be converted to "RGB".

None
return_fps `bool`, *optional*, defaults to `False`

Whether to return the FPS of the video. If True, returns a tuple of (images, fps). If False, returns only the list of images.

False

Returns:

Type Description
tuple[list[Image], float | Any] | list[Image]

List[PIL.Image.Image] or Tuple[List[PIL.Image.Image], float | None]: The video as a list of PIL images. If return_fps is True, also returns the original FPS.

Source code in fastvideo/models/vision_utils.py
def load_video(
    video: str,
    convert_method: Callable[[list[PIL.Image.Image]], list[PIL.Image.Image]]
    | None = None,
    return_fps: bool = False,
) -> tuple[list[PIL.Image.Image], float | Any] | list[PIL.Image.Image]:
    """
    Loads `video` to a list of PIL Image.
    Args:
        video (`str`):
            A URL or Path to a video to convert to a list of PIL Image format.
        convert_method (Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]], *optional*):
            A conversion method to apply to the video after loading it. When set to `None` the images will be converted
            to "RGB".
        return_fps (`bool`, *optional*, defaults to `False`):
            Whether to return the FPS of the video. If `True`, returns a tuple of (images, fps).
            If `False`, returns only the list of images.
    Returns:
        `List[PIL.Image.Image]` or `Tuple[List[PIL.Image.Image], float | None]`:
            The video as a list of PIL images. If `return_fps` is True, also returns the original FPS.
    """
    is_url = video.startswith("http://") or video.startswith("https://")
    is_file = os.path.isfile(video)
    was_tempfile_created = False

    if not (is_url or is_file):
        raise ValueError(
            f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {video} is not a valid path."
        )

    if is_url:
        response = requests.get(video, stream=True)
        if response.status_code != 200:
            raise ValueError(
                f"Failed to download video. Status code: {response.status_code}"
            )

        parsed_url = urlparse(video)
        file_name = os.path.basename(unquote(parsed_url.path))

        suffix = os.path.splitext(file_name)[1] or ".mp4"
        with tempfile.NamedTemporaryFile(suffix=suffix,
                                         delete=False) as temp_file:
            video_path = temp_file.name
            video_data = response.iter_content(chunk_size=8192)
            for chunk in video_data:
                temp_file.write(chunk)
        was_tempfile_created = True
    else:
        video_path = video

    pil_images = []
    original_fps = None

    try:
        if video_path.endswith(".gif"):
            pil_images, original_fps = _load_gif(video_path)
        else:
            pil_images, original_fps = _load_video_with_ffmpeg(video_path)
    finally:
        # Clean up temporary file if it was created
        if was_tempfile_created and os.path.exists(video_path):
            os.remove(video_path)

    if convert_method is not None:
        pil_images = convert_method(pil_images)

    return (pil_images, original_fps) if return_fps else pil_images
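
A minimal usage sketch; the paths are placeholders, and non-GIF input is decoded via ffmpeg:

from fastvideo.models.vision_utils import load_video

# Local file: returns only the list of PIL frames.
frames = load_video("clip.mp4")

# URL input is downloaded to a temporary file first; with return_fps=True
# a (frames, fps) tuple is returned instead.
frames, fps = load_video("https://example.com/clip.mp4", return_fps=True)
print(len(frames), fps)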

fastvideo.models.vision_utils.normalize

normalize(images: ndarray | Tensor) -> ndarray | Tensor

Normalize an image array from [0, 1] to [-1, 1].

Parameters:

Name Type Description Default
images `np.ndarray` or `torch.Tensor`

The image array to normalize.

required

Returns:

Type Description
ndarray | Tensor

np.ndarray or torch.Tensor: The normalized image array.

Source code in fastvideo/models/vision_utils.py
def normalize(images: np.ndarray | torch.Tensor) -> np.ndarray | torch.Tensor:
    r"""
    Normalize an image array from [0, 1] to [-1, 1].

    Args:
        images (`np.ndarray` or `torch.Tensor`):
            The image array to normalize.

    Returns:
        `np.ndarray` or `torch.Tensor`:
            The normalized image array.
    """
    return 2.0 * images - 1.0
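
The mapping is the affine transform x -> 2x - 1, which takes [0, 1] onto [-1, 1]. A quick numeric check:

import numpy as np
from fastvideo.models.vision_utils import normalize

x = np.array([0.0, 0.25, 0.5, 1.0], dtype=np.float32)
print(normalize(x))  # [-1.  -0.5  0.   1. ]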

fastvideo.models.vision_utils.numpy_to_pt

numpy_to_pt(images: ndarray) -> Tensor

Convert a NumPy image to a PyTorch tensor.

Parameters:

Name Type Description Default
images `np.ndarray`

The NumPy image array to convert to PyTorch format.

required

Returns:

Type Description
Tensor

torch.Tensor: A PyTorch tensor representation of the images.

Source code in fastvideo/models/vision_utils.py
def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
    r"""
    Convert a NumPy image to a PyTorch tensor.

    Args:
        images (`np.ndarray`):
            The NumPy image array to convert to PyTorch format.

    Returns:
        `torch.Tensor`:
            A PyTorch tensor representation of the images.
    """
    if images.ndim == 3:
        images = images[..., None]

    images = torch.from_numpy(images.transpose(0, 3, 1, 2))
    return images
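
A sketch of the shape handling (the array contents are illustrative):

import numpy as np
from fastvideo.models.vision_utils import numpy_to_pt

# Channels-last NumPy batch becomes a channels-first PyTorch tensor.
batch = np.zeros((2, 64, 64, 3), dtype=np.float32)
print(numpy_to_pt(batch).shape)  # torch.Size([2, 3, 64, 64])

# A 3-D [batch, height, width] array gains a trailing channel axis first.
gray = np.zeros((2, 64, 64), dtype=np.float32)
print(numpy_to_pt(gray).shape)  # torch.Size([2, 1, 64, 64])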

fastvideo.models.vision_utils.pil_to_numpy

pil_to_numpy(images: list[Image] | Image) -> ndarray

Convert a PIL image or a list of PIL images to NumPy arrays.

Parameters:

Name Type Description Default
images `PIL.Image.Image` or `List[PIL.Image.Image]`

The PIL image or list of images to convert to NumPy format.

required

Returns:

Type Description
ndarray

np.ndarray: A NumPy array representation of the images.

Source code in fastvideo/models/vision_utils.py
def pil_to_numpy(images: list[PIL.Image.Image] | PIL.Image.Image) -> np.ndarray:
    r"""
    Convert a PIL image or a list of PIL images to NumPy arrays.

    Args:
        images (`PIL.Image.Image` or `List[PIL.Image.Image]`):
            The PIL image or list of images to convert to NumPy format.

    Returns:
        `np.ndarray`:
            A NumPy array representation of the images.
    """
    if not isinstance(images, list):
        images = [images]
    images = [np.array(image).astype(np.float32) / 255.0 for image in images]
    images_arr: np.ndarray = np.stack(images, axis=0)

    return images_arr
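
A minimal sketch of the resulting shape and dtype:

import PIL.Image
from fastvideo.models.vision_utils import pil_to_numpy

# A single image is wrapped in a list, scaled to [0, 1] float32, and
# stacked into a [batch, height, width, channels] array.
img = PIL.Image.new("RGB", (64, 32))  # width 64, height 32
arr = pil_to_numpy(img)
print(arr.shape, arr.dtype)  # (1, 32, 64, 3) float32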

fastvideo.models.vision_utils.preprocess_reference_image_for_clip

preprocess_reference_image_for_clip(image: Image, device: device) -> Image

Preprocess reference image to match CLIP encoder requirements.

Applies normalization, resizing to 224x224, and denormalization to ensure the image is in the correct format for CLIP processing.

Parameters:

Name Type Description Default
image Image

Input PIL image

required
device device

Target device for tensor operations

required

Returns:

Type Description
Image

Preprocessed PIL image ready for CLIP encoder

Source code in fastvideo/models/vision_utils.py
def preprocess_reference_image_for_clip(image: PIL.Image.Image, device: torch.device) -> PIL.Image.Image:
    """
    Preprocess reference image to match CLIP encoder requirements.

    Applies normalization, resizing to 224x224, and denormalization to ensure
    the image is in the correct format for CLIP processing.

    Args:
        image: Input PIL image
        device: Target device for tensor operations

    Returns:
        Preprocessed PIL image ready for CLIP encoder
    """
    # Convert PIL to tensor and normalize to [-1, 1] range
    image_tensor = TF.to_tensor(image).sub_(0.5).div_(0.5).to(device)

    # Resize to CLIP's expected input size (224x224) using bicubic interpolation
    resized_tensor = F.interpolate(
        image_tensor.unsqueeze(0),
        size=(224, 224),
        mode='bicubic',
        align_corners=False
    ).squeeze(0)

    # Denormalize back to [0, 1] range
    denormalized_tensor = resized_tensor.mul_(0.5).add_(0.5)

    return TF.to_pil_image(denormalized_tensor)
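
A minimal usage sketch (the input resolution and color are illustrative):

import PIL.Image
import torch
from fastvideo.models.vision_utils import preprocess_reference_image_for_clip

# Any input resolution comes back as a 224x224 PIL image, resampled
# with bicubic interpolation on the chosen device.
ref = PIL.Image.new("RGB", (640, 480), color=(128, 64, 32))
out = preprocess_reference_image_for_clip(ref, torch.device("cpu"))
print(out.size)  # (224, 224)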

fastvideo.models.vision_utils.resize

resize(image: Image | ndarray | Tensor, height: int, width: int, resize_mode: str = 'default', resample: str = 'lanczos') -> Image | ndarray | Tensor

Resize image.

Parameters:

Name Type Description Default
image `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`

The image input, can be a PIL image, numpy array or pytorch tensor.

required
height `int`

The height to resize to.

required
width `int`

The width to resize to.

required
resize_mode `str`, *optional*, defaults to `default`

The resize mode to use; can be one of default, fill, or crop. If default, will resize the image to fit within the specified width and height, and it may not maintain the original aspect ratio. If fill, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling the empty space with data from the image. If crop, will resize the image to fill the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess. Note that resize modes fill and crop are only supported for PIL image input.

'default'
resample `str`, *optional*, defaults to `lanczos`

The resampling filter to use when image is a PIL image; must be a key of PIL_INTERPOLATION.

'lanczos'

Returns:

Type Description
Image | ndarray | Tensor

PIL.Image.Image, np.ndarray or torch.Tensor: The resized image.

Source code in fastvideo/models/vision_utils.py
def resize(
    image: PIL.Image.Image | np.ndarray | torch.Tensor,
    height: int,
    width: int,
    resize_mode: str = "default",  # "default", "fill", "crop"
    resample: str = "lanczos",
) -> PIL.Image.Image | np.ndarray | torch.Tensor:
    """
    Resize image.

    Args:
        image (`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
            The image input, can be a PIL image, numpy array or pytorch tensor.
        height (`int`):
            The height to resize to.
        width (`int`):
            The width to resize to.
        resize_mode (`str`, *optional*, defaults to `default`):
            The resize mode to use; can be one of `default`, `fill`, or `crop`. If `default`, will resize the image
            to fit within the specified width and height, and it may not maintain the original aspect ratio. If
            `fill`, will resize the image to fit within the specified width and height, maintaining the aspect
            ratio, and then center the image within the dimensions, filling the empty space with data from the
            image. If `crop`, will resize the image to fill the specified width and height, maintaining the aspect
            ratio, and then center the image within the dimensions, cropping the excess. Note that resize modes
            `fill` and `crop` are only supported for PIL image input.
        resample (`str`, *optional*, defaults to `"lanczos"`):
            The resampling filter to use when `image` is a PIL image; must be a key of `PIL_INTERPOLATION`.

    Returns:
        `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
            The resized image.
    """
    if resize_mode != "default" and not isinstance(image, PIL.Image.Image):
        raise ValueError(
            f"Only PIL image input is supported for resize_mode {resize_mode}")
    assert isinstance(image, PIL.Image.Image)
    if resize_mode == "default":
        image = image.resize((width, height),
                             resample=PIL_INTERPOLATION[resample])
    else:
        raise ValueError(f"resize_mode {resize_mode} is not supported")
    return image
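
A minimal usage sketch. Note that, as implemented above, only PIL input with resize_mode="default" reaches a successful return; the fill and crop branches currently raise ValueError:

import PIL.Image
from fastvideo.models.vision_utils import resize

img = PIL.Image.new("RGB", (640, 480))
out = resize(img, height=256, width=384)  # default mode, lanczos resampling
print(out.size)  # PIL reports (width, height): (384, 256)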