vision_utils

Functions

fastvideo.models.vision_utils.create_default_image

create_default_image(width: int = 512, height: int = 512, color: tuple[int, int, int] = (0, 0, 0)) -> Image

Create a solid-color PIL image (black by default).

Parameters:

Name Type Description Default
width int

Image width in pixels

512
height int

Image height in pixels

512
color tuple[int, int, int]

RGB color tuple

(0, 0, 0)

Returns:

Type Description
Image

PIL.Image.Image: A new PIL image with specified dimensions and color

Source code in fastvideo/models/vision_utils.py
def create_default_image(width: int = 512, height: int = 512, color: tuple[int, int, int] = (0, 0, 0)) -> PIL.Image.Image:
    """
    Create a solid-color PIL image (black by default).

    Args:
        width: Image width in pixels
        height: Image height in pixels
        color: RGB color tuple

    Returns:
        PIL.Image.Image: A new PIL image with specified dimensions and color
    """
    return PIL.Image.new("RGB", (width, height), color=color)
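
A minimal usage sketch (assuming fastvideo is importable; the sizes and color below are illustrative):

from fastvideo.models.vision_utils import create_default_image

# Default: a 512x512 black RGB image.
img = create_default_image()
print(img.size, img.mode)  # (512, 512) RGB

# A 256x128 solid-red placeholder.
red = create_default_image(width=256, height=128, color=(255, 0, 0))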

fastvideo.models.vision_utils.get_default_height_width

get_default_height_width(image: Image | ndarray | Tensor, vae_scale_factor: int, height: int | None = None, width: int | None = None) -> tuple[int, int]

Returns the height and width of the image, rounded down to the nearest integer multiple of vae_scale_factor.

Parameters:

Name Type Description Default
image `Union[PIL.Image.Image, np.ndarray, torch.Tensor]`

The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it should have shape [batch, height, width] or [batch, height, width, channels]. If it is a PyTorch tensor, it should have shape [batch, channels, height, width].

required
vae_scale_factor int

The VAE scale factor; the returned height and width are rounded down to a multiple of this value.

required
height `Optional[int]`, *optional*, defaults to `None`

The height of the preprocessed image. If None, the height of the image input will be used.

None
width `Optional[int]`, *optional*, defaults to `None`

The width of the preprocessed image. If None, the width of the image input will be used.

None

Returns:

Type Description
tuple[int, int]

Tuple[int, int]: A tuple containing the height and width, both rounded down to the nearest integer multiple of vae_scale_factor.

Source code in fastvideo/models/vision_utils.py
def get_default_height_width(
    image: PIL.Image.Image | np.ndarray | torch.Tensor,
    vae_scale_factor: int,
    height: int | None = None,
    width: int | None = None,
) -> tuple[int, int]:
    r"""
    Returns the height and width of the image, rounded down to the nearest integer multiple of `vae_scale_factor`.

    Args:
        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
            The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
            should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
            tensor, it should have shape `[batch, channels, height, width]`.
        vae_scale_factor (`int`):
            The VAE scale factor; the returned height and width are rounded down to a multiple of this value.
        height (`Optional[int]`, *optional*, defaults to `None`):
            The height of the preprocessed image. If `None`, the height of the `image` input will be used.
        width (`Optional[int]`, *optional*, defaults to `None`):
            The width of the preprocessed image. If `None`, the width of the `image` input will be used.

    Returns:
        `Tuple[int, int]`:
            A tuple containing the height and width, both rounded down to the nearest integer multiple of
            `vae_scale_factor`.
    """

    if height is None:
        if isinstance(image, PIL.Image.Image):
            height = image.height
        elif isinstance(image, torch.Tensor):
            height = image.shape[2]
        else:
            height = image.shape[1]

    if width is None:
        if isinstance(image, PIL.Image.Image):
            width = image.width
        elif isinstance(image, torch.Tensor):
            width = image.shape[3]
        else:
            width = image.shape[2]

    width, height = (x - x % vae_scale_factor for x in (width, height)
                     )  # resize to integer multiple of vae_scale_factor

    return height, width
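
A quick sketch of the rounding behavior (the tensor shapes here are illustrative):

import torch
from fastvideo.models.vision_utils import get_default_height_width

# A [batch, channels, height, width] tensor whose spatial dimensions
# are not multiples of the scale factor.
frames = torch.zeros(1, 3, 517, 771)

# Both dimensions are rounded down to the nearest multiple of 8.
print(get_default_height_width(frames, vae_scale_factor=8))  # (512, 768)

# Explicit height/width overrides are snapped to the same grid.
print(get_default_height_width(frames, 8, height=300, width=300))  # (296, 296)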

fastvideo.models.vision_utils.load_image

load_image(image: str | Image, convert_method: Callable[[Image], Image] | None = None) -> Image

Loads image to a PIL Image.

Parameters:

Name Type Description Default
image `str` or `PIL.Image.Image`

The image to convert to the PIL Image format.

required
convert_method Callable[[PIL.Image.Image], PIL.Image.Image], *optional*

A conversion method to apply to the image after loading it. When set to None, the image will be converted to "RGB".

None

Returns:

Type Description
Image

PIL.Image.Image: A PIL Image.

Source code in fastvideo/models/vision_utils.py
def load_image(
    image: str | PIL.Image.Image,
    convert_method: Callable[[PIL.Image.Image], PIL.Image.Image] | None = None
) -> PIL.Image.Image:
    """
    Loads `image` to a PIL Image.

    Args:
        image (`str` or `PIL.Image.Image`):
            The image to convert to the PIL Image format.
        convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], *optional*):
            A conversion method to apply to the image after loading it. When set to `None`, the image will be
            converted to "RGB".

    Returns:
        `PIL.Image.Image`:
            A PIL Image.
    """
    if isinstance(image, str):
        if image.startswith("http://") or image.startswith("https://"):
            image = PIL.Image.open(requests.get(image, stream=True).raw)
        elif os.path.isfile(image):
            image = PIL.Image.open(image)
        else:
            raise ValueError(
                f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {image} is not a valid path."
            )
    elif isinstance(image, PIL.Image.Image):
        image = image
    else:
        raise ValueError(
            "Incorrect format used for the image. Should be a URL linking to an image, a local path, or a PIL image."
        )

    image = PIL.ImageOps.exif_transpose(image)

    if convert_method is not None:
        image = convert_method(image)
    else:
        image = image.convert("RGB")

    return image
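
A minimal usage sketch; the file name and URL are placeholders:

from fastvideo.models.vision_utils import load_image

# Local file: EXIF orientation is applied, then the image is converted to RGB.
img = load_image("photo.jpg")

# URL input, with a custom conversion applied after loading
# (grayscale here is just an illustrative choice).
gray = load_image(
    "https://example.com/photo.png",
    convert_method=lambda im: im.convert("L"),
)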

fastvideo.models.vision_utils.load_video

load_video(video: str, convert_method: Callable[[list[Image]], list[Image]] | None = None, return_fps: bool = False) -> tuple[list[Image], float | Any] | list[Image]

Loads video to a list of PIL Image.

Parameters:

Name Type Description Default
video `str`

A URL or Path to a video to convert to a list of PIL Image format.

required
convert_method Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]], *optional*

A conversion method to apply to the video after loading it. When set to None, the images will be converted to "RGB".

None
return_fps `bool`, *optional*, defaults to `False`

Whether to return the FPS of the video. If True, returns a tuple of (images, fps). If False, returns only the list of images.

False

Returns:

Type Description
tuple[list[Image], float | Any] | list[Image]

List[PIL.Image.Image] or Tuple[List[PIL.Image.Image], float | None]: The video as a list of PIL images. If return_fps is True, also returns the original FPS.

Source code in fastvideo/models/vision_utils.py
def load_video(
    video: str,
    convert_method: Callable[[list[PIL.Image.Image]], list[PIL.Image.Image]]
    | None = None,
    return_fps: bool = False,
) -> tuple[list[PIL.Image.Image], float | Any] | list[PIL.Image.Image]:
    """
    Loads `video` to a list of PIL Image.
    Args:
        video (`str`):
            A URL or Path to a video to convert to a list of PIL Image format.
        convert_method (Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]], *optional*):
            A conversion method to apply to the video after loading it. When set to `None` the images will be converted
            to "RGB".
        return_fps (`bool`, *optional*, defaults to `False`):
            Whether to return the FPS of the video. If `True`, returns a tuple of (images, fps).
            If `False`, returns only the list of images.
    Returns:
        `List[PIL.Image.Image]` or `Tuple[List[PIL.Image.Image], float | None]`:
            The video as a list of PIL images. If `return_fps` is True, also returns the original FPS.
    """
    is_url = video.startswith("http://") or video.startswith("https://")
    is_file = os.path.isfile(video)
    was_tempfile_created = False

    if not (is_url or is_file):
        raise ValueError(
            f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {video} is not a valid path."
        )

    if is_url:
        response = requests.get(video, stream=True)
        if response.status_code != 200:
            raise ValueError(
                f"Failed to download video. Status code: {response.status_code}"
            )

        parsed_url = urlparse(video)
        file_name = os.path.basename(unquote(parsed_url.path))

        suffix = os.path.splitext(file_name)[1] or ".mp4"
        with tempfile.NamedTemporaryFile(suffix=suffix,
                                         delete=False) as temp_file:
            video_path = temp_file.name
            video_data = response.iter_content(chunk_size=8192)
            for chunk in video_data:
                temp_file.write(chunk)
        was_tempfile_created = True
    else:
        video_path = video

    pil_images = []
    original_fps = None

    try:
        if video_path.endswith(".gif"):
            pil_images, original_fps = _load_gif(video_path)
        else:
            pil_images, original_fps = _load_video_with_ffmpeg(video_path)
    finally:
        # Clean up temporary file if it was created
        if was_tempfile_created and os.path.exists(video_path):
            os.remove(video_path)

    if convert_method is not None:
        pil_images = convert_method(pil_images)

    return (pil_images, original_fps) if return_fps else pil_images
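
A minimal usage sketch; the paths are placeholders, and non-GIF input is decoded via ffmpeg:

from fastvideo.models.vision_utils import load_video

# Local file: returns only the list of PIL frames.
frames = load_video("clip.mp4")

# URL input is downloaded to a temporary file first; with return_fps=True
# a (frames, fps) tuple is returned instead.
frames, fps = load_video("https://example.com/clip.mp4", return_fps=True)
print(len(frames), fps)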

fastvideo.models.vision_utils.normalize

normalize(images: ndarray | Tensor) -> ndarray | Tensor

Normalize an image array from [0, 1] to [-1, 1].

Parameters:

Name Type Description Default
images `np.ndarray` or `torch.Tensor`

The image array to normalize.

required

Returns:

Type Description
ndarray | Tensor

np.ndarray or torch.Tensor: The normalized image array.

Source code in fastvideo/models/vision_utils.py
def normalize(images: np.ndarray | torch.Tensor) -> np.ndarray | torch.Tensor:
    r"""
    Normalize an image array from [0, 1] to [-1, 1].

    Args:
        images (`np.ndarray` or `torch.Tensor`):
            The image array to normalize.

    Returns:
        `np.ndarray` or `torch.Tensor`:
            The normalized image array.
    """
    return 2.0 * images - 1.0
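
The mapping is the affine transform x -> 2x - 1, which takes [0, 1] onto [-1, 1]. A quick numeric check:

import numpy as np
from fastvideo.models.vision_utils import normalize

x = np.array([0.0, 0.25, 0.5, 1.0], dtype=np.float32)
print(normalize(x))  # [-1.  -0.5  0.   1. ]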

fastvideo.models.vision_utils.numpy_to_pt

numpy_to_pt(images: ndarray) -> Tensor

Convert a NumPy image to a PyTorch tensor.

Parameters:

Name Type Description Default
images `np.ndarray`

The NumPy image array to convert to PyTorch format.

required

Returns:

Type Description
Tensor

torch.Tensor: A PyTorch tensor representation of the images.

Source code in fastvideo/models/vision_utils.py
def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
    r"""
    Convert a NumPy image to a PyTorch tensor.

    Args:
        images (`np.ndarray`):
            The NumPy image array to convert to PyTorch format.

    Returns:
        `torch.Tensor`:
            A PyTorch tensor representation of the images.
    """
    if images.ndim == 3:
        images = images[..., None]

    images = torch.from_numpy(images.transpose(0, 3, 1, 2))
    return images
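
A sketch of the shape handling (the array contents are illustrative):

import numpy as np
from fastvideo.models.vision_utils import numpy_to_pt

# Channels-last NumPy batch becomes a channels-first PyTorch tensor.
batch = np.zeros((2, 64, 64, 3), dtype=np.float32)
print(numpy_to_pt(batch).shape)  # torch.Size([2, 3, 64, 64])

# A 3-D [batch, height, width] array gains a trailing channel axis first.
gray = np.zeros((2, 64, 64), dtype=np.float32)
print(numpy_to_pt(gray).shape)  # torch.Size([2, 1, 64, 64])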

fastvideo.models.vision_utils.pil_to_numpy

pil_to_numpy(images: list[Image] | Image) -> ndarray

Convert a PIL image or a list of PIL images to NumPy arrays.

Parameters:

Name Type Description Default
images `PIL.Image.Image` or `List[PIL.Image.Image]`

The PIL image or list of images to convert to NumPy format.

required

Returns:

Type Description
ndarray

np.ndarray: A NumPy array representation of the images.

Source code in fastvideo/models/vision_utils.py
def pil_to_numpy(images: list[PIL.Image.Image] | PIL.Image.Image) -> np.ndarray:
    r"""
    Convert a PIL image or a list of PIL images to NumPy arrays.

    Args:
        images (`PIL.Image.Image` or `List[PIL.Image.Image]`):
            The PIL image or list of images to convert to NumPy format.

    Returns:
        `np.ndarray`:
            A NumPy array representation of the images.
    """
    if not isinstance(images, list):
        images = [images]
    images = [np.array(image).astype(np.float32) / 255.0 for image in images]
    images_arr: np.ndarray = np.stack(images, axis=0)

    return images_arr
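
A minimal sketch of the resulting shape and dtype:

import PIL.Image
from fastvideo.models.vision_utils import pil_to_numpy

# A single image is wrapped in a list, scaled to [0, 1] float32, and
# stacked into a [batch, height, width, channels] array.
img = PIL.Image.new("RGB", (64, 32))  # width 64, height 32
arr = pil_to_numpy(img)
print(arr.shape, arr.dtype)  # (1, 32, 64, 3) float32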

fastvideo.models.vision_utils.preprocess_reference_image_for_clip

preprocess_reference_image_for_clip(image: Image, device: device) -> Image

Preprocess reference image to match CLIP encoder requirements.

Applies normalization, resizing to 224x224, and denormalization to ensure the image is in the correct format for CLIP processing.

Parameters:

Name Type Description Default
image Image

Input PIL image

required
device device

Target device for tensor operations

required

Returns:

Type Description
Image

Preprocessed PIL image ready for CLIP encoder

Source code in fastvideo/models/vision_utils.py
def preprocess_reference_image_for_clip(image: PIL.Image.Image, device: torch.device) -> PIL.Image.Image:
    """
    Preprocess reference image to match CLIP encoder requirements.

    Applies normalization, resizing to 224x224, and denormalization to ensure
    the image is in the correct format for CLIP processing.

    Args:
        image: Input PIL image
        device: Target device for tensor operations

    Returns:
        Preprocessed PIL image ready for CLIP encoder
    """
    # Convert PIL to tensor and normalize to [-1, 1] range
    image_tensor = TF.to_tensor(image).sub_(0.5).div_(0.5).to(device)

    # Resize to CLIP's expected input size (224x224) using bicubic interpolation
    resized_tensor = F.interpolate(
        image_tensor.unsqueeze(0),
        size=(224, 224),
        mode='bicubic',
        align_corners=False
    ).squeeze(0)

    # Denormalize back to [0, 1] range
    denormalized_tensor = resized_tensor.mul_(0.5).add_(0.5)

    return TF.to_pil_image(denormalized_tensor)
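
A minimal usage sketch (the input resolution and color are illustrative):

import PIL.Image
import torch
from fastvideo.models.vision_utils import preprocess_reference_image_for_clip

# Any input resolution comes back as a 224x224 PIL image, resampled
# with bicubic interpolation on the chosen device.
ref = PIL.Image.new("RGB", (640, 480), color=(128, 64, 32))
out = preprocess_reference_image_for_clip(ref, torch.device("cpu"))
print(out.size)  # (224, 224)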

fastvideo.models.vision_utils.resize

resize(image: Image | ndarray | Tensor, height: int, width: int, resize_mode: str = 'default', resample: str = 'lanczos') -> Image | ndarray | Tensor

Resize image.

Parameters:

Name Type Description Default
image `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`

The image input, can be a PIL image, numpy array or pytorch tensor.

required
height `int`

The height to resize to.

required
width `int`

The width to resize to.

required
resize_mode `str`, *optional*, defaults to `default`

The resize mode to use; can be one of default, fill, or crop. If default, will resize the image to fit within the specified width and height, and it may not maintain the original aspect ratio. If fill, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling the empty space with data from the image. If crop, will resize the image to fill the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess. Note that resize modes fill and crop are only supported for PIL image input.

'default'
resample `str`, *optional*, defaults to `lanczos`

The resampling filter to use when image is a PIL image; must be a key of PIL_INTERPOLATION.

'lanczos'

Returns:

Type Description
Image | ndarray | Tensor

PIL.Image.Image, np.ndarray or torch.Tensor: The resized image.

Source code in fastvideo/models/vision_utils.py
def resize(
    image: PIL.Image.Image | np.ndarray | torch.Tensor,
    height: int,
    width: int,
    resize_mode: str = "default",  # "default", "fill", "crop"
    resample: str = "lanczos",
) -> PIL.Image.Image | np.ndarray | torch.Tensor:
    """
    Resize image.

    Args:
        image (`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`):
            The image input, can be a PIL image, numpy array or pytorch tensor.
        height (`int`):
            The height to resize to.
        width (`int`):
            The width to resize to.
        resize_mode (`str`, *optional*, defaults to `default`):
            The resize mode to use; can be one of `default`, `fill`, or `crop`. If `default`, will resize the image
            to fit within the specified width and height, and it may not maintain the original aspect ratio. If
            `fill`, will resize the image to fit within the specified width and height, maintaining the aspect
            ratio, and then center the image within the dimensions, filling the empty space with data from the
            image. If `crop`, will resize the image to fill the specified width and height, maintaining the aspect
            ratio, and then center the image within the dimensions, cropping the excess. Note that resize modes
            `fill` and `crop` are only supported for PIL image input.
        resample (`str`, *optional*, defaults to `"lanczos"`):
            The resampling filter to use when `image` is a PIL image; must be a key of `PIL_INTERPOLATION`.

    Returns:
        `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`:
            The resized image.
    """
    if resize_mode != "default" and not isinstance(image, PIL.Image.Image):
        raise ValueError(
            f"Only PIL image input is supported for resize_mode {resize_mode}")
    assert isinstance(image, PIL.Image.Image)
    if resize_mode == "default":
        image = image.resize((width, height),
                             resample=PIL_INTERPOLATION[resample])
    else:
        raise ValueError(f"resize_mode {resize_mode} is not supported")
    return image
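
A minimal usage sketch. Note that, as implemented above, only PIL input with resize_mode="default" reaches a successful return; the fill and crop branches currently raise ValueError:

import PIL.Image
from fastvideo.models.vision_utils import resize

img = PIL.Image.new("RGB", (640, 480))
out = resize(img, height=256, width=384)  # default mode, lanczos resampling
print(out.size)  # PIL reports (width, height): (384, 256)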