latent_preparation ¶

Latent preparation stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.latent_preparation.Cosmos25AutoLatentPreparationStage ¶

Cosmos25AutoLatentPreparationStage(scheduler, transformer, vae=None)

Bases: PipelineStage

Route Cosmos 2.5 latent prep to T2W vs V2W/I2W.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self, scheduler, transformer, vae=None) -> None:
    super().__init__()
    self._t2w = Cosmos25T2WLatentPreparationStage(
        scheduler=scheduler,
        transformer=transformer,
    )
    self._v2w = Cosmos25V2WLatentPreparationStage(
        scheduler=scheduler,
        transformer=transformer,
        vae=vae,
    )

fastvideo.pipelines.stages.latent_preparation.Cosmos25LatentPreparationStage ¶

Cosmos25LatentPreparationStage(scheduler, transformer, vae=None)

Bases: CosmosLatentPreparationStage

Latent preparation for Cosmos 2.5 DiT input conventions.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self, scheduler, transformer, vae=None) -> None:
    super().__init__()
    self.scheduler = scheduler
    self.transformer = transformer
    self.vae = vae

Functions¶

fastvideo.pipelines.stages.latent_preparation.Cosmos25LatentPreparationStage.adjust_video_length ¶

adjust_video_length(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> int

Adjust video length based on VAE version.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`int`	The batch with adjusted video length.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def adjust_video_length(self, batch: ForwardBatch,
                        fastvideo_args: FastVideoArgs) -> int:
    """
    Adjust video length based on VAE version.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with adjusted video length.
    """

    video_length = batch.num_frames
    use_temporal_scaling_frames = fastvideo_args.pipeline_config.vae_config.use_temporal_scaling_frames
    if use_temporal_scaling_frames:
        temporal_scale_factor = fastvideo_args.pipeline_config.vae_config.arch_config.temporal_compression_ratio
        latent_num_frames = (video_length - 1) // temporal_scale_factor + 1
    else:  # stepvideo only
        latent_num_frames = video_length // 17 * 3
    return int(latent_num_frames)

fastvideo.pipelines.stages.latent_preparation.Cosmos25LatentPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify Cosmos latent preparation stage inputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify Cosmos latent preparation stage inputs."""
    result = VerificationResult()
    result.add_check(
        "prompt_or_embeds", None, lambda _: V.string_or_list_strings(
            batch.prompt) or V.list_not_empty(batch.prompt_embeds))
    result.add_check("prompt_embeds", batch.prompt_embeds,
                     V.list_of_tensors)
    result.add_check("num_videos_per_prompt", batch.num_videos_per_prompt,
                     V.positive_int)
    result.add_check("generator", batch.generator,
                     V.generator_or_list_generators)
    result.add_check("num_frames", batch.num_frames, V.positive_int)
    result.add_check("height", batch.height, V.positive_int)
    result.add_check("width", batch.width, V.positive_int)
    result.add_check("latents", batch.latents, V.none_or_tensor)
    return result

fastvideo.pipelines.stages.latent_preparation.Cosmos25LatentPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage outputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify latent preparation stage outputs."""
    result = VerificationResult()
    result.add_check("latents", batch.latents,
                     [V.is_tensor, V.with_dims(5)])
    result.add_check("raw_latent_shape", batch.raw_latent_shape, V.is_tuple)
    return result

fastvideo.pipelines.stages.latent_preparation.Cosmos25T2WLatentPreparationStage ¶

Cosmos25T2WLatentPreparationStage(scheduler, transformer)

Bases: PipelineStage

Cosmos 2.5 Text2World latent preparation.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self, scheduler, transformer) -> None:
    super().__init__()
    self.scheduler = scheduler
    self.transformer = transformer

fastvideo.pipelines.stages.latent_preparation.Cosmos25V2WLatentPreparationStage ¶

Cosmos25V2WLatentPreparationStage(scheduler, transformer, vae=None)

Bases: Cosmos25LatentPreparationStage

Cosmos 2.5 V2W/I2W latent preparation stage (conditioning-aware).

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self, scheduler, transformer, vae=None) -> None:
    super().__init__()
    self.scheduler = scheduler
    self.transformer = transformer
    self.vae = vae

fastvideo.pipelines.stages.latent_preparation.CosmosLatentPreparationStage ¶

CosmosLatentPreparationStage(scheduler, transformer, vae=None)

Bases: PipelineStage

Cosmos-specific latent preparation stage that properly handles the tensor shapes and conditioning masks required by the Cosmos transformer.

This stage replicates the logic from diffusers' Cosmos2VideoToWorldPipeline.prepare_latents()

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self, scheduler, transformer, vae=None) -> None:
    super().__init__()
    self.scheduler = scheduler
    self.transformer = transformer
    self.vae = vae

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage ¶

LatentPreparationStage(scheduler, transformer, use_btchw_layout: bool = False)

Bases: PipelineStage

Stage for preparing initial latent variables for the diffusion process.

This stage handles the preparation of the initial latent variables that will be denoised during the diffusion process.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self,
             scheduler,
             transformer,
             use_btchw_layout: bool = False) -> None:
    super().__init__()
    self.scheduler = scheduler
    self.transformer = transformer
    self.use_btchw_layout = use_btchw_layout

Functions¶

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.adjust_video_length ¶

adjust_video_length(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> int

Adjust video length based on VAE version.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`int`	The batch with adjusted video length.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def adjust_video_length(self, batch: ForwardBatch,
                        fastvideo_args: FastVideoArgs) -> int:
    """
    Adjust video length based on VAE version.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with adjusted video length.
    """

    video_length = batch.num_frames
    use_temporal_scaling_frames = fastvideo_args.pipeline_config.vae_config.use_temporal_scaling_frames
    if use_temporal_scaling_frames:
        temporal_scale_factor = fastvideo_args.pipeline_config.vae_config.arch_config.temporal_compression_ratio
        latent_num_frames = (video_length - 1) // temporal_scale_factor + 1
    else:  # stepvideo only
        latent_num_frames = video_length // 17 * 3
    return int(latent_num_frames)

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Prepare initial latent variables for the diffusion process.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with prepared latent variables.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Prepare initial latent variables for the diffusion process.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with prepared latent variables.
    """

    latent_num_frames = None
    # Adjust video length based on VAE version if needed
    if hasattr(self, 'adjust_video_length'):
        latent_num_frames = self.adjust_video_length(batch, fastvideo_args)
    # Determine batch size; fall back to action/image inputs when no text encoder is present
    if not batch.prompt_embeds:
        if batch.keyboard_cond is not None:
            batch_size = batch.keyboard_cond.shape[0]
        elif batch.mouse_cond is not None:
            batch_size = batch.mouse_cond.shape[0]
        elif batch.image_embeds:
            batch_size = batch.image_embeds[0].shape[0]
        else:
            batch_size = 1
    elif isinstance(batch.prompt, list):
        batch_size = len(batch.prompt)
    elif batch.prompt is not None:
        batch_size = 1
    else:
        batch_size = batch.prompt_embeds[0].shape[0]

    # Adjust batch size for number of videos per prompt
    batch_size *= batch.num_videos_per_prompt

    # Get required parameters
    if not batch.prompt_embeds:
        # Create a dummy zero-length text embedding to satisfy downstream checks.
        # Matrix-Game models have text_dim=0 and ignore encoder_hidden_states.
        transformer_dtype = next(self.transformer.parameters()).dtype
        device = get_local_torch_device()
        dummy_prompt = torch.zeros(batch_size,
                                   0,
                                   self.transformer.hidden_size,
                                   device=device,
                                   dtype=transformer_dtype)
        batch.prompt_embeds = [dummy_prompt]
        batch.negative_prompt_embeds = []
        batch.do_classifier_free_guidance = False
    dtype = batch.prompt_embeds[0].dtype
    device = get_local_torch_device()
    generator = batch.generator
    latents = batch.latents
    num_frames = latent_num_frames if latent_num_frames is not None else batch.num_frames
    height = batch.height
    width = batch.width

    # TODO(will): remove this once we add input/output validation for stages
    if height is None or width is None:
        raise ValueError("Height and width must be provided")

    # Calculate latent shape
    bcthw_shape: tuple[int, ...] | None = None
    if self.use_btchw_layout:
        shape = (
            batch_size,
            num_frames,
            self.transformer.num_channels_latents,
            height // fastvideo_args.pipeline_config.vae_config.arch_config.
            spatial_compression_ratio,
            width // fastvideo_args.pipeline_config.vae_config.arch_config.
            spatial_compression_ratio,
        )
        bcthw_shape = tuple(shape[i] for i in [0, 2, 1, 3, 4])
    else:
        shape = (
            batch_size,
            self.transformer.num_channels_latents,
            num_frames,
            height // fastvideo_args.pipeline_config.vae_config.arch_config.
            spatial_compression_ratio,
            width // fastvideo_args.pipeline_config.vae_config.arch_config.
            spatial_compression_ratio,
        )
        bcthw_shape = shape

    # Validate generator if it's a list
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )
    # Generate or use provided latents
    if latents is None:
        latents = randn_tensor(
            shape,
            generator=generator,
            device=device,
            dtype=dtype,
        )
        if hasattr(self.scheduler, "init_noise_sigma"):
            latents = latents * self.scheduler.init_noise_sigma
    else:
        # Pre-initialized latents:
        # - For LongCat refine (refine_from or stage1_video present), we should not re-scale by init_noise_sigma.
        # - For other models, keep the original behavior.
        latents = latents.to(device)
        is_longcat_refine = (batch.refine_from
                             is not None) or (batch.stage1_video
                                              is not None)
        if (not is_longcat_refine) and hasattr(self.scheduler,
                                               "init_noise_sigma"):
            latents = latents * self.scheduler.init_noise_sigma

    # Update batch with prepared latents
    batch.latents = latents
    batch.raw_latent_shape = bcthw_shape

    return batch

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage inputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify latent preparation stage inputs."""
    result = VerificationResult()
    result.add_check(
        "prompt_or_embeds", None,
        lambda _: V.string_or_list_strings(batch.prompt) or not batch.
        prompt_embeds or V.list_not_empty(batch.prompt_embeds))
    if batch.prompt_embeds:
        result.add_check("prompt_embeds", batch.prompt_embeds,
                         V.list_of_tensors)
    result.add_check("num_videos_per_prompt", batch.num_videos_per_prompt,
                     V.positive_int)
    result.add_check("generator", batch.generator,
                     V.generator_or_list_generators)
    result.add_check("num_frames", batch.num_frames, V.positive_int)
    result.add_check("height", batch.height, V.positive_int)
    result.add_check("width", batch.width, V.positive_int)
    result.add_check("latents", batch.latents, V.none_or_tensor)
    return result

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage outputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify latent preparation stage outputs."""
    result = VerificationResult()
    result.add_check("latents", batch.latents,
                     [V.is_tensor, V.with_dims(5)])
    result.add_check("raw_latent_shape", batch.raw_latent_shape, V.is_tuple)
    return result