pipelines ¶

Diffusion pipelines for fastvideo.

This package contains diffusion pipelines for generating videos and images.

Classes¶

fastvideo.pipelines.ComposedPipelineBase ¶

ComposedPipelineBase(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: ABC

Base class for pipelines composed of multiple stages.

This class provides the framework for creating pipelines by composing multiple stages together. Each stage is responsible for a specific part of the diffusion process, and the pipeline orchestrates the execution of these stages.

Initialize the pipeline. After `__init__`, the pipeline should be ready to use. The pipeline should be stateless and not hold any batch state.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Set up the pipeline so that it is ready to use once construction ends.

    The pipeline is meant to be stateless: it must not hold any batch state.

    Args:
        model_path: Stored on the instance as ``self.model_path``.
        fastvideo_args: Stored on the instance; its ``tp_size`` and
            ``sp_size`` are handed to the distributed initialization call.
        required_config_modules: When given, replaces the
            ``_required_config_modules`` value for this instance.
        loaded_modules: Passed through unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``_required_config_modules`` is still None
            after the optional override.
    """
    self.model_path: str = model_path
    self.fastvideo_args = fastvideo_args
    self._stage_name_mapping: dict[str, PipelineStage] = {}
    self._stages: list[PipelineStage] = []

    # An explicit argument takes precedence over the existing value.
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    # Must run before get_world_group() is queried below.
    maybe_init_distributed_environment_and_model_parallel(
        fastvideo_args.tp_size, fastvideo_args.sp_size)

    # Torch profiler, configured through the environment:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    controller = get_or_create_profiler(envs.FASTVIDEO_TORCH_PROFILER_DIR)
    self.profiler_controller = controller
    self.profiler = controller.profiler

    self.local_rank = get_world_group().local_rank

    # Modules are loaded eagerly, inside a named profiler region.
    logger.info("Loading pipeline modules...")
    with controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Attributes¶

fastvideo.pipelines.ComposedPipelineBase.required_config_modules `property` ¶

required_config_modules: list[str]

List of modules that are required by the pipeline. The names should match the diffusers directory and model_index.json file. These modules will be loaded using the PipelineComponentLoader and made available in the modules dictionary. Access these modules using the get_module method.

Example: `class ConcretePipeline(ComposedPipelineBase): _required_config_modules = ["vae", "text_encoder", "transformer", "scheduler", "tokenizer"]`

@property
def required_config_modules(self):
    """Return the ``_required_config_modules`` list held by this pipeline."""
    return self._required_config_modules

fastvideo.pipelines.ComposedPipelineBase.stages `property` ¶

stages: list[PipelineStage]

List of stages in the pipeline.

Functions¶

fastvideo.pipelines.ComposedPipelineBase.create_pipeline_stages `abstractmethod` ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Create the inference pipeline stages.

Source code in fastvideo/pipelines/composed_pipeline_base.py

@abstractmethod
def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """
    Build the stages used for inference.

    Declared abstract, so concrete pipelines must provide their own
    implementation; this base body only raises ``NotImplementedError``.
    """
    raise NotImplementedError

fastvideo.pipelines.ComposedPipelineBase.create_training_stages ¶

create_training_stages(training_args: TrainingArgs)

Create the training pipeline stages.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def create_training_stages(self, training_args: TrainingArgs):
    """
    Build the stages used for training.

    The base implementation always raises ``NotImplementedError``.
    """
    raise NotImplementedError

fastvideo.pipelines.ComposedPipelineBase.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Generate a video or image using the pipeline.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The batch to generate from.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns: ForwardBatch: The batch with the generated video or image.

Source code in fastvideo/pipelines/composed_pipeline_base.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run every pipeline stage in order to generate a video or image.

    Args:
        batch: The batch to generate from.
        fastvideo_args: The inference arguments, passed to each stage.
    Returns:
        ForwardBatch: The batch returned by the last stage.
    """
    # Finish initialization first if post_init() has not run yet.
    if not self.post_init_called:
        self.post_init()

    stage_names = self._stage_name_mapping.keys()
    logger.info("Running pipeline stages: %s", stage_names)

    # Each stage consumes the batch produced by the one before it.
    for pipeline_stage in self.stages:
        batch = pipeline_stage(batch, fastvideo_args)
    return batch

fastvideo.pipelines.ComposedPipelineBase.from_pretrained `classmethod` ¶

from_pretrained(model_path: str, device: str | None = None, torch_dtype: dtype | None = None, pipeline_config: str | PipelineConfig | None = None, args: Namespace | None = None, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None, **kwargs) -> ComposedPipelineBase

Load a pipeline from a pretrained model. If `loaded_modules` (an optional `dict[str, torch.nn.Module]`, default `None`) is provided, those modules will be used instead of loading from config/pretrained weights.

Source code in fastvideo/pipelines/composed_pipeline_base.py

@classmethod
def from_pretrained(cls,
                    model_path: str,
                    device: str | None = None,
                    torch_dtype: torch.dtype | None = None,
                    pipeline_config: str | PipelineConfig | None = None,
                    args: argparse.Namespace | None = None,
                    required_config_modules: list[str] | None = None,
                    loaded_modules: dict[str, torch.nn.Module]
                    | None = None,
                    **kwargs) -> "ComposedPipelineBase":
    """
    Load a pipeline from a pretrained model.

    When ``args`` is None or ``args.inference_mode`` is truthy, the
    arguments object is built with ``FastVideoArgs.from_kwargs`` from
    ``kwargs`` plus ``model_path``. Otherwise it is built with
    ``TrainingArgs.from_cli_args(args)`` and every entry of ``kwargs`` is
    then set on it as an attribute.

    Args:
        model_path: Passed to the arguments object and to the constructor.
        device: Accepted but not read by this method.
        torch_dtype: Accepted but not read by this method.
        pipeline_config: Accepted but not read by this method.
        args: Parsed CLI namespace; selects inference vs. training handling.
        required_config_modules: Forwarded to the constructor.
        loaded_modules: Forwarded to the constructor. If provided,
            loaded_modules will be used instead of loading from
            config/pretrained weights.
        **kwargs: Extra argument values (see above for how they are used).

    Returns:
        The constructed pipeline, after ``post_init()`` has been called.
    """
    wants_training = args is not None and not args.inference_mode
    if wants_training:
        assert args is not None, "args must be provided for training mode"
        fastvideo_args = TrainingArgs.from_cli_args(args)
        # TODO(will): fix this so that its not so ugly
        fastvideo_args.model_path = model_path
        for attr_name, attr_value in kwargs.items():
            setattr(fastvideo_args, attr_name, attr_value)

        fastvideo_args.dit_cpu_offload = False
        # The precision is hijacked to act as the master weight type so the
        # model is loaded with the correct precision. FSDP2's
        # MixedPrecisionPolicy is used afterwards to set the precision for
        # the fwd, bwd, and other operations.
        assert fastvideo_args.pipeline_config.dit_precision == 'fp32', 'only fp32 is supported for training'
    else:
        kwargs['model_path'] = model_path
        fastvideo_args = FastVideoArgs.from_kwargs(**kwargs)

    logger.info("fastvideo_args in from_pretrained: %s", fastvideo_args)

    pipeline = cls(model_path,
                   fastvideo_args,
                   required_config_modules=required_config_modules,
                   loaded_modules=loaded_modules)
    pipeline.post_init()
    return pipeline

fastvideo.pipelines.ComposedPipelineBase.initialize_pipeline ¶

initialize_pipeline(fastvideo_args: FastVideoArgs)

Initialize the pipeline.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
    """
    Initialization hook for the pipeline.

    The base implementation does nothing and returns ``None``.
    """
    return None

fastvideo.pipelines.ComposedPipelineBase.load_modules ¶

load_modules(fastvideo_args: FastVideoArgs, loaded_modules: dict[str, Module] | None = None) -> dict[str, Any]

Load the modules from the config. If `loaded_modules` (an optional `dict[str, torch.nn.Module]`, default `None`) is provided, those modules will be used instead of loading from config/pretrained weights.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def load_modules(
    self,
    fastvideo_args: FastVideoArgs,
    loaded_modules: dict[str, torch.nn.Module] | None = None
) -> dict[str, Any]:
    """
    Load the pipeline's component modules as described by the model config.

    Args:
        fastvideo_args: Arguments forwarded to the component loader. When
            the config carries a non-null ``boundary_ratio`` it is also
            written to
            ``fastvideo_args.pipeline_config.dit_config.boundary_ratio``.
        loaded_modules: Optional pre-built modules keyed by module name. A
            module found here is used as-is instead of being loaded from
            config/pretrained weights.

    Returns:
        Mapping from module name to the loaded (or provided) module.

    Raises:
        ValueError: If a required module is only available through
            ``_extra_config_module_map`` but the mapped name is missing
            from the config, or if a required module is still missing
            after loading.
    """

    model_index = self._load_config(self.model_path)
    logger.info("Loading pipeline modules from config: %s", model_index)

    # remove keys that are not pipeline modules
    model_index.pop("_class_name")
    model_index.pop("_diffusers_version")
    model_index.pop("workload_type", None)
    if "boundary_ratio" in model_index and model_index[
            "boundary_ratio"] is not None:
        logger.info(
            "MoE pipeline detected. Adding transformer_2 to self.required_config_modules..."
        )
        # The list behind this property may be shared (e.g. declared on the
        # class), so only append when absent. Otherwise every call or
        # instance would add another duplicate, and a duplicate would
        # survive the single .remove() done for null-valued modules below.
        if "transformer_2" not in self.required_config_modules:
            self.required_config_modules.append("transformer_2")
        logger.info("MoE pipeline detected. Setting boundary ratio to %s",
                    model_index["boundary_ratio"])
        fastvideo_args.pipeline_config.dit_config.boundary_ratio = model_index[
            "boundary_ratio"]

    model_index.pop("boundary_ratio", None)
    # used by Wan2.2 ti2v
    model_index.pop("expand_timesteps", None)

    # some sanity checks
    # NOTE(review): the condition requires more than one remaining entry
    # while the message says "at least one" — confirm which is intended.
    assert len(
        model_index
    ) > 1, "model_index.json must contain at least one pipeline module"

    # Resolve required modules that are absent from the config but have an
    # alias in _extra_config_module_map: reuse the aliased entry.
    for module_name in self.required_config_modules:
        if module_name not in model_index and module_name in self._extra_config_module_map:
            extra_module_value = self._extra_config_module_map[module_name]
            logger.warning(
                "model_index.json does not contain a %s module, but found {%s: %s} in _extra_config_module_map, adding to model_index.",
                module_name, module_name, extra_module_value)
            if extra_module_value in model_index:
                logger.info("Using module %s for %s", extra_module_value,
                            module_name)
                model_index[module_name] = model_index[extra_module_value]
                continue
            else:
                raise ValueError(
                    f"Required module key: {module_name} value: {model_index.get(module_name)} was not found in loaded modules {model_index.keys()}"
                )

    # all the component models used by the pipeline
    # (same list object as the property, so removals below are visible here)
    required_modules = self.required_config_modules
    logger.info("Loading required modules: %s", required_modules)

    modules = {}
    for module_name, (transformers_or_diffusers,
                      architecture) in model_index.items():
        if transformers_or_diffusers is None:
            # A null entry means the module is not shipped with this model;
            # stop requiring it rather than failing the final check.
            logger.warning(
                "Module %s in model_index.json has null value, removing from required_config_modules",
                module_name)
            if module_name in self.required_config_modules:
                self.required_config_modules.remove(module_name)
            continue
        if module_name not in required_modules:
            logger.info("Skipping module %s", module_name)
            continue
        if loaded_modules is not None and module_name in loaded_modules:
            logger.info("Using module %s already provided", module_name)
            modules[module_name] = loaded_modules[module_name]
            continue

        # we load the module from the extra config module map if it exists
        if module_name in self._extra_config_module_map:
            load_module_name = self._extra_config_module_map[module_name]
        else:
            load_module_name = module_name

        component_model_path = os.path.join(self.model_path,
                                            load_module_name)
        module = PipelineComponentLoader.load_module(
            module_name=load_module_name,
            component_model_path=component_model_path,
            transformers_or_diffusers=transformers_or_diffusers,
            fastvideo_args=fastvideo_args,
        )
        logger.info("Loaded module %s from %s", module_name,
                    component_model_path)

        if module_name in modules:
            logger.warning("Overwriting module %s", module_name)
        modules[module_name] = module

    # Check if all required modules were loaded
    for module_name in required_modules:
        if module_name not in modules or modules[module_name] is None:
            raise ValueError(
                f"Required module key: {module_name} value: {modules.get(module_name)} was not found in loaded modules {modules.keys()}"
            )

    return modules

fastvideo.pipelines.ForwardBatch `dataclass` ¶

ForwardBatch(data_type: str, generator: Generator | list[Generator] | None = None, image_path: str | None = None, image_embeds: list[Tensor] = list(), pil_image: Tensor | Image | None = None, preprocessed_image: Tensor | None = None, prompt: str | list[str] | None = None, negative_prompt: str | list[str] | None = None, prompt_path: str | None = None, output_path: str = 'outputs/', output_video_name: str | None = None, video_path: str | None = None, video_latent: Tensor | None = None, refine_from: str | None = None, t_thresh: float = 0.5, spatial_refine_only: bool = False, num_cond_frames: int = 0, stage1_video: list[Image] | None = None, prompt_embeds: list[Tensor] = list(), negative_prompt_embeds: list[Tensor] | None = None, prompt_attention_mask: list[Tensor] | None = None, negative_attention_mask: list[Tensor] | None = None, clip_embedding_pos: list[Tensor] | None = None, clip_embedding_neg: list[Tensor] | None = None, max_sequence_length: int | None = None, prompt_template: dict[str, Any] | None = None, do_classifier_free_guidance: bool = False, batch_size: int | None = None, num_videos_per_prompt: int = 1, seed: int | None = None, seeds: list[int] | None = None, is_prompt_processed: bool = False, latents: Tensor | None = None, raw_latent_shape: tuple[int, ...] 
| None = None, noise_pred: Tensor | None = None, image_latent: Tensor | None = None, mouse_cond: Tensor | None = None, keyboard_cond: Tensor | None = None, grid_sizes: Tensor | None = None, height_latents: list[int] | int | None = None, width_latents: list[int] | int | None = None, num_frames: list[int] | int = 1, num_frames_round_down: bool = False, height: list[int] | int | None = None, width: list[int] | int | None = None, fps: list[int] | int | None = None, timesteps: Tensor | None = None, timestep: Tensor | float | int | None = None, step_index: int | None = None, boundary_ratio: float | None = None, num_inference_steps: int = 50, guidance_scale: float = 1.0, guidance_scale_2: float | None = None, guidance_rescale: float = 0.0, eta: float = 0.0, sigmas: list[float] | None = None, n_tokens: int | None = None, extra_step_kwargs: dict[str, Any] = dict(), modules: dict[str, Any] = dict(), output: Tensor | None = None, return_trajectory_latents: bool = False, return_trajectory_decoded: bool = False, trajectory_timesteps: list[Tensor] | None = None, trajectory_latents: Tensor | None = None, trajectory_decoded: list[Tensor] | None = None, extra: dict[str, Any] = dict(), save_video: bool = True, return_frames: bool = False, enable_teacache: bool = False, teacache_params: TeaCacheParams | WanTeaCacheParams | None = None, STA_param: list | None = None, is_cfg_negative: bool = False, mask_search_final_result_pos: list[list] | None = None, mask_search_final_result_neg: list[list] | None = None, VSA_sparsity: float = 0.0, logging_info: PipelineLoggingInfo = PipelineLoggingInfo())

Complete state passed through the pipeline execution.

This dataclass contains all information needed during the diffusion pipeline execution, allowing methods to update specific components without needing to manage numerous individual parameters.

Functions¶

fastvideo.pipelines.ForwardBatch.__post_init__ ¶

__post_init__()

Initialize dependent fields after dataclass initialization.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def __post_init__(self):
    """Fill in the fields whose values depend on other fields."""

    # A guidance scale above 1.0 switches classifier-free guidance on; the
    # flag is left untouched otherwise.
    if self.guidance_scale > 1.0:
        self.do_classifier_free_guidance = True

    # The second guidance scale falls back to the primary one when unset.
    if self.guidance_scale_2 is None:
        self.guidance_scale_2 = self.guidance_scale

    # Replace a missing negative-embedding list with an empty one.
    if self.negative_prompt_embeds is None:
        self.negative_prompt_embeds = []

fastvideo.pipelines.LoRAPipeline ¶

LoRAPipeline(*args, **kwargs)

Bases: ComposedPipelineBase

Pipeline that supports injecting LoRA adapters into the diffusion transformer. TODO: support training.

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """
    Construct the base pipeline, then prepare its transformers for LoRA.

    All arguments are forwarded to the parent constructor. Afterwards the
    loaded trainable transformers are collected, LoRA settings are copied
    from ``self.fastvideo_args``, and layers are converted either for LoRA
    training or, at inference time, when a ``lora_path`` is configured.

    Raises:
        ValueError: If an entry of ``trainable_transformer_names`` ends
            with ``"_2"``.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Register each trainable transformer that was loaded, together with
    # its "<name>_2" companion when present (Wan2.2 MoE or
    # fake_score_transformer_2).
    for base_name in self.trainable_transformer_names:
        primary_module = self.modules.get(base_name)
        if primary_module is not None:
            self.trainable_transformer_modules[base_name] = primary_module

        if base_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {base_name}"
            )

        companion_name = base_name + "_2"
        companion_module = self.modules.get(companion_name)
        if companion_module is not None:
            self.trainable_transformer_modules[
                companion_name] = companion_module

    logger.info("trainable_transformer_modules: %s",
                self.trainable_transformer_modules.keys())

    # Each transformer's arch config names the layers LoRA must skip.
    for name, module in self.trainable_transformer_modules.items():
        self.exclude_lora_layers[
            name] = module.config.arch_config.exclude_lora_layers

    fv_args = self.fastvideo_args
    self.lora_target_modules = fv_args.lora_target_modules
    self.lora_path = fv_args.lora_path
    self.lora_nickname = fv_args.lora_nickname
    self.training_mode = fv_args.training_mode

    if self.training_mode and getattr(fv_args, "lora_training", False):
        # LoRA training
        assert isinstance(fv_args, TrainingArgs)
        # alpha defaults to the rank when it was not given
        if fv_args.lora_alpha is None:
            fv_args.lora_alpha = fv_args.lora_rank
        self.lora_rank = fv_args.lora_rank  # type: ignore
        self.lora_alpha = fv_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
    elif not self.training_mode and self.lora_path is not None:
        # Inference with a configured adapter
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.LoRAPipeline.convert_to_lora_layers ¶

convert_to_lora_layers() -> None

Unified method to convert the transformer to a LoRA transformer.

Source code in fastvideo/pipelines/lora_pipeline.py

def convert_to_lora_layers(self) -> None:
    """
    Replace the targeted layers of each trainable transformer with LoRA layers.

    The conversion runs at most once: when ``lora_initialized`` is already
    set the method returns immediately. Converted layers are recorded in
    ``self.lora_layers[transformer_name][layer_name]``.
    """
    if self.lora_initialized:
        return
    self.lora_initialized = True

    for transformer_name, transformer_module in self.trainable_transformer_modules.items(
    ):
        num_converted = 0
        self.lora_layers.setdefault(transformer_name, {})
        logger.info("Converting %s to LoRA Transformer", transformer_name)

        for name, module in transformer_module.named_modules():
            if not self.is_target_layer(name):
                continue

            # Skip layers whose name contains any excluded pattern.
            if any(pattern in name
                   for pattern in self.exclude_lora_layers[transformer_name]):
                continue

            lora_layer = get_lora_layer(module,
                                        lora_rank=self.lora_rank,
                                        lora_alpha=self.lora_alpha,
                                        training_mode=self.training_mode)
            # None means no LoRA layer was produced for this module.
            if lora_layer is None:
                continue

            self.lora_layers[transformer_name][name] = lora_layer
            replace_submodule(transformer_module, name, lora_layer)
            num_converted += 1

        logger.info("Converted %d layers to LoRA layers", num_converted)

fastvideo.pipelines.LoRAPipeline.set_lora_adapter ¶

set_lora_adapter(lora_nickname: str, lora_path: str | None = None)

Load a LoRA adapter into the pipeline and merge it into the transformer.

Args:

- `lora_nickname`: The "nick name" of the adapter when referenced in the pipeline.
- `lora_path`: The path to the adapter, either a local path or a Hugging Face repo id.

Source code in fastvideo/pipelines/lora_pipeline.py

def set_lora_adapter(self,
                     lora_nickname: str,
                     lora_path: str | None = None):  # type: ignore
    """
    Load a LoRA adapter into the pipeline and merge it into the transformer.

    When ``lora_path`` is given and differs from ``self.cur_adapter_path``
    the adapter file is read and its tensors are stored under
    ``self.lora_adapters[lora_nickname]``. The stored weights are then
    handed to every converted LoRA layer; layers without matching weights
    get ``disable_lora = True``.

    Args:
        lora_nickname: The "nick name" of the adapter when referenced in the pipeline.
        lora_path: The path to the adapter, either a local path or a Hugging Face repo id.

    Raises:
        ValueError: If ``lora_nickname`` is unknown and no ``lora_path`` is
            given, or if the adapter file yields the same target name twice.
    """

    if lora_nickname not in self.lora_adapters and lora_path is None:
        raise ValueError(
            f"Adapter {lora_nickname} not found in the pipeline. Please provide lora_path to load it."
        )
    # Make sure the transformer layers have been swapped for LoRA layers.
    if not self.lora_initialized:
        self.convert_to_lora_layers()
    adapter_updated = False
    # Used in log messages and to restrict warnings to rank 0.
    rank = dist.get_rank()
    # Only (re)load from disk when a path is given and it is not the one
    # already loaded.
    if lora_path is not None and lora_path != self.cur_adapter_path:
        lora_local_path = maybe_download_lora(lora_path)
        lora_state_dict = load_file(lora_local_path)

        # Map the hf layer names to our custom layer names
        param_names_mapping_fn = get_param_names_mapping(
            self.modules["transformer"].param_names_mapping)
        lora_param_names_mapping_fn = get_param_names_mapping(
            self.modules["transformer"].lora_param_names_mapping)

        # Extract alpha values and weights in a single pass
        # to_merge_params collects the pieces of fused parameters, keyed by
        # target name and then by merge index.
        to_merge_params: defaultdict[Hashable,
                                     dict[Any, Any]] = defaultdict(dict)
        for name, weight in lora_state_dict.items():
            # Extract weights (lora_A, lora_B, and lora_alpha)
            name = name.replace("diffusion_model.", "")
            name = name.replace(".weight", "")

            if "lora_alpha" in name:
                # Store alpha with minimal mapping - same processing as lora_A/lora_B
                # but store in lora_adapters with ".lora_alpha" suffix
                layer_name = name.replace(".lora_alpha", "")
                layer_name, _, _ = lora_param_names_mapping_fn(layer_name)
                target_name, _, _ = param_names_mapping_fn(layer_name)
                # Store alpha alongside weights with same target_name base
                alpha_key = target_name + ".lora_alpha"
                # A one-element tensor is stored as its scalar; anything
                # larger is reduced to its mean.
                # presumably lora_adapters creates the per-nickname dict on
                # first access (e.g. a defaultdict) — TODO confirm
                self.lora_adapters[lora_nickname][alpha_key] = weight.item(
                ) if weight.numel() == 1 else float(weight.mean())
                continue

            name, _, _ = lora_param_names_mapping_fn(name)
            target_name, merge_index, num_params_to_merge = param_names_mapping_fn(
                name)
            # for (in_dim, r) @ (r, out_dim), we only merge (r, out_dim * n) where n is the number of linear layers to fuse
            # see param mapping in HunyuanVideoArchConfig
            if merge_index is not None and "lora_B" in name:
                to_merge_params[target_name][merge_index] = weight
                if len(to_merge_params[target_name]) == num_params_to_merge:
                    # cat at output dim according to the merge_index order
                    sorted_tensors = [
                        to_merge_params[target_name][i]
                        for i in range(num_params_to_merge)
                    ]
                    weight = torch.cat(sorted_tensors, dim=1)
                    del to_merge_params[target_name]
                else:
                    # Wait until every piece of this fused parameter has
                    # been seen.
                    # NOTE(review): groups that never become complete stay
                    # in to_merge_params and are not stored — confirm this
                    # is intended.
                    continue

            if target_name in self.lora_adapters[lora_nickname]:
                raise ValueError(
                    f"Target name {target_name} already exists in lora_adapters[{lora_nickname}]"
                )
            self.lora_adapters[lora_nickname][target_name] = weight.to(
                self.device)
        adapter_updated = True
        self.cur_adapter_path = lora_path
        logger.info("Rank %d: loaded LoRA adapter %s", rank, lora_path)

    # Nothing new was loaded and this adapter is already the active one.
    if not adapter_updated and self.cur_adapter_name == lora_nickname:
        return
    self.cur_adapter_name = lora_nickname

    # Merge the new adapter
    adapted_count = 0
    for transformer_name, transformer_lora_layers in self.lora_layers.items(
    ):
        for name, layer in transformer_lora_layers.items():
            lora_A_name = name + ".lora_A"
            lora_B_name = name + ".lora_B"
            lora_alpha_name = name + ".lora_alpha"
            # Both A and B must be present for the layer to be adapted.
            if lora_A_name in self.lora_adapters[lora_nickname]\
                and lora_B_name in self.lora_adapters[lora_nickname]:
                # Get alpha value for this layer (defaults to None if not present)
                lora_A = self.lora_adapters[lora_nickname][lora_A_name]
                lora_B = self.lora_adapters[lora_nickname][lora_B_name]
                # Simple lookup - alpha stored with same naming scheme as lora_A/lora_B
                # alpha is only passed when the adapter was (re)loaded in
                # this call; otherwise None is passed.
                alpha = self.lora_adapters[lora_nickname].get(
                    lora_alpha_name) if adapter_updated else None

                layer.set_lora_weights(
                    lora_A,
                    lora_B,
                    lora_alpha=alpha,
                    training_mode=self.fastvideo_args.training_mode,
                    lora_path=lora_path)
                adapted_count += 1
            else:
                if rank == 0:
                    logger.warning(
                        "LoRA adapter %s does not contain the weights for layer %s. LoRA will not be applied to it.",
                        lora_path, name)
                layer.disable_lora = True
    logger.info("Rank %d: LoRA adapter %s applied to %d layers", rank,
                lora_path, adapted_count)

fastvideo.pipelines.PipelineWithLoRA ¶

PipelineWithLoRA(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Type for a pipeline that has both ComposedPipelineBase and LoRAPipeline functionality.

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """
    Construct the base pipeline, then prepare its transformers for LoRA.

    All arguments are forwarded to the parent constructor. Afterwards the
    loaded trainable transformers are collected, LoRA settings are copied
    from ``self.fastvideo_args``, and layers are converted either for LoRA
    training or, at inference time, when a ``lora_path`` is configured.

    Raises:
        ValueError: If an entry of ``trainable_transformer_names`` ends
            with ``"_2"``.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Register each trainable transformer that was loaded, together with
    # its "<name>_2" companion when present (Wan2.2 MoE or
    # fake_score_transformer_2).
    for base_name in self.trainable_transformer_names:
        primary_module = self.modules.get(base_name)
        if primary_module is not None:
            self.trainable_transformer_modules[base_name] = primary_module

        if base_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {base_name}"
            )

        companion_name = base_name + "_2"
        companion_module = self.modules.get(companion_name)
        if companion_module is not None:
            self.trainable_transformer_modules[
                companion_name] = companion_module

    logger.info("trainable_transformer_modules: %s",
                self.trainable_transformer_modules.keys())

    # Each transformer's arch config names the layers LoRA must skip.
    for name, module in self.trainable_transformer_modules.items():
        self.exclude_lora_layers[
            name] = module.config.arch_config.exclude_lora_layers

    fv_args = self.fastvideo_args
    self.lora_target_modules = fv_args.lora_target_modules
    self.lora_path = fv_args.lora_path
    self.lora_nickname = fv_args.lora_nickname
    self.training_mode = fv_args.training_mode

    if self.training_mode and getattr(fv_args, "lora_training", False):
        # LoRA training
        assert isinstance(fv_args, TrainingArgs)
        # alpha defaults to the rank when it was not given
        if fv_args.lora_alpha is None:
            fv_args.lora_alpha = fv_args.lora_rank
        self.lora_rank = fv_args.lora_rank  # type: ignore
        self.lora_alpha = fv_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
    elif not self.training_mode and self.lora_path is not None:
        # Inference with a configured adapter
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.build_pipeline ¶

build_pipeline(fastvideo_args: FastVideoArgs, pipeline_type: PipelineType | str = BASIC) -> PipelineWithLoRA

Only works with valid hf diffusers configs (model_index.json). We want to build a pipeline based on the inference args' model_path: (1) download the model from the hub if it's not already downloaded; (2) verify the model config and directory; (3) based on the config, determine the pipeline class.

Source code in fastvideo/pipelines/__init__.py

def build_pipeline(
        fastvideo_args: FastVideoArgs,
        pipeline_type: PipelineType | str = PipelineType.BASIC
) -> PipelineWithLoRA:
    """
    Build a pipeline from a diffusers-format model directory.

    Only works with valid HF diffusers configs (model_index.json). Starting
    from ``fastvideo_args.model_path`` this:
    1. downloads the model from the hub if it's not already downloaded
    2. verifies the model config and directory
    3. picks the pipeline class named by the config's ``_class_name`` (or by
       ``fastvideo_args.override_pipeline_cls_name`` when that is set) and
       instantiates it with the resolved model path

    Raises:
        ValueError: If neither the config nor the override supplies a
            pipeline class name.
    """
    model_path = maybe_download_model(fastvideo_args.model_path)
    logger.info("Model path: %s", model_path)

    model_config = verify_model_config_and_directory(model_path)
    pipeline_name = model_config.get("_class_name")

    # An explicit override wins over whatever the model config declares.
    override_name = fastvideo_args.override_pipeline_cls_name
    if override_name:
        logger.info("Overriding pipeline class name from %s to %s",
                    pipeline_name, override_name)
        pipeline_name = override_name

    if pipeline_name is None:
        raise ValueError(
            "Model config does not contain a _class_name attribute. "
            "Only diffusers format is supported.")

    # Log the enum's value when we have an enum, otherwise the raw argument.
    if isinstance(pipeline_type, PipelineType):
        type_label = pipeline_type.value
    else:
        type_label = pipeline_type
    logger.info("Building pipeline of type: %s", type_label)

    # The registry lookup receives pipeline_type exactly as the caller passed
    # it; normalisation to PipelineType only happens afterwards.
    registry = get_pipeline_registry(pipeline_type)

    if isinstance(pipeline_type, str):
        pipeline_type = PipelineType.from_string(pipeline_type)

    pipeline_cls = registry.resolve_pipeline_cls(pipeline_name, pipeline_type,
                                                 fastvideo_args.workload_type)

    pipeline = pipeline_cls(model_path, fastvideo_args)
    logger.info("Pipelines instantiated")

    return cast(PipelineWithLoRA, pipeline)

Modules¶

fastvideo.pipelines.basic ¶

Basic inference pipelines for fastvideo.

This package contains basic pipelines for video and image generation.

Modules¶

fastvideo.pipelines.basic.cosmos ¶

Modules¶

fastvideo.pipelines.basic.cosmos.cosmos_pipeline ¶

Cosmos video diffusion pipeline implementation.

This module contains an implementation of the Cosmos video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.cosmos.cosmos_pipeline.Cosmos2VideoToWorldPipeline ¶

Cosmos2VideoToWorldPipeline(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: ComposedPipelineBase

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Initialize the pipeline. After __init__, the pipeline should be ready to
    use. The pipeline should be stateless and not hold any batch state.

    Args:
        model_path: Stored on the instance as ``self.model_path``.
        fastvideo_args: Stored as ``self.fastvideo_args``. Its ``tp_size``
            and ``sp_size`` are handed to the distributed setup helper and
            the whole object is forwarded to ``load_modules``.
        required_config_modules: When not ``None``, replaces
            ``self._required_config_modules`` for this instance.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``_required_config_modules`` is still
            ``None`` after the optional override has been applied.
    """
    self.fastvideo_args = fastvideo_args

    self.model_path: str = model_path
    # Stage containers start empty; presumably populated later through
    # add_stage() -- confirm against the rest of the class.
    self._stages: list[PipelineStage] = []
    self._stage_name_mapping: dict[str, PipelineStage] = {}

    # A caller-supplied list takes precedence over the existing attribute.
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules

    # Without an override the attribute must already exist (the error message
    # indicates subclasses are expected to set it).
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    # NOTE(review): this runs before get_world_group() below; presumably it
    # sets up the groups that call relies on -- keep the order.
    maybe_init_distributed_environment_and_model_parallel(
        fastvideo_args.tp_size, fastvideo_args.sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    trace_dir = envs.FASTVIDEO_TORCH_PROFILER_DIR
    self.profiler_controller = get_or_create_profiler(trace_dir)
    self.profiler = self.profiler_controller.profiler

    self.local_rank = get_world_group().local_rank

    # Load modules directly in initialization
    logger.info("Loading pipeline modules...")
    # Module loading happens inside a named profiler region.
    with self.profiler_controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.basic.cosmos.cosmos_pipeline.Cosmos2VideoToWorldPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/cosmos/cosmos_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Register the Cosmos stages on this pipeline, one ``add_stage`` call each.

    Stages are added in this order: input validation, prompt encoding,
    conditioning, timestep preparation, latent preparation, denoising and
    decoding. Each stage receives the modules it needs via ``get_module``.
    ``fastvideo_args`` is accepted but not read here.
    """
    register = self.add_stage
    module = self.get_module

    register(stage_name="input_validation_stage",
             stage=InputValidationStage())

    prompt_encoder = TextEncodingStage(
        text_encoders=[module("text_encoder")],
        tokenizers=[module("tokenizer")],
    )
    register(stage_name="prompt_encoding_stage", stage=prompt_encoder)

    register(stage_name="conditioning_stage", stage=ConditioningStage())

    timestep_prep = TimestepPreparationStage(scheduler=module("scheduler"))
    register(stage_name="timestep_preparation_stage", stage=timestep_prep)

    # The Cosmos latent preparation stage also receives the VAE.
    latent_prep = CosmosLatentPreparationStage(
        scheduler=module("scheduler"),
        transformer=module("transformer"),
        vae=module("vae"))
    register(stage_name="latent_preparation_stage", stage=latent_prep)

    denoiser = CosmosDenoisingStage(transformer=module("transformer"),
                                    scheduler=module("scheduler"))
    register(stage_name="denoising_stage", stage=denoiser)

    decoder = DecodingStage(vae=module("vae"))
    register(stage_name="decoding_stage", stage=decoder)

Functions¶

fastvideo.pipelines.basic.hunyuan ¶

Modules¶

fastvideo.pipelines.basic.hunyuan.hunyuan_pipeline ¶

Hunyuan video diffusion pipeline implementation.

This module contains an implementation of the Hunyuan video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.hunyuan.hunyuan_pipeline.HunyuanVideoPipeline ¶

HunyuanVideoPipeline(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: ComposedPipelineBase

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Initialize the pipeline. After __init__, the pipeline should be ready to
    use. The pipeline should be stateless and not hold any batch state.

    Args:
        model_path: Stored on the instance as ``self.model_path``.
        fastvideo_args: Stored as ``self.fastvideo_args``. Its ``tp_size``
            and ``sp_size`` are handed to the distributed setup helper and
            the whole object is forwarded to ``load_modules``.
        required_config_modules: When not ``None``, replaces
            ``self._required_config_modules`` for this instance.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``_required_config_modules`` is still
            ``None`` after the optional override has been applied.
    """
    self.fastvideo_args = fastvideo_args

    self.model_path: str = model_path
    # Stage containers start empty; presumably populated later through
    # add_stage() -- confirm against the rest of the class.
    self._stages: list[PipelineStage] = []
    self._stage_name_mapping: dict[str, PipelineStage] = {}

    # A caller-supplied list takes precedence over the existing attribute.
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules

    # Without an override the attribute must already exist (the error message
    # indicates subclasses are expected to set it).
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    # NOTE(review): this runs before get_world_group() below; presumably it
    # sets up the groups that call relies on -- keep the order.
    maybe_init_distributed_environment_and_model_parallel(
        fastvideo_args.tp_size, fastvideo_args.sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    trace_dir = envs.FASTVIDEO_TORCH_PROFILER_DIR
    self.profiler_controller = get_or_create_profiler(trace_dir)
    self.profiler = self.profiler_controller.profiler

    self.local_rank = get_world_group().local_rank

    # Load modules directly in initialization
    logger.info("Loading pipeline modules...")
    # Module loading happens inside a named profiler region.
    with self.profiler_controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.basic.hunyuan.hunyuan_pipeline.HunyuanVideoPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/hunyuan/hunyuan_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Register the Hunyuan stages on this pipeline, one ``add_stage`` call each.

    Stages are added in this order: input validation, prompt encoding (two
    text encoders with their tokenizers), conditioning, timestep
    preparation, latent preparation, denoising and decoding.
    ``fastvideo_args`` is accepted but not read here.
    """
    register = self.add_stage
    module = self.get_module

    register(stage_name="input_validation_stage",
             stage=InputValidationStage())

    # Both encoder/tokenizer pairs are handled by a single encoding stage.
    encoders = [module("text_encoder"), module("text_encoder_2")]
    tokenizers = [module("tokenizer"), module("tokenizer_2")]
    prompt_encoder = TextEncodingStage(
        text_encoders=encoders,
        tokenizers=tokenizers,
    )
    register(stage_name="prompt_encoding_stage_primary",
             stage=prompt_encoder)

    register(stage_name="conditioning_stage", stage=ConditioningStage())

    timestep_prep = TimestepPreparationStage(scheduler=module("scheduler"))
    register(stage_name="timestep_preparation_stage", stage=timestep_prep)

    latent_prep = LatentPreparationStage(scheduler=module("scheduler"),
                                         transformer=module("transformer"))
    register(stage_name="latent_preparation_stage", stage=latent_prep)

    denoiser = DenoisingStage(transformer=module("transformer"),
                              scheduler=module("scheduler"))
    register(stage_name="denoising_stage", stage=denoiser)

    decoder = DecodingStage(vae=module("vae"))
    register(stage_name="decoding_stage", stage=decoder)

Functions¶

fastvideo.pipelines.basic.hunyuan15 ¶

Modules¶

fastvideo.pipelines.basic.hunyuan15.hunyuan15_pipeline ¶

HunyuanVideo 1.5 video diffusion pipeline implementation.

This module contains an implementation of the HunyuanVideo 1.5 video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.hunyuan15.hunyuan15_pipeline.HunyuanVideo15Pipeline ¶

HunyuanVideo15Pipeline(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: ComposedPipelineBase

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Initialize the pipeline. After __init__, the pipeline should be ready to
    use. The pipeline should be stateless and not hold any batch state.

    Args:
        model_path: Stored on the instance as ``self.model_path``.
        fastvideo_args: Stored as ``self.fastvideo_args``. Its ``tp_size``
            and ``sp_size`` are handed to the distributed setup helper and
            the whole object is forwarded to ``load_modules``.
        required_config_modules: When not ``None``, replaces
            ``self._required_config_modules`` for this instance.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``_required_config_modules`` is still
            ``None`` after the optional override has been applied.
    """
    self.fastvideo_args = fastvideo_args

    self.model_path: str = model_path
    # Stage containers start empty; presumably populated later through
    # add_stage() -- confirm against the rest of the class.
    self._stages: list[PipelineStage] = []
    self._stage_name_mapping: dict[str, PipelineStage] = {}

    # A caller-supplied list takes precedence over the existing attribute.
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules

    # Without an override the attribute must already exist (the error message
    # indicates subclasses are expected to set it).
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    # NOTE(review): this runs before get_world_group() below; presumably it
    # sets up the groups that call relies on -- keep the order.
    maybe_init_distributed_environment_and_model_parallel(
        fastvideo_args.tp_size, fastvideo_args.sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    trace_dir = envs.FASTVIDEO_TORCH_PROFILER_DIR
    self.profiler_controller = get_or_create_profiler(trace_dir)
    self.profiler = self.profiler_controller.profiler

    self.local_rank = get_world_group().local_rank

    # Load modules directly in initialization
    logger.info("Loading pipeline modules...")
    # Module loading happens inside a named profiler region.
    with self.profiler_controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.basic.hunyuan15.hunyuan15_pipeline.HunyuanVideo15Pipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/hunyuan15/hunyuan15_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Register the HunyuanVideo 1.5 stages, one ``add_stage`` call each.

    Stages are added in this order: input validation, prompt encoding (two
    text encoders with their tokenizers), conditioning, timestep
    preparation, latent preparation, image encoding, denoising and decoding.
    ``fastvideo_args`` is accepted but not read here.
    """
    register = self.add_stage
    module = self.get_module

    register(stage_name="input_validation_stage",
             stage=InputValidationStage())

    # Both encoder/tokenizer pairs are handled by a single encoding stage.
    encoders = [module("text_encoder"), module("text_encoder_2")]
    tokenizers = [module("tokenizer"), module("tokenizer_2")]
    prompt_encoder = TextEncodingStage(
        text_encoders=encoders,
        tokenizers=tokenizers,
    )
    register(stage_name="prompt_encoding_stage_primary",
             stage=prompt_encoder)

    register(stage_name="conditioning_stage", stage=ConditioningStage())

    timestep_prep = TimestepPreparationStage(scheduler=module("scheduler"))
    register(stage_name="timestep_preparation_stage", stage=timestep_prep)

    latent_prep = LatentPreparationStage(scheduler=module("scheduler"),
                                         transformer=module("transformer"))
    register(stage_name="latent_preparation_stage", stage=latent_prep)

    # The image encoding stage is built with no encoder and no processor.
    image_encoder_stage = Hy15ImageEncodingStage(image_encoder=None,
                                                 image_processor=None)
    register(stage_name="image_encoding_stage", stage=image_encoder_stage)

    denoiser = DenoisingStage(transformer=module("transformer"),
                              scheduler=module("scheduler"))
    register(stage_name="denoising_stage", stage=denoiser)

    decoder = DecodingStage(vae=module("vae"))
    register(stage_name="decoding_stage", stage=decoder)

Functions¶

fastvideo.pipelines.basic.longcat ¶

LongCat pipeline module.

Classes¶

fastvideo.pipelines.basic.longcat.LongCatPipeline ¶

LongCatPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

LongCat video diffusion pipeline with LoRA support.

Phase 1 implementation using wrapper modules from third_party/longcat_video. This validates the pipeline infrastructure before full FastVideo integration.

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """
    Initialize the base pipeline, then collect the trainable transformers
    and apply LoRA according to ``self.fastvideo_args``.

    All positional and keyword arguments are forwarded unchanged to
    ``super().__init__``.

    After the transformers are collected, one of two LoRA paths may run:
      * Training: ``training_mode`` is truthy and ``fastvideo_args`` has a
        truthy ``lora_training``. ``lora_alpha`` falls back to ``lora_rank``
        when unset, default target modules are used when none are given, and
        the layers are converted.
      * Inference: ``training_mode`` is falsy and ``lora_path`` is set. The
        layers are converted and the adapter at ``lora_path`` is activated
        under ``lora_nickname``.
    If ``training_mode`` is truthy but ``lora_training`` is not, neither
    path runs, even when ``lora_path`` is set.

    Raises:
        ValueError: If an entry of ``self.trainable_transformer_names`` ends
            with ``"_2"``.
        AssertionError: If LoRA training is requested but
            ``self.fastvideo_args`` is not a ``TrainingArgs`` (only while
            assertions are enabled).
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()
    # build list of trainable transformers
    # NOTE(review): trainable_transformer_modules and exclude_lora_layers are
    # filled by item assignment and never created here; presumably they are
    # defined on the class or by a parent __init__ -- confirm.
    for transformer_name in self.trainable_transformer_names:
        # Only register modules that were actually loaded (present, not None).
        if transformer_name in self.modules and self.modules[
                transformer_name] is not None:
            self.trainable_transformer_modules[
                transformer_name] = self.modules[transformer_name]
        # check for transformer_2 in case of Wan2.2 MoE or fake_score_transformer_2
        # This check runs after the registration above, so an offending name
        # may already have been added to the mapping when the error is raised.
        if transformer_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {transformer_name}"
            )

        # The "<name>_2" companion is picked up automatically when loaded.
        secondary_transformer_name = transformer_name + "_2"
        if secondary_transformer_name in self.modules and self.modules[
                secondary_transformer_name] is not None:
            self.trainable_transformer_modules[
                secondary_transformer_name] = self.modules[
                    secondary_transformer_name]

    logger.info("trainable_transformer_modules: %s",
                self.trainable_transformer_modules.keys())

    # Record, per transformer, the exclusion list from its arch config.
    for transformer_name, transformer_module in self.trainable_transformer_modules.items(
    ):
        self.exclude_lora_layers[
            transformer_name] = transformer_module.config.arch_config.exclude_lora_layers
    # Mirror the LoRA-related settings from the args onto the instance.
    self.lora_target_modules = self.fastvideo_args.lora_target_modules
    self.lora_path = self.fastvideo_args.lora_path
    self.lora_nickname = self.fastvideo_args.lora_nickname
    self.training_mode = self.fastvideo_args.training_mode
    # getattr with a default: the args object may not define lora_training.
    if self.training_mode and getattr(self.fastvideo_args, "lora_training",
                                      False):
        assert isinstance(self.fastvideo_args, TrainingArgs)
        # An unset alpha defaults to the rank; note this writes back into the
        # shared args object, not just this instance.
        if self.fastvideo_args.lora_alpha is None:
            self.fastvideo_args.lora_alpha = self.fastvideo_args.lora_rank
        self.lora_rank = self.fastvideo_args.lora_rank  # type: ignore
        self.lora_alpha = self.fastvideo_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is None:
            # Default module-name list used when the user gave no targets.
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)
        else:
            # The same custom list is applied to every transformer.
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
    # Inference
    elif not self.training_mode and self.lora_path is not None:
        # Convert first, then activate the adapter stored at lora_path.
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.longcat.LongCatPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs) -> None

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/longcat/longcat_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs) -> None:
    """Register the LongCat stages on this pipeline, one ``add_stage`` call each.

    Stages are added in this order: input validation, prompt encoding,
    refine initialization, timestep preparation, refine timestep override,
    latent preparation, denoising and decoding. ``fastvideo_args`` is
    accepted but not read here.
    """
    register = self.add_stage
    module = self.get_module

    register(stage_name="input_validation_stage",
             stage=InputValidationStage())

    prompt_encoder = TextEncodingStage(
        text_encoders=[module("text_encoder")],
        tokenizers=[module("tokenizer")],
    )
    register(stage_name="prompt_encoding_stage", stage=prompt_encoder)

    # Refine initialization stage (will be skipped if not refining).
    refine_init = LongCatRefineInitStage(vae=module("vae"))
    register(stage_name="longcat_refine_init_stage", stage=refine_init)

    # Generic timesteps come first (for non-refine paths) ...
    timestep_prep = TimestepPreparationStage(scheduler=module("scheduler"))
    register(stage_name="timestep_preparation_stage", stage=timestep_prep)

    # ... then the refinement override (a no-op if not refining), matching
    # LongCat's generate_refine schedule.
    refine_timesteps = LongCatRefineTimestepStage(
        scheduler=module("scheduler"))
    register(stage_name="longcat_refine_timestep_stage",
             stage=refine_timesteps)

    # The transformer is looked up with a None default for this stage.
    latent_prep = LatentPreparationStage(
        scheduler=module("scheduler"),
        transformer=module("transformer", None))
    register(stage_name="latent_preparation_stage", stage=latent_prep)

    # transformer_2 is optional; the stage also gets a handle on the pipeline.
    denoiser = LongCatDenoisingStage(
        transformer=module("transformer"),
        transformer_2=module("transformer_2", None),
        scheduler=module("scheduler"),
        vae=module("vae"),
        pipeline=self)
    register(stage_name="denoising_stage", stage=denoiser)

    decoder = DecodingStage(vae=module("vae"), pipeline=self)
    register(stage_name="decoding_stage", stage=decoder)

fastvideo.pipelines.basic.longcat.LongCatPipeline.initialize_pipeline ¶

initialize_pipeline(fastvideo_args: FastVideoArgs)

Initialize LongCat-specific components.

Source code in fastvideo/pipelines/basic/longcat/longcat_pipeline.py

def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
    """Turn Block Sparse Attention (BSA) on or off for the LongCat transformer.

    When ``pipeline_config.enable_bsa`` is truthy, the effective BSA
    parameters are assembled, BSA is enabled on the transformer (if it
    exposes ``enable_bsa``) and the parameters are attached to the
    ``self_attn`` of every block that has one. All blocks receive the same
    dict object. Otherwise BSA is disabled when the transformer exposes
    ``disable_bsa``.

    Raises:
        RuntimeError: If no "transformer" module is available.
    """
    cfg = fastvideo_args.pipeline_config
    model = self.get_module("transformer", None)
    if model is None:
        raise RuntimeError(
            "Transformer module not found during initializing LongCat pipeline."
        )

    if not cfg.enable_bsa:
        # BSA not requested: switch it off explicitly when supported.
        if hasattr(model, 'disable_bsa'):
            model.disable_bsa()
        return

    # Precedence for each parameter:
    # 1) the individual bsa_* override when it is not None
    # 2) else the entry from pipeline_config.bsa_params
    # 3) else the fallback default (where one exists)
    base_params = cfg.bsa_params
    overrides = (
        ('sparsity', cfg.bsa_sparsity),
        ('cdf_threshold', cfg.bsa_cdf_threshold),
        ('chunk_3d_shape_q', cfg.bsa_chunk_q),
        ('chunk_3d_shape_k', cfg.bsa_chunk_k),
    )
    fallbacks = (
        ('sparsity', 0.9375),
        ('chunk_3d_shape_q', [4, 4, 4]),
        ('chunk_3d_shape_k', [4, 4, 4]),
    )

    # Copy so the config's own dict is never mutated.
    bsa_params = dict(base_params) if isinstance(base_params, dict) else {}
    for key, value in overrides:
        if value is not None:
            bsa_params[key] = value
    for key, fallback in fallbacks:
        bsa_params.setdefault(key, fallback)

    if not hasattr(model, 'enable_bsa'):
        logger.warning(
            "BSA is enabled in config but transformer does not support it"
        )
        return

    logger.info(
        "Enabling Block Sparse Attention (BSA) for LongCat transformer"
    )
    model.enable_bsa()
    # Propagate params to all attention modules; best-effort, a failure is
    # logged and does not abort initialization.
    if hasattr(model, 'blocks'):
        try:
            for block in model.blocks:
                if not hasattr(block, 'self_attn'):
                    continue
                block.self_attn.bsa_params = bsa_params
        except Exception as e:
            logger.warning(
                "Failed to set BSA params on all blocks: %s", e)
    logger.info("BSA parameters in effect: %s",
                bsa_params)

Modules¶

fastvideo.pipelines.basic.longcat.longcat_pipeline ¶

LongCat video diffusion pipeline implementation (Phase 1: Wrapper).

This module contains a wrapper implementation of the LongCat video diffusion pipeline using FastVideo's modular pipeline architecture with the original LongCat modules.

Classes¶

fastvideo.pipelines.basic.longcat.longcat_pipeline.LongCatPipeline ¶

LongCatPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

LongCat video diffusion pipeline with LoRA support.

Phase 1 implementation using wrapper modules from third_party/longcat_video. This validates the pipeline infrastructure before full FastVideo integration.

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """
    Initialize the base pipeline, then collect the trainable transformers
    and apply LoRA according to ``self.fastvideo_args``.

    All positional and keyword arguments are forwarded unchanged to
    ``super().__init__``.

    After the transformers are collected, one of two LoRA paths may run:
      * Training: ``training_mode`` is truthy and ``fastvideo_args`` has a
        truthy ``lora_training``. ``lora_alpha`` falls back to ``lora_rank``
        when unset, default target modules are used when none are given, and
        the layers are converted.
      * Inference: ``training_mode`` is falsy and ``lora_path`` is set. The
        layers are converted and the adapter at ``lora_path`` is activated
        under ``lora_nickname``.
    If ``training_mode`` is truthy but ``lora_training`` is not, neither
    path runs, even when ``lora_path`` is set.

    Raises:
        ValueError: If an entry of ``self.trainable_transformer_names`` ends
            with ``"_2"``.
        AssertionError: If LoRA training is requested but
            ``self.fastvideo_args`` is not a ``TrainingArgs`` (only while
            assertions are enabled).
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()
    # build list of trainable transformers
    # NOTE(review): trainable_transformer_modules and exclude_lora_layers are
    # filled by item assignment and never created here; presumably they are
    # defined on the class or by a parent __init__ -- confirm.
    for transformer_name in self.trainable_transformer_names:
        # Only register modules that were actually loaded (present, not None).
        if transformer_name in self.modules and self.modules[
                transformer_name] is not None:
            self.trainable_transformer_modules[
                transformer_name] = self.modules[transformer_name]
        # check for transformer_2 in case of Wan2.2 MoE or fake_score_transformer_2
        # This check runs after the registration above, so an offending name
        # may already have been added to the mapping when the error is raised.
        if transformer_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {transformer_name}"
            )

        # The "<name>_2" companion is picked up automatically when loaded.
        secondary_transformer_name = transformer_name + "_2"
        if secondary_transformer_name in self.modules and self.modules[
                secondary_transformer_name] is not None:
            self.trainable_transformer_modules[
                secondary_transformer_name] = self.modules[
                    secondary_transformer_name]

    logger.info("trainable_transformer_modules: %s",
                self.trainable_transformer_modules.keys())

    # Record, per transformer, the exclusion list from its arch config.
    for transformer_name, transformer_module in self.trainable_transformer_modules.items(
    ):
        self.exclude_lora_layers[
            transformer_name] = transformer_module.config.arch_config.exclude_lora_layers
    # Mirror the LoRA-related settings from the args onto the instance.
    self.lora_target_modules = self.fastvideo_args.lora_target_modules
    self.lora_path = self.fastvideo_args.lora_path
    self.lora_nickname = self.fastvideo_args.lora_nickname
    self.training_mode = self.fastvideo_args.training_mode
    # getattr with a default: the args object may not define lora_training.
    if self.training_mode and getattr(self.fastvideo_args, "lora_training",
                                      False):
        assert isinstance(self.fastvideo_args, TrainingArgs)
        # An unset alpha defaults to the rank; note this writes back into the
        # shared args object, not just this instance.
        if self.fastvideo_args.lora_alpha is None:
            self.fastvideo_args.lora_alpha = self.fastvideo_args.lora_rank
        self.lora_rank = self.fastvideo_args.lora_rank  # type: ignore
        self.lora_alpha = self.fastvideo_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is None:
            # Default module-name list used when the user gave no targets.
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)
        else:
            # The same custom list is applied to every transformer.
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
    # Inference
    elif not self.training_mode and self.lora_path is not None:
        # Convert first, then activate the adapter stored at lora_path.
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.longcat.longcat_pipeline.LongCatPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs) -> None

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/longcat/longcat_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs) -> None:
    """Register the LongCat stages on this pipeline, one ``add_stage`` call each.

    Stages are added in this order: input validation, prompt encoding,
    refine initialization, timestep preparation, refine timestep override,
    latent preparation, denoising and decoding. ``fastvideo_args`` is
    accepted but not read here.
    """
    register = self.add_stage
    module = self.get_module

    register(stage_name="input_validation_stage",
             stage=InputValidationStage())

    prompt_encoder = TextEncodingStage(
        text_encoders=[module("text_encoder")],
        tokenizers=[module("tokenizer")],
    )
    register(stage_name="prompt_encoding_stage", stage=prompt_encoder)

    # Refine initialization stage (will be skipped if not refining).
    refine_init = LongCatRefineInitStage(vae=module("vae"))
    register(stage_name="longcat_refine_init_stage", stage=refine_init)

    # Generic timesteps come first (for non-refine paths) ...
    timestep_prep = TimestepPreparationStage(scheduler=module("scheduler"))
    register(stage_name="timestep_preparation_stage", stage=timestep_prep)

    # ... then the refinement override (a no-op if not refining), matching
    # LongCat's generate_refine schedule.
    refine_timesteps = LongCatRefineTimestepStage(
        scheduler=module("scheduler"))
    register(stage_name="longcat_refine_timestep_stage",
             stage=refine_timesteps)

    # The transformer is looked up with a None default for this stage.
    latent_prep = LatentPreparationStage(
        scheduler=module("scheduler"),
        transformer=module("transformer", None))
    register(stage_name="latent_preparation_stage", stage=latent_prep)

    # transformer_2 is optional; the stage also gets a handle on the pipeline.
    denoiser = LongCatDenoisingStage(
        transformer=module("transformer"),
        transformer_2=module("transformer_2", None),
        scheduler=module("scheduler"),
        vae=module("vae"),
        pipeline=self)
    register(stage_name="denoising_stage", stage=denoiser)

    decoder = DecodingStage(vae=module("vae"), pipeline=self)
    register(stage_name="decoding_stage", stage=decoder)

fastvideo.pipelines.basic.longcat.longcat_pipeline.LongCatPipeline.initialize_pipeline ¶

initialize_pipeline(fastvideo_args: FastVideoArgs)

Initialize LongCat-specific components.

Source code in fastvideo/pipelines/basic/longcat/longcat_pipeline.py

def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
    """Turn Block Sparse Attention (BSA) on or off for the LongCat transformer.

    When ``pipeline_config.enable_bsa`` is truthy, the effective BSA
    parameters are assembled, BSA is enabled on the transformer (if it
    exposes ``enable_bsa``) and the parameters are attached to the
    ``self_attn`` of every block that has one. All blocks receive the same
    dict object. Otherwise BSA is disabled when the transformer exposes
    ``disable_bsa``.

    Raises:
        RuntimeError: If no "transformer" module is available.
    """
    cfg = fastvideo_args.pipeline_config
    model = self.get_module("transformer", None)
    if model is None:
        raise RuntimeError(
            "Transformer module not found during initializing LongCat pipeline."
        )

    if not cfg.enable_bsa:
        # BSA not requested: switch it off explicitly when supported.
        if hasattr(model, 'disable_bsa'):
            model.disable_bsa()
        return

    # Precedence for each parameter:
    # 1) the individual bsa_* override when it is not None
    # 2) else the entry from pipeline_config.bsa_params
    # 3) else the fallback default (where one exists)
    base_params = cfg.bsa_params
    overrides = (
        ('sparsity', cfg.bsa_sparsity),
        ('cdf_threshold', cfg.bsa_cdf_threshold),
        ('chunk_3d_shape_q', cfg.bsa_chunk_q),
        ('chunk_3d_shape_k', cfg.bsa_chunk_k),
    )
    fallbacks = (
        ('sparsity', 0.9375),
        ('chunk_3d_shape_q', [4, 4, 4]),
        ('chunk_3d_shape_k', [4, 4, 4]),
    )

    # Copy so the config's own dict is never mutated.
    bsa_params = dict(base_params) if isinstance(base_params, dict) else {}
    for key, value in overrides:
        if value is not None:
            bsa_params[key] = value
    for key, fallback in fallbacks:
        bsa_params.setdefault(key, fallback)

    if not hasattr(model, 'enable_bsa'):
        logger.warning(
            "BSA is enabled in config but transformer does not support it"
        )
        return

    logger.info(
        "Enabling Block Sparse Attention (BSA) for LongCat transformer"
    )
    model.enable_bsa()
    # Propagate params to all attention modules; best-effort, a failure is
    # logged and does not abort initialization.
    if hasattr(model, 'blocks'):
        try:
            for block in model.blocks:
                if not hasattr(block, 'self_attn'):
                    continue
                block.self_attn.bsa_params = bsa_params
        except Exception as e:
            logger.warning(
                "Failed to set BSA params on all blocks: %s", e)
    logger.info("BSA parameters in effect: %s",
                bsa_params)

Functions¶

fastvideo.pipelines.basic.matrixgame ¶

Modules¶

fastvideo.pipelines.basic.matrixgame.matrixgame_causal_dmd_pipeline ¶

Matrix-Game causal DMD pipeline implementation.

Classes¶ Functions¶

fastvideo.pipelines.basic.matrixgame.matrixgame_i2v_pipeline ¶

Matrix-Game I2V pipeline implementation.

Classes¶ Functions¶

fastvideo.pipelines.basic.stepvideo ¶

Modules¶

fastvideo.pipelines.basic.stepvideo.stepvideo_pipeline ¶

StepVideo video diffusion pipeline implementation.

This module contains an implementation of the StepVideo video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.stepvideo.stepvideo_pipeline.StepVideoPipeline ¶

StepVideoPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """Initialize the base pipeline, then configure LoRA state.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer. Afterwards this method:

    1. Stores the local torch device on ``self.device``.
    2. Registers every entry of ``trainable_transformer_names`` that is
       present (and not ``None``) in ``self.modules``, together with its
       ``<name>_2`` sibling when one is loaded.
    3. Copies each registered module's ``exclude_lora_layers`` setting.
    4. Calls ``convert_to_lora_layers()`` for LoRA training, or for
       inference when ``lora_path`` is set (followed by
       ``set_lora_adapter``).

    Raises:
        ValueError: If a configured trainable transformer name ends in
            ``_2``.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Collect the transformers that LoRA may be applied to.
    trainable = self.trainable_transformer_modules
    for primary_name in self.trainable_transformer_names:
        if primary_name in self.modules:
            primary_module = self.modules[primary_name]
            if primary_module is not None:
                trainable[primary_name] = primary_module

        # The "_2" sibling (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so it must not be listed explicitly.
        if primary_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {primary_name}"
            )

        sibling_name = primary_name + "_2"
        if sibling_name in self.modules:
            sibling_module = self.modules[sibling_name]
            if sibling_module is not None:
                trainable[sibling_name] = sibling_module

    logger.info("trainable_transformer_modules: %s", trainable.keys())

    for registered_name, registered_module in trainable.items():
        arch_config = registered_module.config.arch_config
        self.exclude_lora_layers[
            registered_name] = arch_config.exclude_lora_layers

    pipeline_args = self.fastvideo_args
    self.lora_target_modules = pipeline_args.lora_target_modules
    self.lora_path = pipeline_args.lora_path
    self.lora_nickname = pipeline_args.lora_nickname
    self.training_mode = pipeline_args.training_mode

    wants_lora_training = self.training_mode and getattr(
        pipeline_args, "lora_training", False)
    if wants_lora_training:
        assert isinstance(pipeline_args, TrainingArgs)
        # Alpha falls back to the rank when it was not given.
        if pipeline_args.lora_alpha is None:
            pipeline_args.lora_alpha = pipeline_args.lora_rank
        self.lora_rank = pipeline_args.lora_rank  # type: ignore
        self.lora_alpha = pipeline_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
        return

    # Inference
    if not self.training_mode and self.lora_path is not None:
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.stepvideo.stepvideo_pipeline.StepVideoPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/stepvideo/stepvideo_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Build and register the StepVideo pipeline stages.

    Stages are added one after another: input validation, prompt encoding,
    timestep preparation, latent preparation, denoising and decoding. Each
    stage is handed the modules it needs through ``self.get_module``.
    ``fastvideo_args`` is not read by this method.
    """
    validation = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage", stage=validation)

    prompt_encoding = StepvideoPromptEncodingStage(
        stepllm=self.get_module("text_encoder"),
        clip=self.get_module("text_encoder_2"),
    )
    self.add_stage(stage_name="prompt_encoding_stage",
                   stage=prompt_encoding)

    timestep_preparation = TimestepPreparationStage(
        scheduler=self.get_module("scheduler"))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=timestep_preparation)

    latent_preparation = LatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer"),
    )
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=latent_preparation)

    denoising = DenoisingStage(
        transformer=self.get_module("transformer"),
        scheduler=self.get_module("scheduler"),
    )
    self.add_stage(stage_name="denoising_stage", stage=denoising)

    decoding = DecodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="decoding_stage", stage=decoding)

fastvideo.pipelines.basic.stepvideo.stepvideo_pipeline.StepVideoPipeline.initialize_pipeline ¶

initialize_pipeline(fastvideo_args: FastVideoArgs)

Initialize the pipeline.

Source code in fastvideo/pipelines/basic/stepvideo/stepvideo_pipeline.py

def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
    """
    Build the two text encoders and load the custom op library.

    The encoders are built from the ``step_llm`` and ``hunyuan_clip``
    sub-directories of ``self.model_path`` and registered as
    ``text_encoder`` and ``text_encoder_2``. The shared library is taken
    from ``fastvideo_args.model_path`` when that is a local directory and
    is otherwise fetched with ``hf_hub_download`` using that value as the
    repo id.
    """
    device = get_local_torch_device()

    step_llm_path = os.path.join(self.model_path, "step_llm")
    hunyuan_clip_path = os.path.join(self.model_path, "hunyuan_clip")
    step_llm = self.build_llm(step_llm_path, device)
    hunyuan_clip = self.build_clip(hunyuan_clip_path, device)
    self.add_module("text_encoder", step_llm)
    self.add_module("text_encoder_2", hunyuan_clip)

    library_file = 'lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so'
    if os.path.isdir(fastvideo_args.model_path):
        # local checkout
        library_path = os.path.join(fastvideo_args.model_path, library_file)
    else:
        library_path = hf_hub_download(repo_id=fastvideo_args.model_path,
                                       filename=library_file)
    torch.ops.load_library(library_path)

fastvideo.pipelines.basic.stepvideo.stepvideo_pipeline.StepVideoPipeline.load_modules ¶

load_modules(fastvideo_args: FastVideoArgs) -> dict[str, Any]

Load the modules from the config.

Source code in fastvideo/pipelines/basic/stepvideo/stepvideo_pipeline.py

def load_modules(self, fastvideo_args: FastVideoArgs) -> dict[str, Any]:
    """
    Load every pipeline component listed in the model index.

    The index is read with ``self._load_config(self.model_path)``; its
    ``_class_name`` and ``_diffusers_version`` entries are dropped and each
    remaining entry is loaded from ``<model_path>/<module_name>`` through
    ``PipelineComponentLoader.load_module``.

    Returns:
        A mapping from module name to the loaded component.

    Raises:
        KeyError: If ``_class_name`` or ``_diffusers_version`` is absent
            from the index.
        ValueError: If the index lacks ``transformer``, ``scheduler`` or
            ``vae``, or if a module named in ``self.required_config_modules``
            is missing or ``None`` after loading.
    """
    index = self._load_config(self.model_path)
    logger.info("Loading pipeline modules from config: %s", index)

    # Metadata entries are not pipeline modules.
    for metadata_key in ("_class_name", "_diffusers_version"):
        index.pop(metadata_key)

    # Sanity checks on the remaining entries.
    assert len(
        index
    ) > 1, "model_index.json must contain at least one pipeline module"

    for mandatory in ("transformer", "scheduler", "vae"):
        if mandatory not in index:
            raise ValueError(
                f"model_index.json must contain a {mandatory} module")
    logger.info("Diffusers config passed sanity checks")

    # Every component model used by the pipeline, keyed by module name.
    loaded: dict[str, Any] = {}
    for name, (library, _architecture) in index.items():
        component_path = os.path.join(self.model_path, name)
        component = PipelineComponentLoader.load_module(
            module_name=name,
            component_model_path=component_path,
            transformers_or_diffusers=library,
            fastvideo_args=fastvideo_args,
        )
        logger.info("Loaded module %s from %s", name, component_path)

        if name in loaded:
            logger.warning("Overwriting module %s", name)
        loaded[name] = component

    # Every module the pipeline declares as required must have been loaded.
    for needed in self.required_config_modules:
        if loaded.get(needed) is None:
            raise ValueError(
                f"Required module {needed} was not loaded properly")

    return loaded

Functions¶

fastvideo.pipelines.basic.wan ¶

Modules¶

fastvideo.pipelines.basic.wan.wan_causal_dmd_pipeline ¶

Wan causal DMD pipeline implementation.

This module wires the causal DMD denoising stage into the modular pipeline.

Classes¶

fastvideo.pipelines.basic.wan.wan_causal_dmd_pipeline.WanCausalDMDPipeline ¶

WanCausalDMDPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """Initialize the base pipeline, then configure LoRA state.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer. Afterwards this method:

    1. Stores the local torch device on ``self.device``.
    2. Registers every entry of ``trainable_transformer_names`` that is
       present (and not ``None``) in ``self.modules``, together with its
       ``<name>_2`` sibling when one is loaded.
    3. Copies each registered module's ``exclude_lora_layers`` setting.
    4. Calls ``convert_to_lora_layers()`` for LoRA training, or for
       inference when ``lora_path`` is set (followed by
       ``set_lora_adapter``).

    Raises:
        ValueError: If a configured trainable transformer name ends in
            ``_2``.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Collect the transformers that LoRA may be applied to.
    trainable = self.trainable_transformer_modules
    for primary_name in self.trainable_transformer_names:
        if primary_name in self.modules:
            primary_module = self.modules[primary_name]
            if primary_module is not None:
                trainable[primary_name] = primary_module

        # The "_2" sibling (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so it must not be listed explicitly.
        if primary_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {primary_name}"
            )

        sibling_name = primary_name + "_2"
        if sibling_name in self.modules:
            sibling_module = self.modules[sibling_name]
            if sibling_module is not None:
                trainable[sibling_name] = sibling_module

    logger.info("trainable_transformer_modules: %s", trainable.keys())

    for registered_name, registered_module in trainable.items():
        arch_config = registered_module.config.arch_config
        self.exclude_lora_layers[
            registered_name] = arch_config.exclude_lora_layers

    pipeline_args = self.fastvideo_args
    self.lora_target_modules = pipeline_args.lora_target_modules
    self.lora_path = pipeline_args.lora_path
    self.lora_nickname = pipeline_args.lora_nickname
    self.training_mode = pipeline_args.training_mode

    wants_lora_training = self.training_mode and getattr(
        pipeline_args, "lora_training", False)
    if wants_lora_training:
        assert isinstance(pipeline_args, TrainingArgs)
        # Alpha falls back to the rank when it was not given.
        if pipeline_args.lora_alpha is None:
            pipeline_args.lora_alpha = pipeline_args.lora_rank
        self.lora_rank = pipeline_args.lora_rank  # type: ignore
        self.lora_alpha = pipeline_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
        return

    # Inference
    if not self.training_mode and self.lora_path is not None:
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.wan.wan_causal_dmd_pipeline.WanCausalDMDPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs) -> None

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/wan/wan_causal_dmd_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs) -> None:
    """Build and register the Wan causal DMD pipeline stages.

    Stages are added one after another: input validation, prompt encoding,
    conditioning, latent preparation, causal DMD denoising and decoding.
    ``transformer_2`` is looked up with a ``None`` default, as is the
    transformer passed to latent preparation. ``fastvideo_args`` is not
    read by this method.
    """
    validation = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage", stage=validation)

    prompt_encoding = TextEncodingStage(
        text_encoders=[self.get_module("text_encoder")],
        tokenizers=[self.get_module("tokenizer")],
    )
    self.add_stage(stage_name="prompt_encoding_stage",
                   stage=prompt_encoding)

    conditioning = ConditioningStage()
    self.add_stage(stage_name="conditioning_stage", stage=conditioning)

    latent_preparation = LatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer", None),
    )
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=latent_preparation)

    denoising = CausalDMDDenosingStage(
        transformer=self.get_module("transformer"),
        transformer_2=self.get_module("transformer_2", None),
        scheduler=self.get_module("scheduler"),
        vae=self.get_module("vae"),
    )
    self.add_stage(stage_name="denoising_stage", stage=denoising)

    decoding = DecodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="decoding_stage", stage=decoding)

Functions¶

fastvideo.pipelines.basic.wan.wan_dmd_pipeline ¶

Wan video diffusion pipeline implementation.

This module contains an implementation of the Wan video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.wan.wan_dmd_pipeline.WanDMDPipeline ¶

WanDMDPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Wan video diffusion pipeline with LoRA support.

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """Initialize the base pipeline, then configure LoRA state.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer. Afterwards this method:

    1. Stores the local torch device on ``self.device``.
    2. Registers every entry of ``trainable_transformer_names`` that is
       present (and not ``None``) in ``self.modules``, together with its
       ``<name>_2`` sibling when one is loaded.
    3. Copies each registered module's ``exclude_lora_layers`` setting.
    4. Calls ``convert_to_lora_layers()`` for LoRA training, or for
       inference when ``lora_path`` is set (followed by
       ``set_lora_adapter``).

    Raises:
        ValueError: If a configured trainable transformer name ends in
            ``_2``.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Collect the transformers that LoRA may be applied to.
    trainable = self.trainable_transformer_modules
    for primary_name in self.trainable_transformer_names:
        if primary_name in self.modules:
            primary_module = self.modules[primary_name]
            if primary_module is not None:
                trainable[primary_name] = primary_module

        # The "_2" sibling (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so it must not be listed explicitly.
        if primary_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {primary_name}"
            )

        sibling_name = primary_name + "_2"
        if sibling_name in self.modules:
            sibling_module = self.modules[sibling_name]
            if sibling_module is not None:
                trainable[sibling_name] = sibling_module

    logger.info("trainable_transformer_modules: %s", trainable.keys())

    for registered_name, registered_module in trainable.items():
        arch_config = registered_module.config.arch_config
        self.exclude_lora_layers[
            registered_name] = arch_config.exclude_lora_layers

    pipeline_args = self.fastvideo_args
    self.lora_target_modules = pipeline_args.lora_target_modules
    self.lora_path = pipeline_args.lora_path
    self.lora_nickname = pipeline_args.lora_nickname
    self.training_mode = pipeline_args.training_mode

    wants_lora_training = self.training_mode and getattr(
        pipeline_args, "lora_training", False)
    if wants_lora_training:
        assert isinstance(pipeline_args, TrainingArgs)
        # Alpha falls back to the rank when it was not given.
        if pipeline_args.lora_alpha is None:
            pipeline_args.lora_alpha = pipeline_args.lora_rank
        self.lora_rank = pipeline_args.lora_rank  # type: ignore
        self.lora_alpha = pipeline_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
        return

    # Inference
    if not self.training_mode and self.lora_path is not None:
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.wan.wan_dmd_pipeline.WanDMDPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs) -> None

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/wan/wan_dmd_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs) -> None:
    """Build and register the Wan DMD pipeline stages.

    Stages are added one after another: input validation, prompt encoding,
    conditioning, timestep preparation, latent preparation (constructed
    with ``use_btchw_layout=True``), DMD denoising and decoding.
    ``fastvideo_args`` is not read by this method.
    """
    validation = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage", stage=validation)

    prompt_encoding = TextEncodingStage(
        text_encoders=[self.get_module("text_encoder")],
        tokenizers=[self.get_module("tokenizer")],
    )
    self.add_stage(stage_name="prompt_encoding_stage",
                   stage=prompt_encoding)

    conditioning = ConditioningStage()
    self.add_stage(stage_name="conditioning_stage", stage=conditioning)

    timestep_preparation = TimestepPreparationStage(
        scheduler=self.get_module("scheduler"))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=timestep_preparation)

    latent_preparation = LatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer", None),
        use_btchw_layout=True,
    )
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=latent_preparation)

    denoising = DmdDenoisingStage(
        transformer=self.get_module("transformer"),
        scheduler=self.get_module("scheduler"),
    )
    self.add_stage(stage_name="denoising_stage", stage=denoising)

    decoding = DecodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="decoding_stage", stage=decoding)

Functions¶

fastvideo.pipelines.basic.wan.wan_i2v_dmd_pipeline ¶

Wan video diffusion pipeline implementation.

This module contains an implementation of the Wan video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.wan.wan_i2v_dmd_pipeline.WanImageToVideoDmdPipeline ¶

WanImageToVideoDmdPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """Initialize the base pipeline, then configure LoRA state.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer. Afterwards this method:

    1. Stores the local torch device on ``self.device``.
    2. Registers every entry of ``trainable_transformer_names`` that is
       present (and not ``None``) in ``self.modules``, together with its
       ``<name>_2`` sibling when one is loaded.
    3. Copies each registered module's ``exclude_lora_layers`` setting.
    4. Calls ``convert_to_lora_layers()`` for LoRA training, or for
       inference when ``lora_path`` is set (followed by
       ``set_lora_adapter``).

    Raises:
        ValueError: If a configured trainable transformer name ends in
            ``_2``.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Collect the transformers that LoRA may be applied to.
    trainable = self.trainable_transformer_modules
    for primary_name in self.trainable_transformer_names:
        if primary_name in self.modules:
            primary_module = self.modules[primary_name]
            if primary_module is not None:
                trainable[primary_name] = primary_module

        # The "_2" sibling (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so it must not be listed explicitly.
        if primary_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {primary_name}"
            )

        sibling_name = primary_name + "_2"
        if sibling_name in self.modules:
            sibling_module = self.modules[sibling_name]
            if sibling_module is not None:
                trainable[sibling_name] = sibling_module

    logger.info("trainable_transformer_modules: %s", trainable.keys())

    for registered_name, registered_module in trainable.items():
        arch_config = registered_module.config.arch_config
        self.exclude_lora_layers[
            registered_name] = arch_config.exclude_lora_layers

    pipeline_args = self.fastvideo_args
    self.lora_target_modules = pipeline_args.lora_target_modules
    self.lora_path = pipeline_args.lora_path
    self.lora_nickname = pipeline_args.lora_nickname
    self.training_mode = pipeline_args.training_mode

    wants_lora_training = self.training_mode and getattr(
        pipeline_args, "lora_training", False)
    if wants_lora_training:
        assert isinstance(pipeline_args, TrainingArgs)
        # Alpha falls back to the rank when it was not given.
        if pipeline_args.lora_alpha is None:
            pipeline_args.lora_alpha = pipeline_args.lora_rank
        self.lora_rank = pipeline_args.lora_rank  # type: ignore
        self.lora_alpha = pipeline_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
        return

    # Inference
    if not self.training_mode and self.lora_path is not None:
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.wan.wan_i2v_dmd_pipeline.WanImageToVideoDmdPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/wan/wan_i2v_dmd_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Build and register the Wan image-to-video DMD pipeline stages.

    Stages are added one after another: input validation, prompt encoding,
    image encoding, conditioning, timestep preparation, latent preparation
    (constructed with ``use_btchw_layout=True``), image VAE encoding, DMD
    denoising and decoding. ``fastvideo_args`` is not read by this method.
    """
    validation = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage", stage=validation)

    prompt_encoding = TextEncodingStage(
        text_encoders=[self.get_module("text_encoder")],
        tokenizers=[self.get_module("tokenizer")],
    )
    self.add_stage(stage_name="prompt_encoding_stage",
                   stage=prompt_encoding)

    image_encoding = ImageEncodingStage(
        image_encoder=self.get_module("image_encoder"),
        image_processor=self.get_module("image_processor"),
    )
    self.add_stage(stage_name="image_encoding_stage", stage=image_encoding)

    conditioning = ConditioningStage()
    self.add_stage(stage_name="conditioning_stage", stage=conditioning)

    timestep_preparation = TimestepPreparationStage(
        scheduler=self.get_module("scheduler"))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=timestep_preparation)

    latent_preparation = LatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer"),
        use_btchw_layout=True,
    )
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=latent_preparation)

    image_latent_preparation = ImageVAEEncodingStage(
        vae=self.get_module("vae"))
    self.add_stage(stage_name="image_latent_preparation_stage",
                   stage=image_latent_preparation)

    denoising = DmdDenoisingStage(
        transformer=self.get_module("transformer"),
        scheduler=self.get_module("scheduler"),
    )
    self.add_stage(stage_name="denoising_stage", stage=denoising)

    decoding = DecodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="decoding_stage", stage=decoding)

Functions¶

fastvideo.pipelines.basic.wan.wan_i2v_pipeline ¶

Wan video diffusion pipeline implementation.

This module contains an implementation of the Wan video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.wan.wan_i2v_pipeline.WanImageToVideoPipeline ¶

WanImageToVideoPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """Initialize the base pipeline, then configure LoRA state.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer. Afterwards this method:

    1. Stores the local torch device on ``self.device``.
    2. Registers every entry of ``trainable_transformer_names`` that is
       present (and not ``None``) in ``self.modules``, together with its
       ``<name>_2`` sibling when one is loaded.
    3. Copies each registered module's ``exclude_lora_layers`` setting.
    4. Calls ``convert_to_lora_layers()`` for LoRA training, or for
       inference when ``lora_path`` is set (followed by
       ``set_lora_adapter``).

    Raises:
        ValueError: If a configured trainable transformer name ends in
            ``_2``.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Collect the transformers that LoRA may be applied to.
    trainable = self.trainable_transformer_modules
    for primary_name in self.trainable_transformer_names:
        if primary_name in self.modules:
            primary_module = self.modules[primary_name]
            if primary_module is not None:
                trainable[primary_name] = primary_module

        # The "_2" sibling (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so it must not be listed explicitly.
        if primary_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {primary_name}"
            )

        sibling_name = primary_name + "_2"
        if sibling_name in self.modules:
            sibling_module = self.modules[sibling_name]
            if sibling_module is not None:
                trainable[sibling_name] = sibling_module

    logger.info("trainable_transformer_modules: %s", trainable.keys())

    for registered_name, registered_module in trainable.items():
        arch_config = registered_module.config.arch_config
        self.exclude_lora_layers[
            registered_name] = arch_config.exclude_lora_layers

    pipeline_args = self.fastvideo_args
    self.lora_target_modules = pipeline_args.lora_target_modules
    self.lora_path = pipeline_args.lora_path
    self.lora_nickname = pipeline_args.lora_nickname
    self.training_mode = pipeline_args.training_mode

    wants_lora_training = self.training_mode and getattr(
        pipeline_args, "lora_training", False)
    if wants_lora_training:
        assert isinstance(pipeline_args, TrainingArgs)
        # Alpha falls back to the rank when it was not given.
        if pipeline_args.lora_alpha is None:
            pipeline_args.lora_alpha = pipeline_args.lora_rank
        self.lora_rank = pipeline_args.lora_rank  # type: ignore
        self.lora_alpha = pipeline_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)
        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
        return

    # Inference
    if not self.training_mode and self.lora_path is not None:
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.wan.wan_i2v_pipeline.WanImageToVideoPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/wan/wan_i2v_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Build and register the Wan image-to-video pipeline stages.

    Stages are added one after another: input validation, prompt encoding,
    image encoding (only when both ``image_encoder`` and
    ``image_processor`` modules are not ``None``), conditioning, timestep
    preparation, latent preparation, image VAE encoding, denoising and
    decoding. ``fastvideo_args`` is not read by this method.
    """
    validation = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage", stage=validation)

    prompt_encoding = TextEncodingStage(
        text_encoders=[self.get_module("text_encoder")],
        tokenizers=[self.get_module("tokenizer")],
    )
    self.add_stage(stage_name="prompt_encoding_stage",
                   stage=prompt_encoding)

    # The image encoding stage is skipped unless both modules are available.
    if self.get_module("image_encoder") is not None:
        if self.get_module("image_processor") is not None:
            image_encoding = ImageEncodingStage(
                image_encoder=self.get_module("image_encoder"),
                image_processor=self.get_module("image_processor"),
            )
            self.add_stage(stage_name="image_encoding_stage",
                           stage=image_encoding)

    conditioning = ConditioningStage()
    self.add_stage(stage_name="conditioning_stage", stage=conditioning)

    timestep_preparation = TimestepPreparationStage(
        scheduler=self.get_module("scheduler"))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=timestep_preparation)

    latent_preparation = LatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer"),
    )
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=latent_preparation)

    image_latent_preparation = ImageVAEEncodingStage(
        vae=self.get_module("vae"))
    self.add_stage(stage_name="image_latent_preparation_stage",
                   stage=image_latent_preparation)

    denoising = DenoisingStage(
        transformer=self.get_module("transformer"),
        transformer_2=self.get_module("transformer_2"),
        scheduler=self.get_module("scheduler"),
    )
    self.add_stage(stage_name="denoising_stage", stage=denoising)

    decoding = DecodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="decoding_stage", stage=decoding)

Functions¶

fastvideo.pipelines.basic.wan.wan_pipeline ¶

Wan video diffusion pipeline implementation.

This module contains an implementation of the Wan video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.wan.wan_pipeline.WanPipeline ¶

WanPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Wan video diffusion pipeline with LoRA support.

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """
    Build the base pipeline, collect the trainable transformers, and set up
    LoRA when the arguments request it.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor. LoRA layers are injected in two cases:

    * training mode with ``lora_training`` set on the arguments, using
      ``lora_rank`` / ``lora_alpha`` from the arguments (``lora_alpha``
      falls back to ``lora_rank`` when it is ``None``);
    * inference mode with a ``lora_path`` configured, in which case the
      adapter is also loaded through ``set_lora_adapter``.

    Raises:
        ValueError: If an entry of ``trainable_transformer_names`` ends with
            ``_2``; the ``_2`` companion is looked up automatically.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Register each configured transformer that was actually loaded, plus
    # its "_2" companion when one exists.
    loaded = self.modules
    trainable = self.trainable_transformer_modules
    for transformer_name in self.trainable_transformer_names:
        if loaded.get(transformer_name) is not None:
            trainable[transformer_name] = loaded[transformer_name]

        # The companion (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so the configured name itself must not carry the
        # suffix.
        if transformer_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {transformer_name}"
            )

        companion_name = transformer_name + "_2"
        if loaded.get(companion_name) is not None:
            trainable[companion_name] = loaded[companion_name]

    logger.info("trainable_transformer_modules: %s",
                self.trainable_transformer_modules.keys())

    # Remember, per transformer, which layers its architecture config
    # excludes from LoRA conversion.
    for transformer_name, transformer_module in self.trainable_transformer_modules.items(
    ):
        arch_config = transformer_module.config.arch_config
        self.exclude_lora_layers[
            transformer_name] = arch_config.exclude_lora_layers

    fv_args = self.fastvideo_args
    self.lora_target_modules = fv_args.lora_target_modules
    self.lora_path = fv_args.lora_path
    self.lora_nickname = fv_args.lora_nickname
    self.training_mode = fv_args.training_mode

    if self.training_mode and getattr(fv_args, "lora_training", False):
        # LoRA training
        assert isinstance(fv_args, TrainingArgs)
        if fv_args.lora_alpha is None:
            fv_args.lora_alpha = fv_args.lora_rank
        self.lora_rank = fv_args.lora_rank  # type: ignore
        self.lora_alpha = fv_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)

        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
    elif not self.training_mode and self.lora_path is not None:
        # Inference with a configured adapter
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.wan.wan_pipeline.WanPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs) -> None

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/wan/wan_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs) -> None:
    """
    Register the Wan text-to-video stages on this pipeline.

    Stages are added in this order: input validation, prompt encoding,
    conditioning, timestep preparation, latent preparation, denoising and
    decoding. Each stage is handed the modules it needs via
    ``self.get_module``. ``fastvideo_args`` is not read here.
    """
    validation_stage = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage",
                   stage=validation_stage)

    prompt_stage = TextEncodingStage(
        text_encoders=[self.get_module("text_encoder")],
        tokenizers=[self.get_module("tokenizer")],
    )
    self.add_stage(stage_name="prompt_encoding_stage", stage=prompt_stage)

    conditioning_stage = ConditioningStage()
    self.add_stage(stage_name="conditioning_stage",
                   stage=conditioning_stage)

    timestep_stage = TimestepPreparationStage(
        scheduler=self.get_module("scheduler"))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=timestep_stage)

    latent_stage = LatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer", None),
    )
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=latent_stage)

    # transformer_2 is requested with an explicit None fallback.
    denoising_stage = DenoisingStage(
        transformer=self.get_module("transformer"),
        transformer_2=self.get_module("transformer_2", None),
        scheduler=self.get_module("scheduler"),
        vae=self.get_module("vae"),
        pipeline=self,
    )
    self.add_stage(stage_name="denoising_stage", stage=denoising_stage)

    decoding_stage = DecodingStage(vae=self.get_module("vae"),
                                   pipeline=self)
    self.add_stage(stage_name="decoding_stage", stage=decoding_stage)

Functions¶

fastvideo.pipelines.basic.wan.wan_v2v_pipeline ¶

Wan video-to-video diffusion pipeline implementation.

This module contains an implementation of the Wan video-to-video diffusion pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.basic.wan.wan_v2v_pipeline.WanVideoToVideoPipeline ¶

WanVideoToVideoPipeline(*args, **kwargs)

Bases: LoRAPipeline, ComposedPipelineBase

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """
    Build the base pipeline, collect the trainable transformers, and set up
    LoRA when the arguments request it.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor. LoRA layers are injected in two cases:

    * training mode with ``lora_training`` set on the arguments, using
      ``lora_rank`` / ``lora_alpha`` from the arguments (``lora_alpha``
      falls back to ``lora_rank`` when it is ``None``);
    * inference mode with a ``lora_path`` configured, in which case the
      adapter is also loaded through ``set_lora_adapter``.

    Raises:
        ValueError: If an entry of ``trainable_transformer_names`` ends with
            ``_2``; the ``_2`` companion is looked up automatically.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Register each configured transformer that was actually loaded, plus
    # its "_2" companion when one exists.
    loaded = self.modules
    trainable = self.trainable_transformer_modules
    for transformer_name in self.trainable_transformer_names:
        if loaded.get(transformer_name) is not None:
            trainable[transformer_name] = loaded[transformer_name]

        # The companion (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so the configured name itself must not carry the
        # suffix.
        if transformer_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {transformer_name}"
            )

        companion_name = transformer_name + "_2"
        if loaded.get(companion_name) is not None:
            trainable[companion_name] = loaded[companion_name]

    logger.info("trainable_transformer_modules: %s",
                self.trainable_transformer_modules.keys())

    # Remember, per transformer, which layers its architecture config
    # excludes from LoRA conversion.
    for transformer_name, transformer_module in self.trainable_transformer_modules.items(
    ):
        arch_config = transformer_module.config.arch_config
        self.exclude_lora_layers[
            transformer_name] = arch_config.exclude_lora_layers

    fv_args = self.fastvideo_args
    self.lora_target_modules = fv_args.lora_target_modules
    self.lora_path = fv_args.lora_path
    self.lora_nickname = fv_args.lora_nickname
    self.training_mode = fv_args.training_mode

    if self.training_mode and getattr(fv_args, "lora_training", False):
        # LoRA training
        assert isinstance(fv_args, TrainingArgs)
        if fv_args.lora_alpha is None:
            fv_args.lora_alpha = fv_args.lora_rank
        self.lora_rank = fv_args.lora_rank  # type: ignore
        self.lora_alpha = fv_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)

        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
    elif not self.training_mode and self.lora_path is not None:
        # Inference with a configured adapter
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.basic.wan.wan_v2v_pipeline.WanVideoToVideoPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/basic/wan/wan_v2v_pipeline.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """
    Register the Wan video-to-video stages on this pipeline.

    Stages are added in this order: input validation, prompt encoding,
    reference-image encoding (only when both the ``image_encoder`` and
    ``image_processor`` modules are available), conditioning, timestep
    preparation, latent preparation, video latent preparation, denoising
    and decoding. ``fastvideo_args`` is not read here.
    """
    validation_stage = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage",
                   stage=validation_stage)

    prompt_stage = TextEncodingStage(
        text_encoders=[self.get_module("text_encoder")],
        tokenizers=[self.get_module("tokenizer")],
    )
    self.add_stage(stage_name="prompt_encoding_stage", stage=prompt_stage)

    # The reference-image stage is optional: it needs both image modules.
    has_image_modules = (self.get_module("image_encoder") is not None
                         and self.get_module("image_processor") is not None)
    if has_image_modules:
        ref_image_stage = RefImageEncodingStage(
            image_encoder=self.get_module("image_encoder"),
            image_processor=self.get_module("image_processor"),
        )
        self.add_stage(stage_name="ref_image_encoding_stage",
                       stage=ref_image_stage)

    conditioning_stage = ConditioningStage()
    self.add_stage(stage_name="conditioning_stage",
                   stage=conditioning_stage)

    timestep_stage = TimestepPreparationStage(
        scheduler=self.get_module("scheduler"))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=timestep_stage)

    latent_stage = LatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer"),
    )
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=latent_stage)

    video_latent_stage = VideoVAEEncodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="video_latent_preparation_stage",
                   stage=video_latent_stage)

    denoising_stage = DenoisingStage(
        transformer=self.get_module("transformer"),
        transformer_2=self.get_module("transformer_2"),
        scheduler=self.get_module("scheduler"),
    )
    self.add_stage(stage_name="denoising_stage", stage=denoising_stage)

    decoding_stage = DecodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="decoding_stage", stage=decoding_stage)

Functions¶

fastvideo.pipelines.composed_pipeline_base ¶

Base class for composed pipelines.

This module defines the base class for pipelines that are composed of multiple stages.

Classes¶

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase ¶

ComposedPipelineBase(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: ABC

Base class for pipelines composed of multiple stages.

This class provides the framework for creating pipelines by composing multiple stages together. Each stage is responsible for a specific part of the diffusion process, and the pipeline orchestrates the execution of these stages.

Initialize the pipeline. After init, the pipeline should be ready to use. The pipeline should be stateless and not hold any batch state.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Set up the pipeline and load its modules.

    Once construction finishes the pipeline is meant to be ready for use;
    it should stay stateless and never hold per-batch state.

    Args:
        model_path: Location of the model; stored as ``self.model_path``.
        fastvideo_args: Arguments for this pipeline. ``tp_size`` and
            ``sp_size`` are read for the distributed setup, and the object
            is passed on to ``load_modules``.
        required_config_modules: Optional override for the list of required
            module names. When omitted, ``_required_config_modules`` must
            already be set (e.g. by the subclass).
        loaded_modules: Optional pre-built modules passed through to
            ``load_modules``.

    Raises:
        NotImplementedError: If no required-module list is available.
    """
    # Plain bookkeeping state.
    self.model_path: str = model_path
    self.fastvideo_args = fastvideo_args
    self._stage_name_mapping: dict[str, PipelineStage] = {}
    self._stages: list[PipelineStage] = []

    # An explicit list from the caller wins over the subclass default.
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    tp_size = fastvideo_args.tp_size
    sp_size = fastvideo_args.sp_size
    maybe_init_distributed_environment_and_model_parallel(tp_size, sp_size)

    # The torch profiler is enabled and configured through the environment:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    self.profiler_controller = get_or_create_profiler(
        envs.FASTVIDEO_TORCH_PROFILER_DIR)
    self.profiler = self.profiler_controller.profiler

    self.local_rank = get_world_group().local_rank

    # Modules are loaded eagerly, inside a dedicated profiler region.
    logger.info("Loading pipeline modules...")
    with self.profiler_controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Attributes¶

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.required_config_modules property ¶

required_config_modules: list[str]

List of modules that are required by the pipeline. The names should match the diffusers directory and model_index.json file. These modules will be loaded using the PipelineComponentLoader and made available in the modules dictionary. Access these modules using the get_module method.

For example, a subclass declares them as a class attribute: class ConcretePipeline(ComposedPipelineBase): _required_config_modules = ["vae", "text_encoder", "transformer", "scheduler", "tokenizer"]

@property
def required_config_modules(self):
    """Return the list stored in ``_required_config_modules`` (the same object, not a copy)."""
    required = self._required_config_modules
    return required

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.stages property ¶

stages: list[PipelineStage]

List of stages in the pipeline.

Functions¶

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.create_pipeline_stages abstractmethod ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Create the inference pipeline stages.

Source code in fastvideo/pipelines/composed_pipeline_base.py

@abstractmethod
def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """
    Create the inference pipeline stages.

    Abstract hook: the base implementation only raises.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.create_training_stages ¶

create_training_stages(training_args: TrainingArgs)

Create the training pipeline stages.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def create_training_stages(self, training_args: TrainingArgs):
    """
    Create the training pipeline stages.

    The base implementation only raises.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Generate a video or image using the pipeline.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The batch to generate from.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns: ForwardBatch: The batch with the generated video or image.

Source code in fastvideo/pipelines/composed_pipeline_base.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Generate a video or image by running every stage in turn.

    Runs with gradient tracking disabled. Each stage is called with the
    batch returned by the previous stage.

    Args:
        batch: The batch to generate from.
        fastvideo_args: The inference arguments, passed to every stage.
    Returns:
        ForwardBatch: The batch returned by the last stage.
    """
    # Finish initialization on first use if it has not happened yet.
    if not self.post_init_called:
        self.post_init()

    logger.info("Running pipeline stages: %s",
                self._stage_name_mapping.keys())

    current = batch
    for run_stage in self.stages:
        current = run_stage(current, fastvideo_args)
    return current

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.from_pretrained classmethod ¶

from_pretrained(model_path: str, device: str | None = None, torch_dtype: dtype | None = None, pipeline_config: str | PipelineConfig | None = None, args: Namespace | None = None, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None, **kwargs) -> ComposedPipelineBase

Load a pipeline from a pretrained model. If loaded_modules (a dict[str, torch.nn.Module], default None) is provided, those modules are used instead of being loaded from config/pretrained weights.

Source code in fastvideo/pipelines/composed_pipeline_base.py

@classmethod
def from_pretrained(cls,
                    model_path: str,
                    device: str | None = None,
                    torch_dtype: torch.dtype | None = None,
                    pipeline_config: str | PipelineConfig | None = None,
                    args: argparse.Namespace | None = None,
                    required_config_modules: list[str] | None = None,
                    loaded_modules: dict[str, torch.nn.Module]
                    | None = None,
                    **kwargs) -> "ComposedPipelineBase":
    """
    Load a pipeline from a pretrained model.

    Inference is assumed when ``args`` is ``None`` or
    ``args.inference_mode`` is truthy: the arguments are then built from
    ``kwargs`` (plus ``model_path``). Otherwise training arguments are built
    from ``args`` and every entry of ``kwargs`` is set on them as an
    attribute.

    Args:
        model_path: Location of the pretrained model.
        device: Not read by this method.
        torch_dtype: Not read by this method.
        pipeline_config: Not read by this method.
        args: Parsed CLI arguments; required for training mode.
        required_config_modules: Forwarded to the constructor.
        loaded_modules: If provided, these modules are used instead of
            loading from config/pretrained weights.
        **kwargs: Extra argument overrides (see above).

    Returns:
        The constructed pipeline, after ``post_init()`` has been called.
    """
    training = args is not None and not args.inference_mode
    if training:
        assert args is not None, "args must be provided for training mode"
        fastvideo_args = TrainingArgs.from_cli_args(args)
        # TODO(will): fix this so that its not so ugly
        fastvideo_args.model_path = model_path
        for override_name, override_value in kwargs.items():
            setattr(fastvideo_args, override_name, override_value)

        fastvideo_args.dit_cpu_offload = False
        # The precision is hijacked to be the master weight type so the
        # model is loaded with the correct precision. FSDP2's
        # MixedPrecisionPolicy later sets the precision for the fwd, bwd,
        # and other operations.
        assert fastvideo_args.pipeline_config.dit_precision == 'fp32', 'only fp32 is supported for training'
    else:
        kwargs['model_path'] = model_path
        fastvideo_args = FastVideoArgs.from_kwargs(**kwargs)

    logger.info("fastvideo_args in from_pretrained: %s", fastvideo_args)

    pipeline = cls(model_path,
                   fastvideo_args,
                   required_config_modules=required_config_modules,
                   loaded_modules=loaded_modules)
    pipeline.post_init()
    return pipeline

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.initialize_pipeline ¶

initialize_pipeline(fastvideo_args: FastVideoArgs)

Initialize the pipeline.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
    """
    Initialize the pipeline.

    The base implementation does nothing and returns ``None``.
    """
    return None

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase.load_modules ¶

load_modules(fastvideo_args: FastVideoArgs, loaded_modules: dict[str, Module] | None = None) -> dict[str, Any]

Load the modules from the config. If loaded_modules (a dict[str, torch.nn.Module], default None) is provided, those modules are used instead of being loaded from config/pretrained weights.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def load_modules(
    self,
    fastvideo_args: FastVideoArgs,
    loaded_modules: dict[str, torch.nn.Module] | None = None
) -> dict[str, Any]:
    """
    Load the pipeline's component modules as described by the model config.

    The config returned by ``self._load_config(self.model_path)`` is, once
    its non-module keys are stripped, treated as a mapping from module name
    to a ``(library, architecture)`` pair.

    Args:
        fastvideo_args: Arguments handed to the component loader. When the
            config carries a non-null ``boundary_ratio`` it is also written
            to ``fastvideo_args.pipeline_config.dit_config.boundary_ratio``.
        loaded_modules: Optional pre-built modules keyed by module name. A
            module found here is used as-is instead of being loaded from
            ``self.model_path``.

    Returns:
        A dict mapping each loaded module name to its module.

    Raises:
        KeyError: If the config lacks ``_class_name`` or
            ``_diffusers_version``.
        ValueError: If a required module cannot be resolved through
            ``_extra_config_module_map`` or was not loaded.
    """

    model_index = self._load_config(self.model_path)
    logger.info("Loading pipeline modules from config: %s", model_index)

    # The required-module list is edited below (MoE adds "transformer_2",
    # null config entries are removed). It may be a class attribute or the
    # caller's own list, so take a per-instance copy first; otherwise one
    # pipeline's edits leak into every later instance of the class.
    self._required_config_modules = list(self.required_config_modules)
    required_modules = self.required_config_modules

    # remove keys that are not pipeline modules
    model_index.pop("_class_name")
    model_index.pop("_diffusers_version")
    model_index.pop("workload_type", None)
    if "boundary_ratio" in model_index and model_index[
            "boundary_ratio"] is not None:
        logger.info(
            "MoE pipeline detected. Adding transformer_2 to self.required_config_modules..."
        )
        # Guard against duplicates if this method runs more than once.
        if "transformer_2" not in required_modules:
            required_modules.append("transformer_2")
        logger.info("MoE pipeline detected. Setting boundary ratio to %s",
                    model_index["boundary_ratio"])
        fastvideo_args.pipeline_config.dit_config.boundary_ratio = model_index[
            "boundary_ratio"]

    model_index.pop("boundary_ratio", None)
    # used by Wan2.2 ti2v
    model_index.pop("expand_timesteps", None)

    # some sanity checks
    # NOTE(review): the condition requires at least two entries while the
    # message says "at least one" -- confirm which is intended.
    assert len(
        model_index
    ) > 1, "model_index.json must contain at least one pipeline module"

    # Required modules missing from the config may be aliased to another
    # config entry through _extra_config_module_map.
    for module_name in required_modules:
        if module_name not in model_index and module_name in self._extra_config_module_map:
            extra_module_value = self._extra_config_module_map[module_name]
            logger.warning(
                "model_index.json does not contain a %s module, but found {%s: %s} in _extra_config_module_map, adding to model_index.",
                module_name, module_name, extra_module_value)
            if extra_module_value not in model_index:
                raise ValueError(
                    f"Required module key: {module_name} value: {model_index.get(module_name)} was not found in loaded modules {model_index.keys()}"
                )
            logger.info("Using module %s for %s", extra_module_value,
                        module_name)
            model_index[module_name] = model_index[extra_module_value]

    # all the component models used by the pipeline
    logger.info("Loading required modules: %s", required_modules)

    modules = {}
    for module_name, (transformers_or_diffusers,
                      _architecture) in model_index.items():
        if transformers_or_diffusers is None:
            # A null entry means the module is absent for this model, so it
            # must not be demanded by the final completeness check.
            logger.warning(
                "Module %s in model_index.json has null value, removing from required_config_modules",
                module_name)
            if module_name in required_modules:
                required_modules.remove(module_name)
            continue
        if module_name not in required_modules:
            logger.info("Skipping module %s", module_name)
            continue
        if loaded_modules is not None and module_name in loaded_modules:
            logger.info("Using module %s already provided", module_name)
            modules[module_name] = loaded_modules[module_name]
            continue

        # we load the module from the extra config module map if it exists
        if module_name in self._extra_config_module_map:
            load_module_name = self._extra_config_module_map[module_name]
        else:
            load_module_name = module_name

        component_model_path = os.path.join(self.model_path,
                                            load_module_name)
        module = PipelineComponentLoader.load_module(
            module_name=load_module_name,
            component_model_path=component_model_path,
            transformers_or_diffusers=transformers_or_diffusers,
            fastvideo_args=fastvideo_args,
        )
        logger.info("Loaded module %s from %s", module_name,
                    component_model_path)

        if module_name in modules:
            logger.warning("Overwriting module %s", module_name)
        modules[module_name] = module

    # Check if all required modules were loaded
    for module_name in required_modules:
        if module_name not in modules or modules[module_name] is None:
            raise ValueError(
                f"Required module key: {module_name} value: {modules.get(module_name)} was not found in loaded modules {modules.keys()}"
            )

    return modules

Functions¶

fastvideo.pipelines.lora_pipeline ¶

Classes¶

fastvideo.pipelines.lora_pipeline.LoRAPipeline ¶

LoRAPipeline(*args, **kwargs)

Bases: ComposedPipelineBase

Pipeline that supports injecting LoRA adapters into the diffusion transformer. TODO: support training.

Source code in fastvideo/pipelines/lora_pipeline.py

def __init__(self, *args, **kwargs) -> None:
    """
    Build the base pipeline, collect the trainable transformers, and set up
    LoRA when the arguments request it.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor. LoRA layers are injected in two cases:

    * training mode with ``lora_training`` set on the arguments, using
      ``lora_rank`` / ``lora_alpha`` from the arguments (``lora_alpha``
      falls back to ``lora_rank`` when it is ``None``);
    * inference mode with a ``lora_path`` configured, in which case the
      adapter is also loaded through ``set_lora_adapter``.

    Raises:
        ValueError: If an entry of ``trainable_transformer_names`` ends with
            ``_2``; the ``_2`` companion is looked up automatically.
    """
    super().__init__(*args, **kwargs)
    self.device = get_local_torch_device()

    # Register each configured transformer that was actually loaded, plus
    # its "_2" companion when one exists.
    loaded = self.modules
    trainable = self.trainable_transformer_modules
    for transformer_name in self.trainable_transformer_names:
        if loaded.get(transformer_name) is not None:
            trainable[transformer_name] = loaded[transformer_name]

        # The companion (e.g. Wan2.2 MoE or fake_score_transformer_2) is
        # derived below, so the configured name itself must not carry the
        # suffix.
        if transformer_name.endswith("_2"):
            raise ValueError(
                f"trainable_transformer_name override in pipelines should not include _2 suffix: {transformer_name}"
            )

        companion_name = transformer_name + "_2"
        if loaded.get(companion_name) is not None:
            trainable[companion_name] = loaded[companion_name]

    logger.info("trainable_transformer_modules: %s",
                self.trainable_transformer_modules.keys())

    # Remember, per transformer, which layers its architecture config
    # excludes from LoRA conversion.
    for transformer_name, transformer_module in self.trainable_transformer_modules.items(
    ):
        arch_config = transformer_module.config.arch_config
        self.exclude_lora_layers[
            transformer_name] = arch_config.exclude_lora_layers

    fv_args = self.fastvideo_args
    self.lora_target_modules = fv_args.lora_target_modules
    self.lora_path = fv_args.lora_path
    self.lora_nickname = fv_args.lora_nickname
    self.training_mode = fv_args.training_mode

    if self.training_mode and getattr(fv_args, "lora_training", False):
        # LoRA training
        assert isinstance(fv_args, TrainingArgs)
        if fv_args.lora_alpha is None:
            fv_args.lora_alpha = fv_args.lora_rank
        self.lora_rank = fv_args.lora_rank  # type: ignore
        self.lora_alpha = fv_args.lora_alpha  # type: ignore
        logger.info("Using LoRA training with rank %d and alpha %d",
                    self.lora_rank, self.lora_alpha)

        if self.lora_target_modules is not None:
            logger.warning(
                "Using custom lora_target_modules for all transformers, which may not be intended: %s",
                self.lora_target_modules)
        else:
            self.lora_target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj", "to_q", "to_k",
                "to_v", "to_out", "to_qkv", "to_gate_compress"
            ]
            logger.info(
                "Using default lora_target_modules for all transformers: %s",
                self.lora_target_modules)

        self.convert_to_lora_layers()
    elif not self.training_mode and self.lora_path is not None:
        # Inference with a configured adapter
        self.convert_to_lora_layers()
        self.set_lora_adapter(
            self.lora_nickname,  # type: ignore
            self.lora_path)  # type: ignore

Functions¶

fastvideo.pipelines.lora_pipeline.LoRAPipeline.convert_to_lora_layers ¶

convert_to_lora_layers() -> None

Unified method to convert the transformer to a LoRA transformer.

Source code in fastvideo/pipelines/lora_pipeline.py

def convert_to_lora_layers(self) -> None:
    """
    Replace the targeted layers of every trainable transformer with LoRA
    layers.

    The conversion runs at most once per pipeline: ``lora_initialized`` is
    set on the first call and later calls return immediately. A layer is
    converted when ``is_target_layer`` accepts its name, none of the
    transformer's ``exclude_lora_layers`` entries occurs in that name, and
    ``get_lora_layer`` returns a replacement. Converted layers are recorded
    in ``self.lora_layers[transformer_name][layer_name]``.
    """
    if self.lora_initialized:
        return
    self.lora_initialized = True

    for transformer_name, transformer_module in self.trainable_transformer_modules.items(
    ):
        num_converted = 0
        self.lora_layers.setdefault(transformer_name, {})
        logger.info("Converting %s to LoRA Transformer", transformer_name)

        for name, layer in transformer_module.named_modules():
            if not self.is_target_layer(name):
                continue

            # Skip layers whose name contains any excluded substring.
            if any(pattern in name
                   for pattern in self.exclude_lora_layers[transformer_name]):
                continue

            lora_layer = get_lora_layer(layer,
                                        lora_rank=self.lora_rank,
                                        lora_alpha=self.lora_alpha,
                                        training_mode=self.training_mode)
            if lora_layer is None:
                continue

            self.lora_layers[transformer_name][name] = lora_layer
            replace_submodule(transformer_module, name, lora_layer)
            num_converted += 1

        logger.info("Converted %d layers to LoRA layers", num_converted)

fastvideo.pipelines.lora_pipeline.LoRAPipeline.set_lora_adapter ¶

set_lora_adapter(lora_nickname: str, lora_path: str | None = None)

Load a LoRA adapter into the pipeline and merge it into the transformer. Arguments: lora_nickname is the nickname by which the adapter is referenced in the pipeline; lora_path is the path to the adapter, either a local path or a Hugging Face repo id.

Source code in fastvideo/pipelines/lora_pipeline.py

def set_lora_adapter(self,
                     lora_nickname: str,
                     lora_path: str | None = None):  # type: ignore
    """
    Load a LoRA adapter into the pipeline and merge it into the transformer.

    The call has two phases:
      1. If ``lora_path`` is given and differs from the path loaded last,
         the adapter file is read and its tensors are stored in
         ``self.lora_adapters[lora_nickname]`` under the pipeline's own
         layer names.
      2. The weights stored under ``lora_nickname`` are pushed into every
         recorded LoRA layer. Layers the adapter has no weights for get
         ``disable_lora = True``.

    Phase 2 is skipped when nothing new was loaded and ``lora_nickname`` is
    already the active adapter.

    Args:
        lora_nickname: The "nick name" of the adapter when referenced in the pipeline.
        lora_path: The path to the adapter, either a local path or a Hugging Face repo id.

    Raises:
        ValueError: If ``lora_nickname`` is unknown and no ``lora_path`` is
            given, or if two tensors in the adapter file map to the same
            target name.
    """

    if lora_nickname not in self.lora_adapters and lora_path is None:
        raise ValueError(
            f"Adapter {lora_nickname} not found in the pipeline. Please provide lora_path to load it."
        )
    # LoRA layers are created lazily on first use.
    if not self.lora_initialized:
        self.convert_to_lora_layers()
    adapter_updated = False
    rank = dist.get_rank()
    # Only (re)load when a path is given and it is not the one loaded last.
    if lora_path is not None and lora_path != self.cur_adapter_path:
        lora_local_path = maybe_download_lora(lora_path)
        lora_state_dict = load_file(lora_local_path)

        # Map the hf layer names to our custom layer names
        # NOTE: both mappings are taken from self.modules["transformer"].
        param_names_mapping_fn = get_param_names_mapping(
            self.modules["transformer"].param_names_mapping)
        lora_param_names_mapping_fn = get_param_names_mapping(
            self.modules["transformer"].lora_param_names_mapping)

        # Extract alpha values and weights in a single pass
        # to_merge_params buffers the lora_B pieces of fused layers:
        # {target_name: {merge_index: tensor}} until every piece has arrived.
        to_merge_params: defaultdict[Hashable,
                                     dict[Any, Any]] = defaultdict(dict)
        for name, weight in lora_state_dict.items():
            # Extract weights (lora_A, lora_B, and lora_alpha)
            name = name.replace("diffusion_model.", "")
            name = name.replace(".weight", "")

            if "lora_alpha" in name:
                # Store alpha with minimal mapping - same processing as lora_A/lora_B
                # but store in lora_adapters with ".lora_alpha" suffix
                layer_name = name.replace(".lora_alpha", "")
                layer_name, _, _ = lora_param_names_mapping_fn(layer_name)
                target_name, _, _ = param_names_mapping_fn(layer_name)
                # Store alpha alongside weights with same target_name base
                alpha_key = target_name + ".lora_alpha"
                # Alpha is kept as a Python number: the single element, or
                # the mean when the tensor holds several values.
                # NOTE(review): indexing lora_adapters[lora_nickname] without
                # creating it first presumably relies on a defaultdict - confirm.
                self.lora_adapters[lora_nickname][alpha_key] = weight.item(
                ) if weight.numel() == 1 else float(weight.mean())
                continue

            name, _, _ = lora_param_names_mapping_fn(name)
            target_name, merge_index, num_params_to_merge = param_names_mapping_fn(
                name)
            # for (in_dim, r) @ (r, out_dim), we only merge (r, out_dim * n) where n is the number of linear layers to fuse
            # see param mapping in HunyuanVideoArchConfig
            if merge_index is not None and "lora_B" in name:
                to_merge_params[target_name][merge_index] = weight
                if len(to_merge_params[target_name]) == num_params_to_merge:
                    # cat at output dim according to the merge_index order
                    sorted_tensors = [
                        to_merge_params[target_name][i]
                        for i in range(num_params_to_merge)
                    ]
                    weight = torch.cat(sorted_tensors, dim=1)
                    del to_merge_params[target_name]
                else:
                    # Still waiting for the remaining pieces of this fused layer.
                    continue

            # A name collision means two source tensors map to one target.
            if target_name in self.lora_adapters[lora_nickname]:
                raise ValueError(
                    f"Target name {target_name} already exists in lora_adapters[{lora_nickname}]"
                )
            self.lora_adapters[lora_nickname][target_name] = weight.to(
                self.device)
        adapter_updated = True
        self.cur_adapter_path = lora_path
        logger.info("Rank %d: loaded LoRA adapter %s", rank, lora_path)

    # Nothing new was loaded and this adapter is already active: no-op.
    if not adapter_updated and self.cur_adapter_name == lora_nickname:
        return
    self.cur_adapter_name = lora_nickname

    # Merge the new adapter
    adapted_count = 0
    for transformer_name, transformer_lora_layers in self.lora_layers.items(
    ):
        for name, layer in transformer_lora_layers.items():
            lora_A_name = name + ".lora_A"
            lora_B_name = name + ".lora_B"
            lora_alpha_name = name + ".lora_alpha"
            # A layer is adapted only when both of its matrices are present.
            if lora_A_name in self.lora_adapters[lora_nickname]\
                and lora_B_name in self.lora_adapters[lora_nickname]:
                # Get alpha value for this layer (defaults to None if not present)
                lora_A = self.lora_adapters[lora_nickname][lora_A_name]
                lora_B = self.lora_adapters[lora_nickname][lora_B_name]
                # Simple lookup - alpha stored with same naming scheme as lora_A/lora_B
                # The stored alpha is only forwarded when weights were loaded
                # in this call; when switching to a cached adapter None is passed.
                alpha = self.lora_adapters[lora_nickname].get(
                    lora_alpha_name) if adapter_updated else None

                layer.set_lora_weights(
                    lora_A,
                    lora_B,
                    lora_alpha=alpha,
                    training_mode=self.fastvideo_args.training_mode,
                    lora_path=lora_path)
                adapted_count += 1
            else:
                # Warn once (rank 0 only) and switch LoRA off for this layer.
                # NOTE(review): lora_path is None when a cached adapter is
                # re-selected, so this message then prints "None".
                if rank == 0:
                    logger.warning(
                        "LoRA adapter %s does not contain the weights for layer %s. LoRA will not be applied to it.",
                        lora_path, name)
                layer.disable_lora = True
    logger.info("Rank %d: LoRA adapter %s applied to %d layers", rank,
                lora_path, adapted_count)

Functions¶

fastvideo.pipelines.pipeline_batch_info ¶

Data structures for functional pipeline processing.

This module defines the dataclasses used to pass state between pipeline components in a functional manner, reducing the need for explicit parameter passing.

Classes¶

fastvideo.pipelines.pipeline_batch_info.ForwardBatch `dataclass` ¶

ForwardBatch(data_type: str, generator: Generator | list[Generator] | None = None, image_path: str | None = None, image_embeds: list[Tensor] = list(), pil_image: Tensor | Image | None = None, preprocessed_image: Tensor | None = None, prompt: str | list[str] | None = None, negative_prompt: str | list[str] | None = None, prompt_path: str | None = None, output_path: str = 'outputs/', output_video_name: str | None = None, video_path: str | None = None, video_latent: Tensor | None = None, refine_from: str | None = None, t_thresh: float = 0.5, spatial_refine_only: bool = False, num_cond_frames: int = 0, stage1_video: list[Image] | None = None, prompt_embeds: list[Tensor] = list(), negative_prompt_embeds: list[Tensor] | None = None, prompt_attention_mask: list[Tensor] | None = None, negative_attention_mask: list[Tensor] | None = None, clip_embedding_pos: list[Tensor] | None = None, clip_embedding_neg: list[Tensor] | None = None, max_sequence_length: int | None = None, prompt_template: dict[str, Any] | None = None, do_classifier_free_guidance: bool = False, batch_size: int | None = None, num_videos_per_prompt: int = 1, seed: int | None = None, seeds: list[int] | None = None, is_prompt_processed: bool = False, latents: Tensor | None = None, raw_latent_shape: tuple[int, ...] 
| None = None, noise_pred: Tensor | None = None, image_latent: Tensor | None = None, mouse_cond: Tensor | None = None, keyboard_cond: Tensor | None = None, grid_sizes: Tensor | None = None, height_latents: list[int] | int | None = None, width_latents: list[int] | int | None = None, num_frames: list[int] | int = 1, num_frames_round_down: bool = False, height: list[int] | int | None = None, width: list[int] | int | None = None, fps: list[int] | int | None = None, timesteps: Tensor | None = None, timestep: Tensor | float | int | None = None, step_index: int | None = None, boundary_ratio: float | None = None, num_inference_steps: int = 50, guidance_scale: float = 1.0, guidance_scale_2: float | None = None, guidance_rescale: float = 0.0, eta: float = 0.0, sigmas: list[float] | None = None, n_tokens: int | None = None, extra_step_kwargs: dict[str, Any] = dict(), modules: dict[str, Any] = dict(), output: Tensor | None = None, return_trajectory_latents: bool = False, return_trajectory_decoded: bool = False, trajectory_timesteps: list[Tensor] | None = None, trajectory_latents: Tensor | None = None, trajectory_decoded: list[Tensor] | None = None, extra: dict[str, Any] = dict(), save_video: bool = True, return_frames: bool = False, enable_teacache: bool = False, teacache_params: TeaCacheParams | WanTeaCacheParams | None = None, STA_param: list | None = None, is_cfg_negative: bool = False, mask_search_final_result_pos: list[list] | None = None, mask_search_final_result_neg: list[list] | None = None, VSA_sparsity: float = 0.0, logging_info: PipelineLoggingInfo = PipelineLoggingInfo())

Complete state passed through the pipeline execution.

This dataclass contains all information needed during the diffusion pipeline execution, allowing methods to update specific components without needing to manage numerous individual parameters.

Functions¶

fastvideo.pipelines.pipeline_batch_info.ForwardBatch.__post_init__ ¶

__post_init__()

Initialize dependent fields after dataclass initialization.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def __post_init__(self):
    """Fill in the fields whose values depend on other fields."""

    # A guidance scale above 1.0 switches classifier-free guidance on. The
    # flag is never switched off here, so an explicit True is preserved.
    if self.guidance_scale > 1.0:
        self.do_classifier_free_guidance = True

    # The second guidance scale falls back to the primary one.
    if self.guidance_scale_2 is None:
        self.guidance_scale_2 = self.guidance_scale

    # A missing negative-embedding list is normalised to an empty list.
    if self.negative_prompt_embeds is None:
        self.negative_prompt_embeds = []

fastvideo.pipelines.pipeline_batch_info.PipelineLoggingInfo ¶

PipelineLoggingInfo()

Simple approach using OrderedDict to track stage metrics.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def __init__(self):
    """Start with an empty, insertion-ordered registry of per-stage metrics."""
    # Keyed by stage name; each value is that stage's metric dict.
    stage_registry: OrderedDict[str, dict[str, Any]] = OrderedDict()
    self.stages = stage_registry

Functions¶

fastvideo.pipelines.pipeline_batch_info.PipelineLoggingInfo.add_stage_execution_time ¶

add_stage_execution_time(stage_name: str, execution_time: float)

Add execution time for a stage.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def add_stage_execution_time(self, stage_name: str, execution_time: float):
    """Record how long a stage took, stamped with the current wall-clock time."""
    # Create the stage entry on first use, then overwrite both keys.
    stage_metrics = self.stages.setdefault(stage_name, {})
    stage_metrics.update(execution_time=execution_time,
                         timestamp=time.time())

fastvideo.pipelines.pipeline_batch_info.PipelineLoggingInfo.add_stage_metric ¶

add_stage_metric(stage_name: str, metric_name: str, value: Any)

Add any metric for a stage.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def add_stage_metric(self, stage_name: str, metric_name: str, value: Any):
    """Store an arbitrary metric for a stage, creating the stage entry if needed."""
    stage_metrics = self.stages.setdefault(stage_name, {})
    stage_metrics[metric_name] = value

fastvideo.pipelines.pipeline_batch_info.PipelineLoggingInfo.get_execution_order ¶

get_execution_order() -> list[str]

Get stages in execution order.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def get_execution_order(self) -> list[str]:
    """Return the stage names in the order their entries were first created."""
    return [*self.stages]

fastvideo.pipelines.pipeline_batch_info.PipelineLoggingInfo.get_stage_info ¶

get_stage_info(stage_name: str) -> dict[str, Any]

Get all info for a specific stage.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def get_stage_info(self, stage_name: str) -> dict[str, Any]:
    """Return the metric dict of a stage, or a fresh empty dict if the stage is unknown."""
    try:
        return self.stages[stage_name]
    except KeyError:
        # Unknown stages are not registered; the caller just gets an empty dict.
        return {}

fastvideo.pipelines.pipeline_batch_info.PipelineLoggingInfo.get_total_execution_time ¶

get_total_execution_time() -> float

Get total pipeline execution time.

Source code in fastvideo/pipelines/pipeline_batch_info.py

def get_total_execution_time(self) -> float:
    """Add up the recorded ``execution_time`` of every stage.

    Stages that have no ``execution_time`` entry contribute 0.
    """
    total = 0
    for stage_metrics in self.stages.values():
        total += stage_metrics.get('execution_time', 0)
    return total

fastvideo.pipelines.pipeline_registry ¶

Classes¶

fastvideo.pipelines.pipeline_registry.PipelineType ¶

Bases: str, Enum

Enumeration for different pipeline types.

Inherits from str to allow string comparison for backward compatibility.

Functions¶

fastvideo.pipelines.pipeline_registry.PipelineType.choices classmethod ¶

choices() -> list[str]

Get all available choices as strings.

Source code in fastvideo/pipelines/pipeline_registry.py

@classmethod
def choices(cls) -> list[str]:
    """Return the string value of every member of the enumeration."""
    return [member.value for member in cls]

fastvideo.pipelines.pipeline_registry.PipelineType.from_string classmethod ¶

from_string(value: str) -> PipelineType

Convert string to PipelineType enum.

Source code in fastvideo/pipelines/pipeline_registry.py

@classmethod
def from_string(cls, value: str) -> "PipelineType":
    """Look up the enum member whose value equals ``value``, ignoring case.

    Raises:
        ValueError: If the lower-cased ``value`` matches no member; the
            message lists every valid value.
    """
    normalized = value.lower()
    try:
        return cls(normalized)
    except ValueError:
        valid_values = ', '.join(member.value for member in cls)
        # "from None" hides the enum's own lookup error from the traceback.
        raise ValueError(
            f"Invalid pipeline type: {value}. Must be one of: {valid_values}"
        ) from None

Functions¶

fastvideo.pipelines.pipeline_registry.get_pipeline_registry ¶

get_pipeline_registry(pipeline_type: PipelineType | str | None = None) -> _PipelineRegistry

Get a pipeline registry for the specified pipeline type.

Parameters:

Name	Type	Description	Default
`pipeline_type`	`PipelineType \| str \| None`	Pipeline type to load, given as an enum member or its string name. If None, pipeline classes for all pipeline types are loaded.	`None`

Returns:

Type	Description
`_PipelineRegistry`	A pipeline registry instance.

Source code in fastvideo/pipelines/pipeline_registry.py

def get_pipeline_registry(
        pipeline_type: PipelineType | str | None = None) -> _PipelineRegistry:
    """
    Build a registry of the pipeline classes for one pipeline type.

    Args:
        pipeline_type: Pipeline type to load. Any ``str`` is normalised
            through ``PipelineType.from_string`` first; the result (or
            ``None``) is forwarded to ``import_pipeline_classes``.

    Returns:
        A pipeline registry instance wrapping the imported classes.
    """
    resolved_type = (PipelineType.from_string(pipeline_type) if isinstance(
        pipeline_type, str) else pipeline_type)
    return _PipelineRegistry(import_pipeline_classes(resolved_type))

fastvideo.pipelines.pipeline_registry.import_pipeline_classes `cached` ¶

import_pipeline_classes(pipeline_types: list[PipelineType] | PipelineType | None = None) -> dict[str, dict[str, dict[str, type[ComposedPipelineBase] | None]]]

Import pipeline classes for the requested pipeline type(s).

Parameters:

Name	Type	Description	Default
`pipeline_types`	`list[PipelineType] \| PipelineType \| None`	The pipeline types to load (basic, preprocess, training). If None, loads all types.	`None`

Returns:

Type	Description
`dict[str, dict[str, dict[str, type[ComposedPipelineBase] \| None]]]`	A three-level nested dictionary:
`dict[str, dict[str, dict[str, type[ComposedPipelineBase] \| None]]]`	{pipeline_type: {architecture_name: {pipeline_name: pipeline_cls}}}
`dict[str, dict[str, dict[str, type[ComposedPipelineBase] \| None]]]`	e.g., {"basic": {"wan": {"WanPipeline": WanPipeline}}}

Source code in fastvideo/pipelines/pipeline_registry.py

@lru_cache
def _import_pipeline_classes_cached(
    pipeline_types: tuple[PipelineType, ...] | PipelineType | None = None
) -> dict[str, dict[str, dict[str, type[ComposedPipelineBase] | None]]]:
    """Memoized implementation behind ``import_pipeline_classes``.

    Takes only hashable arguments (a tuple instead of a list) so that it can
    be wrapped in ``lru_cache``.
    """
    type_to_arch_to_pipeline_dict: dict[str,
                                        dict[str,
                                             dict[str,
                                                  type[ComposedPipelineBase]
                                                  | None]]] = {}
    package_name: str = "fastvideo.pipelines"

    # Determine which pipeline types to scan
    if isinstance(pipeline_types, tuple):
        pipeline_types_to_scan = [
            pipeline_type.value for pipeline_type in pipeline_types
        ]
    elif isinstance(pipeline_types, PipelineType):
        pipeline_types_to_scan = [pipeline_types.value]
    else:
        pipeline_types_to_scan = [pt.value for pt in PipelineType]

    logger.info("Loading pipelines for types: %s", pipeline_types_to_scan)

    for pipeline_type_str in pipeline_types_to_scan:
        arch_to_pipeline_dict: dict[str, dict[str, type[ComposedPipelineBase]
                                              | None]] = {}

        # Each pipeline type lives in its own sub-package.
        pipeline_type_package_name = f"{package_name}.{pipeline_type_str}"

        try:
            pipeline_type_package = importlib.import_module(
                pipeline_type_package_name)
            logger.debug("Successfully imported %s", pipeline_type_package_name)

            for _, arch, ispkg in pkgutil.iter_modules(
                    pipeline_type_package.__path__):
                # Every architecture gets an entry, even if it yields nothing.
                pipeline_dict: dict[str, type[ComposedPipelineBase] | None] = {}

                arch_package_name = f"{pipeline_type_package_name}.{arch}"
                if ispkg:
                    arch_package = importlib.import_module(arch_package_name)
                    for _, module_name, is_subpackage in pkgutil.walk_packages(
                            arch_package.__path__, arch_package_name + "."):
                        if is_subpackage:
                            continue
                        pipeline_module = importlib.import_module(module_name)
                        if not hasattr(pipeline_module, "EntryClass"):
                            continue
                        # EntryClass may be one class or a list of classes.
                        entry = pipeline_module.EntryClass
                        entry_classes = entry if isinstance(entry,
                                                            list) else [entry]
                        for pipeline_cls in entry_classes:
                            pipeline_name = pipeline_cls.__name__
                            assert (
                                pipeline_name not in pipeline_dict
                            ), f"Duplicated pipeline implementation for {pipeline_name} in {pipeline_type_str}.{arch_package_name}"
                            pipeline_dict[pipeline_name] = pipeline_cls

                arch_to_pipeline_dict[arch] = pipeline_dict

        except ImportError as e:
            # Chain the cause so the traceback of the failing import is kept.
            raise ImportError(
                f"Could not import {pipeline_type_package_name} when importing pipeline classes: {e}"
            ) from e

        type_to_arch_to_pipeline_dict[pipeline_type_str] = arch_to_pipeline_dict

    # Log summary
    total_pipelines = sum(
        len(pipeline_dict)
        for arch_to_pipeline_dict in type_to_arch_to_pipeline_dict.values()
        for pipeline_dict in arch_to_pipeline_dict.values())
    logger.info("Loaded %d pipeline classes across %d types", total_pipelines,
                len(pipeline_types_to_scan))

    return type_to_arch_to_pipeline_dict


def import_pipeline_classes(
    pipeline_types: list[PipelineType] | PipelineType | None = None
) -> dict[str, dict[str, dict[str, type[ComposedPipelineBase] | None]]]:
    """
    Import pipeline classes for the requested pipeline type(s).

    Results are memoized per distinct argument, so repeated calls return the
    same dictionary object; callers should treat it as read-only.

    Args:
        pipeline_types: The pipeline types to load (basic, preprocess, training).
                      If None, loads all types.

    Returns:
        A three-level nested dictionary:
        {pipeline_type: {architecture_name: {pipeline_name: pipeline_cls}}}
        e.g., {"basic": {"wan": {"WanPipeline": WanPipeline}}}

    Raises:
        ImportError: If the package of a requested pipeline type, or any
            module below it, cannot be imported.
    """
    # lru_cache hashes its arguments, so a list (which the signature allows)
    # would raise TypeError. Freeze it into a tuple before hitting the cache.
    cache_key = tuple(pipeline_types) if isinstance(pipeline_types,
                                                    list) else pipeline_types
    return _import_pipeline_classes_cached(cache_key)


# Keep the functools cache-management helpers reachable from the public name.
import_pipeline_classes.cache_clear = _import_pipeline_classes_cached.cache_clear  # type: ignore[attr-defined]
import_pipeline_classes.cache_info = _import_pipeline_classes_cached.cache_info  # type: ignore[attr-defined]

fastvideo.pipelines.preprocess ¶

Modules¶

fastvideo.pipelines.preprocess.preprocess_pipeline_base ¶

Classes¶

fastvideo.pipelines.preprocess.preprocess_pipeline_base.BasePreprocessPipeline ¶

BasePreprocessPipeline(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: ComposedPipelineBase

Base class for preprocessing pipelines that handles common functionality.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Initialize the pipeline. After __init__, the pipeline should be ready to
    use. The pipeline should be stateless and not hold any batch state.

    Args:
        model_path: Stored as ``self.model_path``.
        fastvideo_args: Stored as ``self.fastvideo_args``; its ``tp_size``
            and ``sp_size`` are used for the distributed setup, and it is
            passed on to ``load_modules``.
        required_config_modules: If given, overrides
            ``self._required_config_modules``.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``_required_config_modules`` is still None
            after the optional override.
    """
    self.fastvideo_args = fastvideo_args

    self.model_path: str = model_path
    # Stage bookkeeping starts out empty.
    self._stages: list[PipelineStage] = []
    self._stage_name_mapping: dict[str, PipelineStage] = {}

    # An explicit argument wins over whatever the attribute already holds.
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules

    # Presumably a class-level attribute that subclasses are expected to set.
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    maybe_init_distributed_environment_and_model_parallel(
        fastvideo_args.tp_size, fastvideo_args.sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    trace_dir = envs.FASTVIDEO_TORCH_PROFILER_DIR
    self.profiler_controller = get_or_create_profiler(trace_dir)
    self.profiler = self.profiler_controller.profiler

    # Read after the distributed setup call above.
    self.local_rank = get_world_group().local_rank

    # Load modules directly in initialization
    logger.info("Loading pipeline modules...")
    # Module loading runs inside its own named profiler region.
    with self.profiler_controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_base.BasePreprocessPipeline.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_base.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Register the prompt-encoding stage, built from the text encoder and tokenizer modules."""
    text_encoder = self.get_module("text_encoder")
    tokenizer = self.get_module("tokenizer")
    # The stage takes lists; this pipeline supplies exactly one of each.
    prompt_stage = TextEncodingStage(
        text_encoders=[text_encoder],
        tokenizers=[tokenizer],
    )
    self.add_stage(stage_name="prompt_encoding_stage", stage=prompt_stage)

fastvideo.pipelines.preprocess.preprocess_pipeline_base.BasePreprocessPipeline.create_record ¶

create_record(video_name: str, vae_latent: ndarray, text_embedding: ndarray, valid_data: dict[str, Any], idx: int, extra_features: dict[str, Any] | None = None) -> dict[str, Any]

Create a record for the Parquet dataset.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_base.py

def create_record(
        self,
        video_name: str,
        vae_latent: np.ndarray,
        text_embedding: np.ndarray,
        valid_data: dict[str, Any],
        idx: int,
        extra_features: dict[str, Any] | None = None) -> dict[str, Any]:
    """Create a record for the Parquet dataset."""
    record = {
        "id":
        video_name,
        "vae_latent_bytes":
        vae_latent.tobytes(),
        "vae_latent_shape":
        list(vae_latent.shape),
        "vae_latent_dtype":
        str(vae_latent.dtype),
        "text_embedding_bytes":
        text_embedding.tobytes(),
        "text_embedding_shape":
        list(text_embedding.shape),
        "text_embedding_dtype":
        str(text_embedding.dtype),
        "file_name":
        video_name,
        "caption":
        valid_data["text"][idx] if len(valid_data["text"]) > 0 else "",
        "media_type":
        "video",
        "width":
        valid_data["pixel_values"][idx].shape[-2]
        if len(valid_data["pixel_values"]) > 0 else 0,
        "height":
        valid_data["pixel_values"][idx].shape[-1]
        if len(valid_data["pixel_values"]) > 0 else 0,
        "num_frames":
        vae_latent.shape[1] if len(vae_latent.shape) > 1 else 0,
        "duration_sec":
        float(valid_data["duration"][idx])
        if len(valid_data["duration"]) > 0 else 0.0,
        "fps":
        float(valid_data["fps"][idx])
        if len(valid_data["fps"]) > 0 else 0.0,
    }
    if extra_features:
        record.update(extra_features)
    return record

fastvideo.pipelines.preprocess.preprocess_pipeline_base.BasePreprocessPipeline.create_record_for_schema ¶

create_record_for_schema(preprocess_batch: PreprocessBatch, schema: Schema, strict: bool = False) -> dict[str, Any]

Create a record for the Parquet dataset using a generic schema-based approach.

Parameters:

Name	Type	Description	Default
`preprocess_batch`	`PreprocessBatch`	The batch containing the data to extract	required
`schema`	`Schema`	PyArrow schema defining the expected fields	required
`strict`	`bool`	If True, raises an exception when required fields are missing or unfilled	`False`

Returns:

Type	Description
`dict[str, Any]`	Dictionary record matching the schema

Raises:

Type	Description
`ValueError`	If strict=True and required fields are missing or unfilled

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_base.py

def create_record_for_schema(self,
                             preprocess_batch: PreprocessBatch,
                             schema: pa.Schema,
                             strict: bool = False) -> dict[str, Any]:
    """Create a record for the Parquet dataset using a generic schema-based approach.

    Every name in ``schema.names`` is resolved against an attribute of
    ``preprocess_batch`` according to its form:

    * ``<name>_bytes`` / ``<name>_shape`` / ``<name>_dtype``: read from the
      tensor attribute ``<name>`` (defaults ``b''``, ``[]``, ``'unknown'``).
    * ``width`` / ``height`` / ``num_frames``: converted with ``int``
      (default ``0``).
    * ``duration_sec`` (read from ``duration``) / ``fps``: converted with
      ``float`` (default ``0.0``).
    * ``id``: basename of ``path`` up to its first dot; ``media_type``:
      ``'video'`` if ``path`` ends with ``.mp4``, otherwise ``'image'``.
    * anything else: ``str`` of the attribute with the same name, where
      ``caption`` reads ``text`` and ``file_name`` reads ``path``
      (default ``""``).

    Args:
        preprocess_batch: The batch containing the data to extract
        schema: PyArrow schema defining the expected fields
        strict: If True, raises an exception when required fields are missing or unfilled

    Returns:
        Dictionary record matching the schema

    Raises:
        ValueError: If strict=True and required fields are missing or unfilled
    """
    import os

    record: dict[str, Any] = {}
    unfilled_fields: list[str] = []

    for field in schema.names:
        field_filled = False

        if field.endswith('_bytes'):
            # Handle binary tensor data - convert numpy array or tensor to bytes.
            # Only the trailing suffix is stripped, so a tensor whose own name
            # contains "_bytes" is still found.
            tensor_name = field[:-len('_bytes')]
            tensor_data = getattr(preprocess_batch, tensor_name, None)
            if tensor_data is not None:
                try:
                    if hasattr(tensor_data, 'numpy'):  # torch tensor
                        record[field] = tensor_data.cpu().numpy().tobytes()
                        field_filled = True
                    elif hasattr(tensor_data, 'tobytes'):  # numpy array
                        record[field] = tensor_data.tobytes()
                        field_filled = True
                    else:
                        raise ValueError(
                            f"Unsupported tensor type for field {field}: {type(tensor_data)}"
                        )
                except Exception as e:
                    # Best effort unless strict: fall back to empty bytes.
                    if strict:
                        raise ValueError(
                            f"Failed to convert tensor {tensor_name} to bytes: {e}"
                        ) from e
                    record[field] = b''  # Empty bytes for missing data
            else:
                record[field] = b''  # Empty bytes for missing data

        elif field.endswith('_shape'):
            # Handle tensor shape info
            tensor_name = field[:-len('_shape')]
            tensor_data = getattr(preprocess_batch, tensor_name, None)
            if tensor_data is not None and hasattr(tensor_data, 'shape'):
                record[field] = list(tensor_data.shape)
                field_filled = True
            else:
                record[field] = []

        elif field.endswith('_dtype'):
            # Handle tensor dtype info
            tensor_name = field[:-len('_dtype')]
            tensor_data = getattr(preprocess_batch, tensor_name, None)
            if tensor_data is not None and hasattr(tensor_data, 'dtype'):
                record[field] = str(tensor_data.dtype)
                field_filled = True
            else:
                record[field] = 'unknown'

        elif field in ['width', 'height', 'num_frames']:
            # Handle integer metadata fields
            value = getattr(preprocess_batch, field, None)
            if value is not None:
                try:
                    record[field] = int(value)
                    field_filled = True
                except (ValueError, TypeError) as e:
                    if strict:
                        raise ValueError(
                            f"Failed to convert field {field} to int: {e}"
                        ) from e
                    record[field] = 0
            else:
                record[field] = 0

        elif field in ['duration_sec', 'fps']:
            # Handle float metadata fields
            # Map schema field names to batch attribute names
            attr_name = 'duration' if field == 'duration_sec' else field
            value = getattr(preprocess_batch, attr_name, None)
            if value is not None:
                try:
                    record[field] = float(value)
                    field_filled = True
                except (ValueError, TypeError) as e:
                    if strict:
                        raise ValueError(
                            f"Failed to convert field {field} to float: {e}"
                        ) from e
                    record[field] = 0.0
            else:
                record[field] = 0.0

        elif field == 'id':
            # Generate ID from path if available. This is its own branch with
            # no `continue`, so a missing path reaches the unfilled-field
            # tracking below like every other field.
            path_value = getattr(preprocess_batch, 'path', None)
            if path_value:
                record[field] = os.path.basename(path_value).split('.')[0]
                field_filled = True
            else:
                record[field] = ""

        elif field == 'media_type':
            # Determine media type from path
            path_value = getattr(preprocess_batch, 'path', None)
            if path_value:
                record[field] = 'video' if path_value.endswith(
                    '.mp4') else 'image'
                field_filled = True
            else:
                record[field] = ""

        else:
            # Handle string fields (file_name, caption, etc.)
            # Map common schema field names to batch attribute names
            if field == 'caption':
                attr_name = 'text'
            elif field == 'file_name':
                attr_name = 'path'
            else:
                attr_name = field

            value = getattr(preprocess_batch, attr_name, None)
            if value is not None:
                record[field] = str(value)
                field_filled = True
            else:
                record[field] = ""

        # Track unfilled fields
        if not field_filled:
            unfilled_fields.append(field)

    # Handle strict mode
    if strict and unfilled_fields:
        raise ValueError(
            f"Required fields were not filled: {unfilled_fields}")

    # Log unfilled fields as warning if not in strict mode
    if unfilled_fields:
        logger.warning(
            "Some fields were not filled and got default values: %s",
            unfilled_fields)

    return record

fastvideo.pipelines.preprocess.preprocess_pipeline_base.BasePreprocessPipeline.get_extra_features ¶

get_extra_features(valid_data: dict[str, Any], fastvideo_args: FastVideoArgs) -> dict[str, Any]

Get additional features specific to the pipeline type. Override in subclasses.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_base.py

def get_extra_features(self, valid_data: dict[str, Any],
                       fastvideo_args: FastVideoArgs) -> dict[str, Any]:
    """Get additional features specific to the pipeline type. Override in subclasses.

    This base implementation contributes no extra features: it ignores both
    arguments and returns a new empty dict.

    Args:
        valid_data: Unused by this base implementation.
        fastvideo_args: Unused by this base implementation.

    Returns:
        An empty dict.
    """
    return {}

fastvideo.pipelines.preprocess.preprocess_pipeline_base.BasePreprocessPipeline.get_pyarrow_schema ¶

get_pyarrow_schema() -> Schema

Return the PyArrow schema for this pipeline. Must be overridden.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_base.py

def get_pyarrow_schema(self) -> pa.Schema:
    """Return the PyArrow schema for this pipeline. Must be overridden.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.pipelines.preprocess.preprocess_pipeline_base.BasePreprocessPipeline.get_schema_fields ¶

get_schema_fields() -> list[str]

Get the schema fields for the pipeline type.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_base.py

def get_schema_fields(self) -> list[str]:
    """List the field names of this pipeline's PyArrow schema.

    Returns:
        The ``name`` attribute of every item produced by iterating over
        ``self.get_pyarrow_schema()``, in iteration order.
    """
    field_names: list[str] = []
    for schema_field in self.get_pyarrow_schema():
        field_names.append(schema_field.name)
    return field_names

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_i2v ¶

I2V Data Preprocessing pipeline implementation.

This module contains an implementation of the I2V Data Preprocessing pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.preprocess.preprocess_pipeline_i2v.PreprocessPipeline_I2V ¶

PreprocessPipeline_I2V(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: BasePreprocessPipeline

I2V preprocessing pipeline implementation.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Build a ready-to-use pipeline. The pipeline is meant to be stateless:
    it must not hold any per-batch state after construction.

    Args:
        model_path: Stored on the instance as ``model_path``.
        fastvideo_args: Run configuration; its ``tp_size`` and ``sp_size``
            are used to set up the distributed environment, and it is
            forwarded to ``load_modules``.
        required_config_modules: When given, replaces
            ``self._required_config_modules``.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``self._required_config_modules`` is still
            ``None`` after applying ``required_config_modules``.
    """
    self.model_path: str = model_path
    self.fastvideo_args = fastvideo_args

    # Stage registry starts out empty
    self._stage_name_mapping: dict[str, PipelineStage] = {}
    self._stages: list[PipelineStage] = []

    # An explicit argument wins over whatever value is already present
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    tp_size, sp_size = fastvideo_args.tp_size, fastvideo_args.sp_size
    maybe_init_distributed_environment_and_model_parallel(tp_size, sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    controller = get_or_create_profiler(envs.FASTVIDEO_TORCH_PROFILER_DIR)
    self.profiler_controller = controller
    self.profiler = controller.profiler

    self.local_rank = get_world_group().local_rank

    # Modules are loaded eagerly, inside a dedicated profiler region
    logger.info("Loading pipeline modules...")
    with controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_i2v.PreprocessPipeline_I2V.create_record ¶

create_record(video_name: str, vae_latent: ndarray, text_embedding: ndarray, valid_data: dict[str, Any], idx: int, extra_features: dict[str, Any] | None = None) -> dict[str, Any]

Create a record for the Parquet dataset with CLIP features, the first-frame latent, and the PIL image.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_i2v.py

def create_record(
        self,
        video_name: str,
        vae_latent: np.ndarray,
        text_embedding: np.ndarray,
        valid_data: dict[str, Any],
        idx: int,
        extra_features: dict[str, Any] | None = None) -> dict[str, Any]:
    """Create a record for the Parquet dataset with the I2V extra features.

    Starts from the record built by the parent class and adds three columns
    for each of ``clip_feature``, ``first_frame_latent`` and ``pil_image``:
    ``<name>_bytes``, ``<name>_shape`` and ``<name>_dtype``.

    Args:
        video_name: Forwarded to the parent ``create_record``.
        vae_latent: Forwarded to the parent ``create_record``.
        text_embedding: Forwarded to the parent ``create_record``.
        valid_data: Forwarded to the parent ``create_record``.
        idx: Forwarded to the parent ``create_record``.
        extra_features: Optional mapping that may contain any of the three
            feature names above. Each present value must expose
            ``tobytes()``, ``shape`` and ``dtype`` (presumably a NumPy
            array -- confirm against the caller).

    Returns:
        The parent record extended with the nine feature columns. A feature
        that is absent (or an empty/``None`` ``extra_features``) is stored
        as ``b""``, ``[]`` and ``""``.
    """
    record = super().create_record(video_name=video_name,
                                   vae_latent=vae_latent,
                                   text_embedding=text_embedding,
                                   valid_data=valid_data,
                                   idx=idx,
                                   extra_features=extra_features)

    # All optional features share one serialization layout, so they are
    # handled by a single loop; the tuple order fixes the column order.
    for name in ("clip_feature", "first_frame_latent", "pil_image"):
        if extra_features and name in extra_features:
            feature = extra_features[name]
            record.update({
                f"{name}_bytes": feature.tobytes(),
                f"{name}_shape": list(feature.shape),
                f"{name}_dtype": str(feature.dtype),
            })
        else:
            # Placeholders keep every record on the same set of columns
            record.update({
                f"{name}_bytes": b"",
                f"{name}_shape": [],
                f"{name}_dtype": "",
            })

    return record

fastvideo.pipelines.preprocess.preprocess_pipeline_i2v.PreprocessPipeline_I2V.get_pyarrow_schema ¶

get_pyarrow_schema()

Return the PyArrow schema for I2V pipeline.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_i2v.py

def get_pyarrow_schema(self):
    """Return the PyArrow schema for I2V pipeline.

    Returns:
        The ``pyarrow_schema_i2v`` object itself (no copy is made).
    """
    return pyarrow_schema_i2v

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_ode_trajectory ¶

ODE Trajectory Data Preprocessing pipeline implementation.

This module contains an implementation of the ODE Trajectory Data Preprocessing pipeline using the modular pipeline architecture.

See Sec. 4.3 of the CausVid paper: https://arxiv.org/pdf/2412.07772

Classes¶

fastvideo.pipelines.preprocess.preprocess_pipeline_ode_trajectory.PreprocessPipeline_ODE_Trajectory ¶

PreprocessPipeline_ODE_Trajectory(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: BasePreprocessPipeline

ODE Trajectory preprocessing pipeline implementation.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Build a ready-to-use pipeline. The pipeline is meant to be stateless:
    it must not hold any per-batch state after construction.

    Args:
        model_path: Stored on the instance as ``model_path``.
        fastvideo_args: Run configuration; its ``tp_size`` and ``sp_size``
            are used to set up the distributed environment, and it is
            forwarded to ``load_modules``.
        required_config_modules: When given, replaces
            ``self._required_config_modules``.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``self._required_config_modules`` is still
            ``None`` after applying ``required_config_modules``.
    """
    self.model_path: str = model_path
    self.fastvideo_args = fastvideo_args

    # Stage registry starts out empty
    self._stage_name_mapping: dict[str, PipelineStage] = {}
    self._stages: list[PipelineStage] = []

    # An explicit argument wins over whatever value is already present
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    tp_size, sp_size = fastvideo_args.tp_size, fastvideo_args.sp_size
    maybe_init_distributed_environment_and_model_parallel(tp_size, sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    controller = get_or_create_profiler(envs.FASTVIDEO_TORCH_PROFILER_DIR)
    self.profiler_controller = controller
    self.profiler = controller.profiler

    self.local_rank = get_world_group().local_rank

    # Modules are loaded eagerly, inside a dedicated profiler region
    logger.info("Loading pipeline modules...")
    with controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_ode_trajectory.PreprocessPipeline_ODE_Trajectory.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_ode_trajectory.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Set up pipeline stages with proper dependency injection.

    Replaces the ``"scheduler"`` entry of ``self.modules`` with a
    ``SelfForcingFlowMatchScheduler`` and then registers, in execution
    order, the input-validation, prompt-encoding, timestep-preparation,
    latent-preparation, denoising and decoding stages.

    Args:
        fastvideo_args: Run configuration; only
            ``pipeline_config.flow_shift`` is read here.

    Raises:
        AssertionError: If ``pipeline_config.flow_shift`` is not 5 (the
            check disappears when Python runs with ``-O``).
    """
    # Only flow_shift == 5 is accepted by this pipeline
    assert fastvideo_args.pipeline_config.flow_shift == 5
    # Overwrite the scheduler module before any stage captures a reference to it
    self.modules["scheduler"] = SelfForcingFlowMatchScheduler(
        shift=fastvideo_args.pipeline_config.flow_shift,
        sigma_min=0.0,
        extra_one_step=True)
    # 48 steps: the same value preprocess_text_and_trajectory sets on each batch
    self.modules["scheduler"].set_timesteps(num_inference_steps=48,
                                            denoising_strength=1.0)

    self.add_stage(stage_name="input_validation_stage",
                   stage=InputValidationStage())
    self.add_stage(stage_name="prompt_encoding_stage",
                   stage=TextEncodingStage(
                       text_encoders=[self.get_module("text_encoder")],
                       tokenizers=[self.get_module("tokenizer")],
                   ))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=TimestepPreparationStage(
                       scheduler=self.get_module("scheduler")))
    # NOTE(review): the second argument to get_module looks like a default
    # for a missing module, making the transformer optional here -- confirm.
    self.add_stage(stage_name="latent_preparation_stage",
                   stage=LatentPreparationStage(
                       scheduler=self.get_module("scheduler"),
                       transformer=self.get_module("transformer", None)))
    # The denoising stage also receives the pipeline object itself
    self.add_stage(stage_name="denoising_stage",
                   stage=DenoisingStage(
                       transformer=self.get_module("transformer"),
                       scheduler=self.get_module("scheduler"),
                       pipeline=self,
                   ))
    self.add_stage(stage_name="decoding_stage",
                   stage=DecodingStage(vae=self.get_module("vae")))

fastvideo.pipelines.preprocess.preprocess_pipeline_ode_trajectory.PreprocessPipeline_ODE_Trajectory.get_pyarrow_schema ¶

get_pyarrow_schema() -> Schema

Return the PyArrow schema for ODE Trajectory pipeline.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_ode_trajectory.py

def get_pyarrow_schema(self) -> pa.Schema:
    """Return the PyArrow schema for ODE Trajectory pipeline.

    Returns:
        The ``pyarrow_schema_ode_trajectory_text_only`` object itself (no
        copy is made).
    """
    return pyarrow_schema_ode_trajectory_text_only

fastvideo.pipelines.preprocess.preprocess_pipeline_ode_trajectory.PreprocessPipeline_ODE_Trajectory.preprocess_text_and_trajectory ¶

preprocess_text_and_trajectory(fastvideo_args: FastVideoArgs, args)

Preprocess text-only data and generate trajectory information.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_ode_trajectory.py

def preprocess_text_and_trajectory(self, fastvideo_args: FastVideoArgs,
                                   args):
    """Preprocess text-only data and generate trajectory information.

    For every batch yielded by ``self.pbar``:

    1. keep only the samples whose text is non-blank;
    2. encode the captions with ``self.prompt_encoding_stage``;
    3. run each prompt on its own through the input-validation,
       timestep-preparation, latent-preparation, denoising and decoding
       stages, collecting the trajectory latents and timesteps;
    4. turn every sample into a record, convert the records to a table and
       append it to ``self.dataset_writer`` (created on first use).

    The writer is flushed whenever ``self.num_processed_samples`` reaches
    ``args.flush_frequency`` and once more, with ``write_remainder=True``,
    after the last batch.

    Args:
        fastvideo_args: Run configuration forwarded to every stage.
        args: Object providing ``model_path``, ``max_height``,
            ``max_width``, ``train_fps``, ``samples_per_file`` and
            ``flush_frequency``.
    """
    # Set to True to also collect the decoded trajectory of every prompt and
    # save it as videos under ``decoded_videos/``. Used for debugging.
    save_decoded_videos = False

    for data in self.pbar:
        if data is None:
            continue

        with torch.inference_mode():
            # Text-only processing: drop samples whose text is empty/blank
            valid_indices = [
                i for i, text in enumerate(data["text"])
                if text and text.strip()
            ]
            self.num_processed_samples += len(valid_indices)

            if not valid_indices:
                continue

            # Create new batch with only valid samples (text-only)
            valid_data = {
                "text": [data["text"][i] for i in valid_indices],
                "path": [data["path"][i] for i in valid_indices],
            }

            # Add fps and duration if available in data
            if "fps" in data:
                valid_data["fps"] = [data["fps"][i] for i in valid_indices]
            if "duration" in data:
                valid_data["duration"] = [
                    data["duration"][i] for i in valid_indices
                ]

            # Encode text using the standalone TextEncodingStage API
            prompt_embeds_list, prompt_masks_list = self.prompt_encoding_stage.encode_text(
                valid_data["text"],
                fastvideo_args,
                encoder_index=[0],
                return_attention_mask=True,
            )
            prompt_embeds = prompt_embeds_list[0]
            prompt_attention_masks = prompt_masks_list[0]
            assert prompt_embeds.shape[0] == prompt_attention_masks.shape[0]

            sampling_params = SamplingParam.from_pretrained(args.model_path)

            # Encode negative prompt for trajectory collection
            if sampling_params.guidance_scale > 1 and sampling_params.negative_prompt is not None:
                negative_prompt_embeds_list, negative_prompt_masks_list = self.prompt_encoding_stage.encode_text(
                    sampling_params.negative_prompt,
                    fastvideo_args,
                    encoder_index=[0],
                    return_attention_mask=True,
                )
                negative_prompt_embed = negative_prompt_embeds_list[0][0]
                negative_prompt_attention_mask = negative_prompt_masks_list[
                    0][0]
            else:
                negative_prompt_embed = None
                negative_prompt_attention_mask = None

            trajectory_latents = []
            trajectory_timesteps = []
            trajectory_decoded = []

            # Prompts are denoised one at a time (batch dimension of 1)
            for prompt_embed, prompt_attention_mask in zip(
                    prompt_embeds, prompt_attention_masks, strict=False):
                prompt_embed = prompt_embed.unsqueeze(0)
                prompt_attention_mask = prompt_attention_mask.unsqueeze(0)

                # Collect the trajectory data (text-to-video generation)
                batch = ForwardBatch(**shallow_asdict(sampling_params), )
                batch.prompt_embeds = [prompt_embed]
                batch.prompt_attention_mask = [prompt_attention_mask]
                batch.negative_prompt_embeds = [negative_prompt_embed]
                batch.negative_attention_mask = [
                    negative_prompt_attention_mask
                ]
                batch.num_inference_steps = 48
                batch.return_trajectory_latents = True
                batch.return_trajectory_decoded = save_decoded_videos
                batch.height = args.max_height
                batch.width = args.max_width
                batch.fps = args.train_fps
                # Guidance settings are fixed here regardless of the values
                # loaded into sampling_params
                batch.guidance_scale = 6.0
                batch.do_classifier_free_guidance = True

                # Each stage consumes the previous stage's output
                result_batch = self.input_validation_stage(
                    batch, fastvideo_args)
                result_batch = self.timestep_preparation_stage(
                    result_batch, fastvideo_args)
                result_batch = self.latent_preparation_stage(
                    result_batch, fastvideo_args)
                result_batch = self.denoising_stage(result_batch,
                                                    fastvideo_args)
                result_batch = self.decoding_stage(result_batch,
                                                   fastvideo_args)

                trajectory_latents.append(
                    result_batch.trajectory_latents.cpu())
                trajectory_timesteps.append(
                    result_batch.trajectory_timesteps.cpu())
                trajectory_decoded.append(result_batch.trajectory_decoded)

            if save_decoded_videos:
                for i, decoded_frames in enumerate(trajectory_decoded):
                    for j, decoded_frame in enumerate(decoded_frames):
                        save_decoded_latents_as_video(
                            decoded_frame,
                            f"decoded_videos/trajectory_decoded_{i}_{j}.mp4",
                            args.train_fps)

            # Prepare batch data for Parquet dataset
            batch_data: list[dict[str, Any]] = []

            # Add progress bar for saving outputs
            save_pbar = tqdm(enumerate(valid_data["path"]),
                             desc="Saving outputs",
                             unit="item",
                             leave=False)

            for idx, video_path in save_pbar:
                video_name = os.path.basename(video_path).split(".")[0]

                # Create record for Parquet dataset (text-only ODE schema).
                # The trajectory tensors are already on the CPU; they are
                # cast to float before the NumPy conversion.
                record: dict[str, Any] = ode_text_only_record_creator(
                    video_name=video_name,
                    text_embedding=prompt_embeds[idx].cpu().numpy(),
                    caption=valid_data["text"][idx],
                    trajectory_latents=trajectory_latents[idx].float().numpy(),
                    trajectory_timesteps=trajectory_timesteps[idx].float(
                    ).numpy(),
                )
                batch_data.append(record)

            if batch_data:
                write_pbar = tqdm(total=1,
                                  desc="Writing to Parquet dataset",
                                  unit="batch")
                table = records_to_table(batch_data,
                                         self.get_pyarrow_schema())
                write_pbar.update(1)
                write_pbar.close()

                # The writer is created lazily, on the first non-empty batch
                if not hasattr(self, 'dataset_writer'):
                    self.dataset_writer = ParquetDatasetWriter(
                        out_dir=self.combined_parquet_dir,
                        samples_per_file=args.samples_per_file,
                    )
                self.dataset_writer.append_table(table)

                logger.info("Collected batch with %s samples", len(table))

            if self.num_processed_samples >= args.flush_frequency:
                written = self.dataset_writer.flush()
                logger.info("Flushed %s samples to parquet", written)
                self.num_processed_samples = 0

    # Final flush for any remaining samples
    if hasattr(self, 'dataset_writer'):
        written = self.dataset_writer.flush(write_remainder=True)
        if written:
            logger.info("Final flush wrote %s samples", written)

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_t2v ¶

T2V Data Preprocessing pipeline implementation.

This module contains an implementation of the T2V Data Preprocessing pipeline using the modular pipeline architecture.

Classes¶

fastvideo.pipelines.preprocess.preprocess_pipeline_t2v.PreprocessPipeline_T2V ¶

PreprocessPipeline_T2V(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: BasePreprocessPipeline

T2V preprocessing pipeline implementation.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Build a ready-to-use pipeline. The pipeline is meant to be stateless:
    it must not hold any per-batch state after construction.

    Args:
        model_path: Stored on the instance as ``model_path``.
        fastvideo_args: Run configuration; its ``tp_size`` and ``sp_size``
            are used to set up the distributed environment, and it is
            forwarded to ``load_modules``.
        required_config_modules: When given, replaces
            ``self._required_config_modules``.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``self._required_config_modules`` is still
            ``None`` after applying ``required_config_modules``.
    """
    self.model_path: str = model_path
    self.fastvideo_args = fastvideo_args

    # Stage registry starts out empty
    self._stage_name_mapping: dict[str, PipelineStage] = {}
    self._stages: list[PipelineStage] = []

    # An explicit argument wins over whatever value is already present
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    tp_size, sp_size = fastvideo_args.tp_size, fastvideo_args.sp_size
    maybe_init_distributed_environment_and_model_parallel(tp_size, sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    controller = get_or_create_profiler(envs.FASTVIDEO_TORCH_PROFILER_DIR)
    self.profiler_controller = controller
    self.profiler = controller.profiler

    self.local_rank = get_world_group().local_rank

    # Modules are loaded eagerly, inside a dedicated profiler region
    logger.info("Loading pipeline modules...")
    with controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_t2v.PreprocessPipeline_T2V.get_pyarrow_schema ¶

get_pyarrow_schema()

Return the PyArrow schema for T2V pipeline.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_t2v.py

def get_pyarrow_schema(self):
    """Return the PyArrow schema for T2V pipeline.

    Returns:
        The ``pyarrow_schema_t2v`` object itself (no copy is made).
    """
    return pyarrow_schema_t2v

fastvideo.pipelines.preprocess.preprocess_pipeline_text ¶

Text-only Data Preprocessing pipeline implementation.

This module contains an implementation of the Text-only Data Preprocessing pipeline using the modular pipeline architecture, based on the ODE Trajectory preprocessing.

Classes¶

fastvideo.pipelines.preprocess.preprocess_pipeline_text.PreprocessPipeline_Text ¶

PreprocessPipeline_Text(model_path: str, fastvideo_args: FastVideoArgs | TrainingArgs, required_config_modules: list[str] | None = None, loaded_modules: dict[str, Module] | None = None)

Bases: BasePreprocessPipeline

Text-only preprocessing pipeline implementation.

Source code in fastvideo/pipelines/composed_pipeline_base.py

def __init__(self,
             model_path: str,
             fastvideo_args: FastVideoArgs | TrainingArgs,
             required_config_modules: list[str] | None = None,
             loaded_modules: dict[str, torch.nn.Module] | None = None):
    """
    Build a ready-to-use pipeline. The pipeline is meant to be stateless:
    it must not hold any per-batch state after construction.

    Args:
        model_path: Stored on the instance as ``model_path``.
        fastvideo_args: Run configuration; its ``tp_size`` and ``sp_size``
            are used to set up the distributed environment, and it is
            forwarded to ``load_modules``.
        required_config_modules: When given, replaces
            ``self._required_config_modules``.
        loaded_modules: Forwarded unchanged to ``load_modules``.

    Raises:
        NotImplementedError: If ``self._required_config_modules`` is still
            ``None`` after applying ``required_config_modules``.
    """
    self.model_path: str = model_path
    self.fastvideo_args = fastvideo_args

    # Stage registry starts out empty
    self._stage_name_mapping: dict[str, PipelineStage] = {}
    self._stages: list[PipelineStage] = []

    # An explicit argument wins over whatever value is already present
    if required_config_modules is not None:
        self._required_config_modules = required_config_modules
    if self._required_config_modules is None:
        raise NotImplementedError(
            "Subclass must set _required_config_modules")

    tp_size, sp_size = fastvideo_args.tp_size, fastvideo_args.sp_size
    maybe_init_distributed_environment_and_model_parallel(tp_size, sp_size)

    # Torch profiler. Enabled and configured through env vars:
    # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
    controller = get_or_create_profiler(envs.FASTVIDEO_TORCH_PROFILER_DIR)
    self.profiler_controller = controller
    self.profiler = controller.profiler

    self.local_rank = get_world_group().local_rank

    # Modules are loaded eagerly, inside a dedicated profiler region
    logger.info("Loading pipeline modules...")
    with controller.region("profiler_region_model_loading"):
        self.modules = self.load_modules(fastvideo_args, loaded_modules)

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_text.PreprocessPipeline_Text.create_pipeline_stages ¶

create_pipeline_stages(fastvideo_args: FastVideoArgs)

Set up pipeline stages with proper dependency injection.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_text.py

def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Register the single stage this pipeline uses: prompt encoding.

    The stage is a ``TextEncodingStage`` built from the pipeline's
    ``"text_encoder"`` and ``"tokenizer"`` modules and is registered under
    the name ``"prompt_encoding_stage"``.

    Args:
        fastvideo_args: Not used by this method.
    """
    text_encoder = self.get_module("text_encoder")
    tokenizer = self.get_module("tokenizer")
    encoding_stage = TextEncodingStage(
        text_encoders=[text_encoder],
        tokenizers=[tokenizer],
    )
    self.add_stage(stage_name="prompt_encoding_stage", stage=encoding_stage)

fastvideo.pipelines.preprocess.preprocess_pipeline_text.PreprocessPipeline_Text.get_pyarrow_schema ¶

get_pyarrow_schema()

Return the PyArrow schema for text-only pipeline.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_text.py

def get_pyarrow_schema(self):
    """Return the PyArrow schema for text-only pipeline.

    Returns:
        The ``pyarrow_schema_text_only`` object itself (no copy is made).
    """
    return pyarrow_schema_text_only

fastvideo.pipelines.preprocess.preprocess_pipeline_text.PreprocessPipeline_Text.preprocess_text_only ¶

preprocess_text_only(fastvideo_args: FastVideoArgs, args)

Preprocess text-only data.

Source code in fastvideo/pipelines/preprocess/preprocess_pipeline_text.py

def preprocess_text_only(self, fastvideo_args: FastVideoArgs, args):
    """Preprocess text-only data.

    For every batch yielded by ``self.pbar``: keep the samples whose text
    is non-blank, encode their captions with ``self.prompt_encoding_stage``,
    build one record per sample, convert the records to a table using this
    pipeline's schema and append it to ``self.dataset_writer`` (created on
    first use).

    The writer is flushed whenever ``self.num_processed_samples`` reaches
    ``args.flush_frequency`` and once more, with ``write_remainder=True``,
    after the last batch.

    Args:
        fastvideo_args: Run configuration forwarded to the text encoder.
        args: Object providing ``samples_per_file`` and
            ``flush_frequency``.
    """
    for data in self.pbar:
        if data is None:
            continue

        with torch.inference_mode():
            # Text-only processing: drop samples whose text is empty/blank
            valid_indices = [
                i for i, text in enumerate(data["text"])
                if text and text.strip()
            ]
            self.num_processed_samples += len(valid_indices)

            if not valid_indices:
                continue

            # Create new batch with only valid samples (text-only)
            valid_data = {
                "text": [data["text"][i] for i in valid_indices],
                "path": [data["path"][i] for i in valid_indices],
            }

            # Encode text using the standalone TextEncodingStage API
            prompt_embeds_list, prompt_masks_list = self.prompt_encoding_stage.encode_text(
                valid_data["text"],
                fastvideo_args,
                encoder_index=[0],
                return_attention_mask=True,
            )
            prompt_embeds = prompt_embeds_list[0]
            prompt_attention_masks = prompt_masks_list[0]
            assert prompt_embeds.shape[0] == prompt_attention_masks.shape[0]

            logger.info("===== prompt_embeds: %s", prompt_embeds.shape)
            logger.info("===== prompt_attention_masks: %s",
                        prompt_attention_masks.shape)

            # Prepare batch data for Parquet dataset
            batch_data = []

            # Add progress bar for saving outputs
            save_pbar = tqdm(enumerate(valid_data["path"]),
                             desc="Saving outputs",
                             unit="item",
                             leave=False)

            for idx, text_path in save_pbar:
                text_name = os.path.basename(text_path).split(".")[0]

                # Create record for Parquet dataset (text-only schema)
                record = text_only_record_creator(
                    text_name=text_name,
                    text_embedding=prompt_embeds[idx].cpu().numpy(),
                    caption=valid_data["text"][idx],
                )
                batch_data.append(record)

            if batch_data:
                write_pbar = tqdm(total=1,
                                  desc="Writing to Parquet dataset",
                                  unit="batch")
                # Ask the pipeline for its schema instead of hard-coding
                # it, so a subclass overriding get_pyarrow_schema is honored
                table = records_to_table(batch_data,
                                         self.get_pyarrow_schema())
                write_pbar.update(1)
                write_pbar.close()

                # The writer is created lazily, on the first non-empty batch
                if not hasattr(self, 'dataset_writer'):
                    self.dataset_writer = ParquetDatasetWriter(
                        out_dir=self.combined_parquet_dir,
                        samples_per_file=args.samples_per_file,
                    )
                self.dataset_writer.append_table(table)

                logger.info("Collected batch with %s samples", len(table))

            if self.num_processed_samples >= args.flush_frequency:
                written = self.dataset_writer.flush()
                logger.info("Flushed %s samples to parquet", written)
                self.num_processed_samples = 0

    # Final flush for any remaining samples
    if hasattr(self, 'dataset_writer'):
        written = self.dataset_writer.flush(write_remainder=True)
        if written:
            logger.info("Final flush wrote %s samples", written)

Functions¶

fastvideo.pipelines.preprocess.preprocess_stages ¶

Classes¶

fastvideo.pipelines.preprocess.preprocess_stages.TextTransformStage ¶

TextTransformStage(cfg_uncondition_drop_rate: float, seed: int)

Bases: PipelineStage

Process text data according to the classifier-free guidance (CFG) unconditional drop rate.

Source code in fastvideo/pipelines/preprocess/preprocess_stages.py

def __init__(self, cfg_uncondition_drop_rate: float, seed: int) -> None:
    """Store the drop rate and create a seeded random number generator.

    Args:
        cfg_uncondition_drop_rate: Stored as ``self.cfg_rate``.
        seed: Seed for the stage's own ``random.Random`` instance.
    """
    self.cfg_rate = cfg_uncondition_drop_rate
    # Dedicated generator, so draws do not depend on the global `random` state
    self.rng = random.Random(seed)

fastvideo.pipelines.preprocess.preprocess_stages.VideoTransformStage ¶

VideoTransformStage(train_fps: int, num_frames: int, max_height: int, max_width: int, do_temporal_sample: bool)

Bases: PipelineStage

Crop a video in the temporal dimension.

Source code in fastvideo/pipelines/preprocess/preprocess_stages.py

def __init__(self, train_fps: int, num_frames: int, max_height: int,
             max_width: int, do_temporal_sample: bool) -> None:
    """Configure the temporal sampler and the spatial video transform.

    Args:
        train_fps: Stored as ``self.train_fps``.
        num_frames: Stored as ``self.num_frames`` and passed to
            ``TemporalRandomCrop`` when temporal sampling is enabled.
        max_height: First element of the size given to
            ``CenterCropResizeVideo``.
        max_width: Second element of the size given to
            ``CenterCropResizeVideo``.
        do_temporal_sample: When true, ``self.temporal_sample_fn`` is a
            ``TemporalRandomCrop``; otherwise it is ``None``.
    """
    self.num_frames = num_frames
    self.train_fps = train_fps

    # None marks temporal sampling as disabled
    self.temporal_sample_fn: Callable | None = (
        TemporalRandomCrop(num_frames) if do_temporal_sample else None)

    target_size = (max_height, max_width)
    self.video_transform = transforms.Compose(
        [CenterCropResizeVideo(target_size)])

fastvideo.pipelines.stages ¶

Pipeline stages for diffusion models.

This package contains the various stages that can be composed to create complete diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.CausalDMDDenosingStage ¶

CausalDMDDenosingStage(transformer, scheduler, transformer_2=None, vae=None)

Bases: DenoisingStage

Denoising stage for causal diffusion.

Source code in fastvideo/pipelines/stages/causal_denoising.py

def __init__(self,
             transformer,
             scheduler,
             transformer_2=None,
             vae=None) -> None:
    """Set up the causal denoising stage.

    Args:
        transformer: Primary transformer. ``blocks`` and
            ``config.arch_config`` are read from it here.
        scheduler: Scheduler forwarded to ``DenoisingStage``.
        transformer_2: Optional second transformer.
        vae: Optional VAE.
    """
    # DenoisingStage's third positional parameter is ``pipeline``, so the
    # optional modules must be passed by keyword. Passing ``transformer_2``
    # positionally would register it as the owning pipeline instead.
    super().__init__(transformer,
                     scheduler,
                     transformer_2=transformer_2,
                     vae=vae)
    self.transformer = transformer
    self.transformer_2 = transformer_2
    self.vae = vae
    # Model-dependent constants (aligned with causal_inference.py assumptions)
    self.num_transformer_blocks = len(self.transformer.blocks)
    self.num_frames_per_block = self.transformer.config.arch_config.num_frames_per_block
    self.sliding_window_num_frames = self.transformer.config.arch_config.sliding_window_num_frames

    # Best-effort lookup: fall back to -1 when the transformer has no
    # ``model`` attribute or the lookup fails for any other reason.
    try:
        self.local_attn_size = getattr(self.transformer.model,
                                       "local_attn_size",
                                       -1)  # type: ignore
    except Exception:
        self.local_attn_size = -1

Functions¶

fastvideo.pipelines.stages.CausalDMDDenosingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify denoising stage inputs.

Source code in fastvideo/pipelines/stages/causal_denoising.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify denoising stage inputs.

    Registers one check per batch field, in a fixed order, and returns the
    collected ``VerificationResult``.
    """
    result = VerificationResult()
    # (field name, value, validator(s)) in the order the checks are recorded.
    checks = (
        ("latents", batch.latents, [V.is_tensor, V.with_dims(5)]),
        ("prompt_embeds", batch.prompt_embeds, V.list_not_empty),
        ("image_embeds", batch.image_embeds, V.is_list),
        ("image_latent", batch.image_latent, V.none_or_tensor_with_dims(5)),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, V.positive_float),
        ("eta", batch.eta, V.non_negative_float),
        ("generator", batch.generator, V.generator_or_list_generators),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        # Negative embeddings are only required when CFG is enabled.
        ("negative_prompt_embeds", batch.negative_prompt_embeds,
         lambda x: not batch.do_classifier_free_guidance or V.list_not_empty(
             x)),
    )
    for field_name, value, validators in checks:
        result.add_check(field_name, value, validators)
    return result

fastvideo.pipelines.stages.ConditioningStage ¶

Bases: PipelineStage

Stage for applying conditioning to the diffusion process.

This stage handles the application of conditioning, such as classifier-free guidance, to the diffusion process.

Functions¶

fastvideo.pipelines.stages.ConditioningStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Apply conditioning to the diffusion process.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with applied conditioning.

Source code in fastvideo/pipelines/stages/conditioning.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Apply conditioning to the diffusion process.

    This stage is currently a pass-through: the batch is returned unchanged
    whether or not classifier-free guidance is enabled.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The same batch object, unmodified.
    """
    # TODO: implement CFG conditioning here. An earlier draft sat behind two
    # unconditional returns and never ran. It asserted that
    # batch.negative_prompt_embeds is set, then used torch.cat to prepend the
    # negative prompt embeddings and attention masks (and their `_2`
    # variants) to the positive ones.
    # NOTE(review): verify_input validates prompt_embeds as a list, so a
    # plain torch.cat over these fields probably needs adapting -- confirm
    # before re-enabling.
    return batch

fastvideo.pipelines.stages.ConditioningStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify conditioning stage inputs.

Source code in fastvideo/pipelines/stages/conditioning.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify conditioning stage inputs.

    Side effect: when ``batch.prompt_embeds`` is empty or ``None``, this
    sets ``batch.do_classifier_free_guidance`` to ``False`` and returns an
    empty result without registering any checks.
    """
    result = VerificationResult()
    if not batch.prompt_embeds:
        # No text encoder/prompt embeddings: skip checks and effectively disable CFG.
        batch.do_classifier_free_guidance = False
        return result
    result.add_check("do_classifier_free_guidance",
                     batch.do_classifier_free_guidance, V.bool_value)
    result.add_check("guidance_scale", batch.guidance_scale,
                     V.positive_float)
    # prompt_embeds is known to be truthy past the early return above, so
    # the embedding checks always apply here.
    result.add_check("prompt_embeds", batch.prompt_embeds, V.list_not_empty)
    # Negative embeddings are only required when CFG is enabled.
    result.add_check(
        "negative_prompt_embeds", batch.negative_prompt_embeds, lambda
        x: not batch.do_classifier_free_guidance or V.list_not_empty(x))
    return result

fastvideo.pipelines.stages.ConditioningStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify conditioning stage outputs.

Source code in fastvideo/pipelines/stages/conditioning.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify conditioning stage outputs.

    Side effect: when ``batch.prompt_embeds`` is empty or ``None``, this
    sets ``batch.do_classifier_free_guidance`` to ``False`` and returns an
    empty result.
    """
    result = VerificationResult()
    # ``not x`` already covers ``None``, so no separate identity test is
    # needed.
    if not batch.prompt_embeds:
        batch.do_classifier_free_guidance = False
        return result
    # prompt_embeds is truthy here, so the check is unconditional.
    result.add_check("prompt_embeds", batch.prompt_embeds, V.list_not_empty)
    return result

fastvideo.pipelines.stages.CosmosDenoisingStage ¶

CosmosDenoisingStage(transformer, scheduler, pipeline=None)

Bases: DenoisingStage

Denoising stage for Cosmos models using FlowMatchEulerDiscreteScheduler.

Source code in fastvideo/pipelines/stages/denoising.py

def __init__(self, transformer, scheduler, pipeline=None) -> None:
    """Build the stage by delegating to ``DenoisingStage``.

    Args:
        transformer: Transformer forwarded to the base class.
        scheduler: Scheduler forwarded to the base class.
        pipeline: Optional owning pipeline forwarded to the base class.
    """
    super().__init__(transformer, scheduler, pipeline=pipeline)

Functions¶

fastvideo.pipelines.stages.CosmosDenoisingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify Cosmos denoising stage inputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify Cosmos denoising stage inputs.

    Registers one check per batch field, in a fixed order, and returns the
    collected ``VerificationResult``.
    """
    result = VerificationResult()
    # (field name, value, validator(s)) in the order the checks are recorded.
    checks = (
        ("latents", batch.latents, [V.is_tensor, V.with_dims(5)]),
        ("prompt_embeds", batch.prompt_embeds, V.list_not_empty),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, V.positive_float),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        # Negative embeddings are only required when CFG is enabled.
        ("negative_prompt_embeds", batch.negative_prompt_embeds,
         lambda x: not batch.do_classifier_free_guidance or V.list_not_empty(
             x)),
    )
    for field_name, value, validators in checks:
        result.add_check(field_name, value, validators)
    return result

fastvideo.pipelines.stages.CosmosDenoisingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify Cosmos denoising stage outputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify Cosmos denoising stage outputs.

    The only output checked is ``batch.latents``, which must pass the
    tensor and 5-dimension validators.
    """
    outcome = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, latent_validators)
    return outcome

fastvideo.pipelines.stages.CosmosLatentPreparationStage ¶

CosmosLatentPreparationStage(scheduler, transformer, vae=None)

Bases: PipelineStage

Cosmos-specific latent preparation stage that properly handles the tensor shapes and conditioning masks required by the Cosmos transformer.

This stage replicates the logic from diffusers' Cosmos2VideoToWorldPipeline.prepare_latents()

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self, scheduler, transformer, vae=None) -> None:
    """Keep references to the modules this stage works with.

    Args:
        scheduler: Stored as ``self.scheduler``.
        transformer: Stored as ``self.transformer``.
        vae: Optional VAE, stored as ``self.vae``.
    """
    super().__init__()
    self.vae = vae
    self.transformer = transformer
    self.scheduler = scheduler

Functions¶

fastvideo.pipelines.stages.CosmosLatentPreparationStage.adjust_video_length ¶

adjust_video_length(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> int

Adjust video length based on VAE version.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`int`	The number of latent frames corresponding to `batch.num_frames`.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def adjust_video_length(self, batch: ForwardBatch,
                        fastvideo_args: FastVideoArgs) -> int:
    """
    Compute the number of latent frames for the requested video length.

    Args:
        batch: The current batch information; ``num_frames`` is read.
        fastvideo_args: The inference arguments; the VAE config under
            ``pipeline_config.vae_config`` selects which formula is used.

    Returns:
        The latent frame count as an ``int``.
    """
    num_frames = batch.num_frames
    vae_config = fastvideo_args.pipeline_config.vae_config

    if not vae_config.use_temporal_scaling_frames:
        # stepvideo only
        return int(num_frames // 17 * 3)

    compression = vae_config.arch_config.temporal_compression_ratio
    return int((num_frames - 1) // compression + 1)

fastvideo.pipelines.stages.CosmosLatentPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify Cosmos latent preparation stage inputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify Cosmos latent preparation stage inputs.

    Registers one check per batch field, in a fixed order, and returns the
    collected ``VerificationResult``.
    """
    result = VerificationResult()
    # (field name, value, validator(s)) in the order the checks are recorded.
    checks = (
        # Either a prompt string/list or non-empty embeddings must be given;
        # the checked value itself is unused, hence ``None``.
        ("prompt_or_embeds", None,
         lambda _: V.string_or_list_strings(batch.prompt) or V.
         list_not_empty(batch.prompt_embeds)),
        ("prompt_embeds", batch.prompt_embeds, V.list_of_tensors),
        ("num_videos_per_prompt", batch.num_videos_per_prompt,
         V.positive_int),
        ("generator", batch.generator, V.generator_or_list_generators),
        ("num_frames", batch.num_frames, V.positive_int),
        ("height", batch.height, V.positive_int),
        ("width", batch.width, V.positive_int),
        ("latents", batch.latents, V.none_or_tensor),
    )
    for field_name, value, validators in checks:
        result.add_check(field_name, value, validators)
    return result

fastvideo.pipelines.stages.CosmosLatentPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage outputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify latent preparation stage outputs.

    Checks that ``batch.latents`` passes the tensor and 5-dimension
    validators and that ``batch.raw_latent_shape`` is a tuple.
    """
    outcome = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, latent_validators)
    outcome.add_check("raw_latent_shape", batch.raw_latent_shape, V.is_tuple)
    return outcome

fastvideo.pipelines.stages.DecodingStage ¶

DecodingStage(vae, pipeline=None)

Bases: PipelineStage

Stage for decoding latent representations into pixel space.

This stage handles the decoding of latent representations into the final output format (e.g., pixel values).

Source code in fastvideo/pipelines/stages/decoding.py

def __init__(self, vae, pipeline=None) -> None:
    """Store the VAE and an optional weak reference to the pipeline.

    Args:
        vae: The VAE used for decoding, stored as ``self.vae``.
        pipeline: Optional owning pipeline. When truthy it is kept as a
            ``weakref.ref``; otherwise ``self.pipeline`` is ``None``.
    """
    self.vae: ParallelTiledVAE = vae
    # Only a weak reference is held, so this stage does not keep the
    # pipeline object alive.
    if pipeline:
        self.pipeline = weakref.ref(pipeline)
    else:
        self.pipeline = None

Functions¶

fastvideo.pipelines.stages.DecodingStage.decode ¶

decode(latents: Tensor, fastvideo_args: FastVideoArgs) -> Tensor

Decode latent representations into pixel space using VAE.

Parameters:

Name	Type	Description	Default
`latents`	`Tensor`	Input latent tensor with shape (batch, channels, frames, height_latents, width_latents)	required
`fastvideo_args`	`FastVideoArgs`	Configuration containing: - disable_autocast: Whether to disable automatic mixed precision (default: False) - pipeline_config.vae_precision: VAE computation precision ("fp32", "fp16", "bf16") - pipeline_config.vae_tiling: Whether to enable VAE tiling for memory efficiency	required

Returns:

Type	Description
`Tensor`	Decoded video tensor with shape (batch, channels, frames, height, width), clamped to the [0, 1] range. The tensor is not moved to the CPU here; `forward` performs the CPU/float32 conversion.

Source code in fastvideo/pipelines/stages/decoding.py

@torch.no_grad()
def decode(self, latents: torch.Tensor,
           fastvideo_args: FastVideoArgs) -> torch.Tensor:
    """
    Run the VAE decoder on a latent tensor and return pixel-space frames.

    Args:
        latents: Input latent tensor with shape (batch, channels, frames, height_latents, width_latents)
        fastvideo_args: Configuration; this method reads:
            - disable_autocast: Whether to disable automatic mixed precision
            - pipeline_config.vae_precision: Key into PRECISION_TO_TYPE selecting the VAE dtype
            - pipeline_config.vae_tiling: Whether to enable VAE tiling before decoding

    Returns:
        The decoded tensor rescaled with ``x / 2 + 0.5`` and clamped to
        [0, 1]. No CPU transfer or float32 cast happens in this method.
    """
    self.vae = self.vae.to(get_local_torch_device())
    latents = latents.to(get_local_torch_device())

    # Autocast is used only for a non-fp32 VAE dtype, and only when it has
    # not been disabled through the arguments.
    vae_dtype = PRECISION_TO_TYPE[
        fastvideo_args.pipeline_config.vae_precision]
    use_autocast = (
        vae_dtype != torch.float32) and not fastvideo_args.disable_autocast

    vae_config = self.vae.config
    if (hasattr(vae_config, 'latents_mean')
            and hasattr(vae_config, 'latents_std')):
        # denormalization for MatrixGame VAE: z = z * std + mean.
        # The per-channel statistics are reshaped to broadcast over a 5-D
        # latent tensor.
        stat_kwargs = {"device": latents.device, "dtype": latents.dtype}
        mean = torch.tensor(vae_config.latents_mean,
                            **stat_kwargs).view(1, -1, 1, 1, 1)
        std = torch.tensor(vae_config.latents_std,
                           **stat_kwargs).view(1, -1, 1, 1, 1)
        latents = latents * std + mean
    elif hasattr(self.vae, 'scaling_factor'):
        # Standard VAE scaling; tensor factors are first moved to the
        # latents' device and dtype.
        scale = self.vae.scaling_factor
        if isinstance(scale, torch.Tensor):
            scale = scale.to(latents.device, latents.dtype)
        latents = latents / scale

        # Apply shifting if the VAE defines a non-None shift factor.
        shift = getattr(self.vae, "shift_factor", None)
        if shift is not None:
            if isinstance(shift, torch.Tensor):
                shift = shift.to(latents.device, latents.dtype)
            latents += shift

    # Decode latents
    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=use_autocast):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        if not use_autocast:
            # Without autocast the latents are cast to the VAE dtype
            # explicitly.
            latents = latents.to(vae_dtype)
        decoded = self.vae.decode(latents)

    # Normalize image to [0, 1] range
    return (decoded / 2 + 0.5).clamp(0, 1)

fastvideo.pipelines.stages.DecodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Decode latent representations into pixel space.

This method processes the batch through the VAE decoder, converting latent representations to pixel-space video/images. It also optionally decodes trajectory latents for visualization purposes.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch containing: - latents: Tensor to decode (batch, channels, frames, height_latents, width_latents) - return_trajectory_decoded (optional): Flag to decode trajectory latents - trajectory_latents (optional): Latents at different timesteps - trajectory_timesteps (optional): Corresponding timesteps	required
`fastvideo_args`	`FastVideoArgs`	Configuration containing: - output_type: "latent" to skip decoding, otherwise decode to pixels - vae_cpu_offload: Whether to offload VAE to CPU after decoding - model_loaded: Track VAE loading state - model_paths: Path to VAE model if loading needed	required

Returns:

Type	Description
`ForwardBatch`	Modified batch with: - output: Decoded frames (batch, channels, frames, height, width) as CPU float32 - trajectory_decoded (if requested): List of decoded frames per timestep

Source code in fastvideo/pipelines/stages/decoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Decode latent representations into pixel space.

    This method processes the batch through the VAE decoder, converting latent
    representations to pixel-space video/images. It also optionally decodes
    trajectory latents for visualization purposes.

    Args:
        batch: The current batch containing:
            - latents: Tensor to decode (batch, channels, frames, height_latents, width_latents)
            - return_trajectory_decoded (optional): Flag to decode trajectory latents
            - trajectory_latents (optional): Latents at different timesteps
            - trajectory_timesteps (optional): Corresponding timesteps
        fastvideo_args: Configuration containing:
            - output_type: "latent" to skip decoding, otherwise decode to pixels
            - vae_cpu_offload: Whether to offload VAE to CPU after decoding
            - model_loaded: Track VAE loading state
            - model_paths: Path to VAE model if loading needed

    Returns:
        Modified batch with:
            - output: Decoded frames (batch, channels, frames, height, width) as CPU float32;
              when output_type is "latent" this is the undecoded latents, also as CPU float32
            - trajectory_decoded (if requested): List of decoded frames per timestep
    """
    # self.pipeline is either None or a callable that is dereferenced once
    # here; the result may itself be falsy, which the checks below handle.
    # load vae if not already loaded (used for memory constrained devices)
    pipeline = self.pipeline() if self.pipeline else None
    if not fastvideo_args.model_loaded["vae"]:
        loader = VAELoader()
        self.vae = loader.load(fastvideo_args.model_paths["vae"],
                               fastvideo_args)
        if pipeline:
            pipeline.add_module("vae", self.vae)
        fastvideo_args.model_loaded["vae"] = True

    # "latent" output passes the latents through without running the VAE.
    if fastvideo_args.output_type == "latent":
        frames = batch.latents
    else:
        frames = self.decode(batch.latents, fastvideo_args)

    # decode trajectory latents if needed
    # NOTE: this runs independently of output_type, so trajectory latents are
    # decoded even when the main output stays in latent space.
    if batch.return_trajectory_decoded:
        batch.trajectory_decoded = []
        assert batch.trajectory_latents is not None, "batch should have trajectory latents"
        for idx in range(batch.trajectory_latents.shape[1]):
            # batch.trajectory_latents is [batch_size, timesteps, channels, frames, height, width]
            cur_latent = batch.trajectory_latents[:, idx, :, :, :, :]
            # NOTE(review): trajectory_timesteps is indexed without a None
            # check, unlike trajectory_latents above -- confirm callers
            # always set both.
            cur_timestep = batch.trajectory_timesteps[idx]
            logger.info("decoding trajectory latent for timestep: %s",
                        cur_timestep)
            decoded_frames = self.decode(cur_latent, fastvideo_args)
            batch.trajectory_decoded.append(decoded_frames.cpu().float())

    # Convert to CPU float32 for compatibility
    frames = frames.cpu().float()

    # Crop padding if this is a LongCat refinement
    # Only applies when the batch carries both bookkeeping attributes.
    if hasattr(batch, 'num_cond_frames_added') and hasattr(
            batch, 'new_frame_size_before_padding'):
        num_cond_frames_added = batch.num_cond_frames_added
        new_frame_size = batch.new_frame_size_before_padding
        if num_cond_frames_added > 0 or frames.shape[2] != new_frame_size:
            # frames is [B, C, T, H, W], crop temporal dimension
            frames = frames[:, :,
                            num_cond_frames_added:num_cond_frames_added +
                            new_frame_size, :, :]
            logger.info(
                "Cropped LongCat refinement padding: %s:%s, final shape: %s",
                num_cond_frames_added,
                num_cond_frames_added + new_frame_size, frames.shape)

    # Update batch with decoded image
    batch.output = frames

    # Offload models if needed
    # The hook is optional: it is only invoked when this stage has it.
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    if fastvideo_args.vae_cpu_offload:
        self.vae.to("cpu")

    # When an MPS backend is available, the VAE is dropped from this stage
    # and from the pipeline's modules, and marked unloaded so that the next
    # call reloads it through the branch at the top of this method.
    if torch.backends.mps.is_available():
        del self.vae
        if pipeline is not None and "vae" in pipeline.modules:
            del pipeline.modules["vae"]
        fastvideo_args.model_loaded["vae"] = False

    return batch

fastvideo.pipelines.stages.DecodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify decoding stage inputs.

Source code in fastvideo/pipelines/stages/decoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify decoding stage inputs.

    The only input checked is ``batch.latents``, which must pass the tensor
    and 5-dimension validators.
    """
    outcome = VerificationResult()
    # Denoised latents for VAE decoding: [batch_size, channels, frames, height_latents, width_latents]
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, latent_validators)
    return outcome

fastvideo.pipelines.stages.DecodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify decoding stage outputs.

Source code in fastvideo/pipelines/stages/decoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify decoding stage outputs.

    The only output checked is ``batch.output``, which must pass the tensor
    and 5-dimension validators.
    """
    outcome = VerificationResult()
    # Decoded video/images: [batch_size, channels, frames, height, width]
    output_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("output", batch.output, output_validators)
    return outcome

fastvideo.pipelines.stages.DenoisingStage ¶

DenoisingStage(transformer, scheduler, pipeline=None, transformer_2=None, vae=None)

Bases: PipelineStage

Stage for running the denoising loop in diffusion pipelines.

This stage handles the iterative denoising process that transforms the initial noise into the final output.

Source code in fastvideo/pipelines/stages/denoising.py

def __init__(self,
             transformer,
             scheduler,
             pipeline=None,
             transformer_2=None,
             vae=None) -> None:
    """Store the stage's modules and select an attention backend.

    Args:
        transformer: Primary transformer; ``hidden_size`` and
            ``num_attention_heads`` are read to derive the head size.
        scheduler: Stored as ``self.scheduler``.
        pipeline: Optional owning pipeline, kept as a ``weakref.ref`` when
            truthy and as ``None`` otherwise.
        transformer_2: Optional second transformer.
        vae: Optional VAE.
    """
    super().__init__()
    self.transformer = transformer
    self.transformer_2 = transformer_2
    self.scheduler = scheduler
    self.vae = vae
    self.pipeline = weakref.ref(pipeline) if pipeline else None

    head_size = (self.transformer.hidden_size //
                 self.transformer.num_attention_heads)
    # Backends this stage offers to the selector.  # hack
    candidate_backends = (
        AttentionBackendEnum.SLIDING_TILE_ATTN,
        AttentionBackendEnum.VIDEO_SPARSE_ATTN,
        AttentionBackendEnum.VMOBA_ATTN,
        AttentionBackendEnum.FLASH_ATTN,
        AttentionBackendEnum.TORCH_SDPA,
        AttentionBackendEnum.SAGE_ATTN_THREE,
    )
    self.attn_backend = get_attn_backend(
        head_size=head_size,
        dtype=torch.float16,  # TODO(will): hack
        supported_attention_backends=candidate_backends,
    )

Functions¶

fastvideo.pipelines.stages.DenoisingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Run the denoising loop.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with denoised latents.

Source code in fastvideo/pipelines/stages/denoising.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run the denoising loop.

    For every entry of ``batch.timesteps`` this method selects the active
    transformer, assembles the model input, runs the conditional forward
    pass (plus an unconditional pass when
    ``batch.do_classifier_free_guidance`` is set) and advances the latents
    with ``self.scheduler.step``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with denoised latents stored in ``batch.latents``. When
        trajectory latents were collected, ``batch.trajectory_latents`` and
        ``batch.trajectory_timesteps`` are set as CPU tensors as well.

    Raises:
        ValueError: If ``batch.timesteps`` is None.
    """
    # self.pipeline is either None or a weak reference; calling it yields the
    # pipeline object (or None).
    pipeline = self.pipeline() if self.pipeline else None
    # Load the transformer on demand when it is flagged as not loaded. The MPS
    # cleanup at the end of this method clears that flag again.
    if not fastvideo_args.model_loaded["transformer"]:
        loader = TransformerLoader()
        self.transformer = loader.load(
            fastvideo_args.model_paths["transformer"], fastvideo_args)
        if pipeline:
            pipeline.add_module("transformer", self.transformer)
        fastvideo_args.model_loaded["transformer"] = True

    # Prepare extra step kwargs for scheduler
    # (only the keys that scheduler.step actually accepts are kept)
    extra_step_kwargs = self.prepare_extra_func_kwargs(
        self.scheduler.step,
        {
            "generator": batch.generator,
            "eta": batch.eta
        },
    )

    # Setup precision and autocast settings
    # TODO(will): make the precision configurable for inference
    # target_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]
    # bfloat16 is currently hard-coded, so autocast is on unless explicitly
    # disabled through fastvideo_args.disable_autocast.
    target_dtype = torch.bfloat16
    autocast_enabled = (target_dtype != torch.float32
                        ) and not fastvideo_args.disable_autocast

    # Get timesteps and calculate warmup steps
    timesteps = batch.timesteps
    # TODO(will): remove this once we add input/output validation for stages
    if timesteps is None:
        raise ValueError("Timesteps must be provided")
    num_inference_steps = batch.num_inference_steps
    # Only used below to decide when the progress bar is advanced.
    num_warmup_steps = len(
        timesteps) - num_inference_steps * self.scheduler.order

    # Prepare image latents and embeddings for I2V generation
    image_embeds = batch.image_embeds
    if len(image_embeds) > 0:
        # Only the first embedding is checked for NaNs.
        assert not torch.isnan(
            image_embeds[0]).any(), "image_embeds contains nan"
        image_embeds = [
            image_embed.to(target_dtype) for image_embed in image_embeds
        ]

    # The kwargs groups below are filtered against transformer.forward, so a
    # model that does not declare a given parameter simply does not receive it.
    image_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_image": image_embeds,
            "mask_strategy": dict_to_3d_list(
                None, t_max=50, l_max=60, h_max=24)
        },
    )

    # Extra conditioning for the positive (conditional) pass.
    pos_cond_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_2": batch.clip_embedding_pos,
            "encoder_attention_mask": batch.prompt_attention_mask,
        },
    )

    # Extra conditioning for the negative (unconditional) pass.
    neg_cond_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_2": batch.clip_embedding_neg,
            "encoder_attention_mask": batch.negative_attention_mask,
        },
    )

    # Action conditioning, passed to both the positive and negative pass.
    action_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "mouse_cond": batch.mouse_cond,
            "keyboard_cond": batch.keyboard_cond,
        },
    )

    # Prepare STA parameters
    if st_attn_available and self.attn_backend == SlidingTileAttentionBackend:
        self.prepare_sta_param(batch, fastvideo_args)

    # Get latents and embeddings
    latents = batch.latents
    prompt_embeds = batch.prompt_embeds
    assert not torch.isnan(
        prompt_embeds[0]).any(), "prompt_embeds contains nan"
    # neg_prompt_embeds is only bound (and only used) when CFG is enabled.
    if batch.do_classifier_free_guidance:
        neg_prompt_embeds = batch.negative_prompt_embeds
        assert neg_prompt_embeds is not None
        assert not torch.isnan(
            neg_prompt_embeds[0]).any(), "neg_prompt_embeds contains nan"

    # (Wan2.2) Calculate timestep to switch from high noise expert to low noise expert
    # A per-batch boundary ratio, when given, overrides the configured one.
    boundary_ratio = fastvideo_args.pipeline_config.dit_config.boundary_ratio
    if batch.boundary_ratio is not None:
        logger.info("Overriding boundary ratio from %s to %s",
                    boundary_ratio, batch.boundary_ratio)
        boundary_ratio = batch.boundary_ratio

    # boundary_timestep is None when no expert switching is configured; in
    # that case self.transformer is used for every step.
    if boundary_ratio is not None:
        boundary_timestep = boundary_ratio * self.scheduler.num_train_timesteps
    else:
        boundary_timestep = None
    latent_model_input = latents.to(target_dtype)
    assert latent_model_input.shape[0] == 1, "only support batch size 1"

    if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
        # TI2V directly replaces the first frame of the latent with
        # the image latent instead of appending along the channel dim
        assert batch.image_latent is None, "TI2V task should not have image latents"
        assert self.vae is not None, "VAE is not provided for TI2V task"
        # Encode the conditioning image and normalise it with the VAE's
        # shift (when present) and scaling factors.
        z = self.vae.encode(batch.pil_image).mean.float()
        if (hasattr(self.vae, "shift_factor")
                and self.vae.shift_factor is not None):
            if isinstance(self.vae.shift_factor, torch.Tensor):
                z -= self.vae.shift_factor.to(z.device, z.dtype)
            else:
                z -= self.vae.shift_factor

        if isinstance(self.vae.scaling_factor, torch.Tensor):
            z = z * self.vae.scaling_factor.to(z.device, z.dtype)
        else:
            z = z * self.vae.scaling_factor

        latent_model_input = latent_model_input.squeeze(0)
        _, mask2 = masks_like([latent_model_input], zero=True)

        # Blend: positions where mask2 is 0 take the image latent z, positions
        # where it is 1 keep the noisy latent.
        # NOTE(review): the explicit unsqueeze below is commented out; the
        # batch dim presumably comes back via broadcasting against z — confirm.
        latent_model_input = (1. -
                              mask2[0]) * z + mask2[0] * latent_model_input
        # latent_model_input = latent_model_input.unsqueeze(0)
        latent_model_input = latent_model_input.to(get_local_torch_device())
        latents = latent_model_input
        F = batch.num_frames
        temporal_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_temporal
        spatial_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_spatial
        patch_size = fastvideo_args.pipeline_config.dit_config.arch_config.patch_size
        # NOTE(review): looks like the token count of the patchified latent
        # (latent frames x latent height x latent width / spatial patch area);
        # used below to pad the per-token timestep vector — confirm.
        seq_len = ((F - 1) // temporal_scale +
                   1) * (batch.height // spatial_scale) * (
                       batch.width // spatial_scale) // (patch_size[1] *
                                                         patch_size[2])

    # Initialize lists for ODE trajectory
    trajectory_timesteps: list[torch.Tensor] = []
    trajectory_latents: list[torch.Tensor] = []

    # Run denoising loop
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # Skip if interrupted
            # (uses `continue`, so the remaining timesteps are still iterated
            # but no work is done and the progress bar is not advanced)
            if hasattr(self, 'interrupt') and self.interrupt:
                continue

            # Choose the model for this step. With CPU offload enabled, the
            # model that is not used is moved to the CPU if it sits on CUDA.
            if boundary_timestep is None or t >= boundary_timestep:
                if (fastvideo_args.dit_cpu_offload
                        and self.transformer_2 is not None and next(
                            self.transformer_2.parameters()).device.type
                        == 'cuda'):
                    self.transformer_2.to('cpu')
                current_model = self.transformer
                current_guidance_scale = batch.guidance_scale
            else:
                # low-noise stage in wan2.2
                if fastvideo_args.dit_cpu_offload and next(
                        self.transformer.parameters(
                        )).device.type == 'cuda':
                    self.transformer.to('cpu')
                current_model = self.transformer_2
                current_guidance_scale = batch.guidance_scale_2
            assert current_model is not None, "current_model is None"

            # Expand latents for V2V/I2V
            # (conditioning latents are concatenated along dim 1)
            latent_model_input = latents.to(target_dtype)
            if batch.video_latent is not None:
                latent_model_input = torch.cat([
                    latent_model_input, batch.video_latent,
                    torch.zeros_like(latents)
                ],
                                               dim=1).to(target_dtype)
            elif batch.image_latent is not None:
                assert not fastvideo_args.pipeline_config.ti2v_task, "image latents should not be provided for TI2V task"
                latent_model_input = torch.cat(
                    [latent_model_input, batch.image_latent],
                    dim=1).to(target_dtype)

            assert not torch.isnan(
                latent_model_input).any(), "latent_model_input contains nan"
            if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
                # TI2V: build a timestep vector from the mask, so positions
                # where mask2 is 0 (the image-conditioned ones) get timestep 0,
                # then pad it with the current timestep up to seq_len.
                # NOTE(review): the ::2 strides presumably mirror a spatial
                # patch size of 2 — confirm against patch_size.
                timestep = torch.stack([t]).to(get_local_torch_device())
                temp_ts = (mask2[0][0][:, ::2, ::2] * timestep).flatten()
                temp_ts = torch.cat([
                    temp_ts,
                    temp_ts.new_ones(seq_len - temp_ts.size(0)) * timestep
                ])
                timestep = temp_ts.unsqueeze(0)
                t_expand = timestep.repeat(latent_model_input.shape[0], 1)
            else:
                t_expand = t.repeat(latent_model_input.shape[0])

            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t)

            # Prepare inputs for transformer
            # guidance_expand is None unless embedded_cfg_scale is configured;
            # otherwise it is one value per batch element, scaled by 1000.
            guidance_expand = (
                torch.tensor(
                    [fastvideo_args.pipeline_config.embedded_cfg_scale] *
                    latent_model_input.shape[0],
                    dtype=torch.float32,
                    device=get_local_torch_device(),
                ).to(target_dtype) *
                1000.0 if fastvideo_args.pipeline_config.embedded_cfg_scale
                is not None else None)

            # Predict noise residual
            with torch.autocast(device_type="cuda",
                                dtype=target_dtype,
                                enabled=autocast_enabled):
                # Build backend-specific attention metadata for this step.
                # It stays None for backends without a metadata builder.
                if (st_attn_available
                        and self.attn_backend == SlidingTileAttentionBackend
                    ) or (vsa_available and self.attn_backend
                          == VideoSparseAttentionBackend):
                    self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls(
                    )

                    if self.attn_metadata_builder_cls is not None:
                        self.attn_metadata_builder = self.attn_metadata_builder_cls(
                        )
                        # TODO(will): clean this up
                        attn_metadata = self.attn_metadata_builder.build(  # type: ignore
                            current_timestep=i,  # type: ignore
                            raw_latent_shape=batch.
                            raw_latent_shape[2:5],  # type: ignore
                            patch_size=fastvideo_args.
                            pipeline_config.  # type: ignore
                            dit_config.patch_size,  # type: ignore
                            STA_param=batch.STA_param,  # type: ignore
                            VSA_sparsity=fastvideo_args.
                            VSA_sparsity,  # type: ignore
                            device=get_local_torch_device(),
                        )
                        assert attn_metadata is not None, "attn_metadata cannot be None"
                    else:
                        attn_metadata = None
                elif (vmoba_attn_available
                      and self.attn_backend == VMOBAAttentionBackend):
                    self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls(
                    )
                    if self.attn_metadata_builder_cls is not None:
                        self.attn_metadata_builder = self.attn_metadata_builder_cls(
                        )
                        # Prepare V-MoBA parameters from config
                        # (a copy, so fastvideo_args.moba_config is not mutated)
                        moba_params = fastvideo_args.moba_config.copy()
                        moba_params.update({
                            "current_timestep":
                            i,
                            "raw_latent_shape":
                            batch.raw_latent_shape[2:5],
                            "patch_size":
                            fastvideo_args.pipeline_config.dit_config.
                            patch_size,
                            "device":
                            get_local_torch_device(),
                        })
                        attn_metadata = self.attn_metadata_builder.build(
                            **moba_params)
                        assert attn_metadata is not None, "attn_metadata cannot be None"
                    else:
                        attn_metadata = None
                else:
                    attn_metadata = None
                # TODO(will): finalize the interface. vLLM uses this to
                # support torch dynamo compilation. They pass in
                # attn_metadata, vllm_config, and num_tokens. We can pass in
                # fastvideo_args or training_args, and attn_metadata.
                # Conditional (positive prompt) pass.
                batch.is_cfg_negative = False
                with set_forward_context(
                        current_timestep=i,
                        attn_metadata=attn_metadata,
                        forward_batch=batch,
                        # fastvideo_args=fastvideo_args
                ):
                    # Run transformer
                    noise_pred = current_model(
                        latent_model_input,
                        prompt_embeds,
                        t_expand,
                        guidance=guidance_expand,
                        **image_kwargs,
                        **pos_cond_kwargs,
                        **action_kwargs,
                    )

                if batch.do_classifier_free_guidance:
                    # Unconditional (negative prompt) pass; the flag on the
                    # batch marks it as such for the forward context.
                    batch.is_cfg_negative = True
                    with set_forward_context(
                            current_timestep=i,
                            attn_metadata=attn_metadata,
                            forward_batch=batch,
                    ):
                        noise_pred_uncond = current_model(
                            latent_model_input,
                            neg_prompt_embeds,
                            t_expand,
                            guidance=guidance_expand,
                            **image_kwargs,
                            **neg_cond_kwargs,
                            **action_kwargs,
                        )

                    # Classifier-free guidance combination:
                    # uncond + scale * (cond - uncond).
                    noise_pred_text = noise_pred
                    noise_pred = noise_pred_uncond + current_guidance_scale * (
                        noise_pred_text - noise_pred_uncond)

                    # Apply guidance rescale if needed
                    if batch.guidance_rescale > 0.0:
                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                        noise_pred = self.rescale_noise_cfg(
                            noise_pred,
                            noise_pred_text,
                            guidance_rescale=batch.guidance_rescale,
                        )
                # Compute the previous noisy sample
                latents = self.scheduler.step(noise_pred,
                                              t,
                                              latents,
                                              **extra_step_kwargs,
                                              return_dict=False)[0]
                if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
                    # Re-impose the image latent after every step so the
                    # image-conditioned positions are never overwritten.
                    latents = latents.squeeze(0)
                    latents = (1. - mask2[0]) * z + mask2[0] * latents
                    # latents = latents.unsqueeze(0)

            # save trajectory latents if needed
            if batch.return_trajectory_latents:
                trajectory_timesteps.append(t)
                trajectory_latents.append(latents)

            # Update progress bar
            # (always on the last step; otherwise once warmup is over and a
            # full scheduler-order group of steps has completed)
            if i == len(timesteps) - 1 or (
                (i + 1) > num_warmup_steps and
                (i + 1) % self.scheduler.order == 0
                    and progress_bar is not None):
                progress_bar.update()

    # Stack the collected trajectory, if any: latents along dim 1, timesteps
    # along dim 0. No cross-rank gather happens here.
    trajectory_tensor: torch.Tensor | None = None
    if trajectory_latents:
        trajectory_tensor = torch.stack(trajectory_latents, dim=1)
        trajectory_timesteps_tensor = torch.stack(trajectory_timesteps,
                                                  dim=0)
    else:
        trajectory_tensor = None
        trajectory_timesteps_tensor = None

    # Trajectory tensors are moved to the CPU before being stored on the batch.
    if trajectory_tensor is not None and trajectory_timesteps_tensor is not None:
        batch.trajectory_timesteps = trajectory_timesteps_tensor.cpu()
        batch.trajectory_latents = trajectory_tensor.cpu()

    # Update batch with final latents
    batch.latents = latents

    # Save STA mask search results if needed
    if st_attn_available and self.attn_backend == SlidingTileAttentionBackend and fastvideo_args.STA_mode == STA_Mode.STA_SEARCHING:
        self.save_sta_search_results(batch)

    # deallocate transformer if on mps
    # Drops the stage's and the pipeline's references and clears the loaded
    # flag, so the next forward() call reloads the transformer.
    if torch.backends.mps.is_available():
        logger.info("Memory before deallocating transformer: %s",
                    torch.mps.current_allocated_memory())
        del self.transformer
        if pipeline is not None and "transformer" in pipeline.modules:
            del pipeline.modules["transformer"]
        fastvideo_args.model_loaded["transformer"] = False
        logger.info("Memory after deallocating transformer: %s",
                    torch.mps.current_allocated_memory())

    return batch

fastvideo.pipelines.stages.DenoisingStage.prepare_extra_func_kwargs ¶

prepare_extra_func_kwargs(func, kwargs) -> dict[str, Any]

Prepare extra kwargs for the scheduler step / denoise step.

Parameters:

Name	Type	Description	Default
`func`		The function to prepare kwargs for.	required
`kwargs`		The kwargs to prepare.	required

Returns:

Type	Description
`dict[str, Any]`	The prepared kwargs.

Source code in fastvideo/pipelines/stages/denoising.py

def prepare_extra_func_kwargs(self, func, kwargs) -> dict[str, Any]:
    """
    Prepare extra kwargs for the scheduler step / denoise step.

    Keeps only the entries of ``kwargs`` whose key is the name of a
    parameter in ``func``'s signature. A ``**kwargs`` catch-all on ``func``
    does not make arbitrary keys acceptable; the key must match a parameter
    name exactly.

    Args:
        func: The function to prepare kwargs for.
        kwargs: The kwargs to prepare.

    Returns:
        The prepared kwargs, in the iteration order of ``kwargs``.
    """
    # Nothing to filter: skip introspection so a non-introspectable callable
    # is still fine when no kwargs are offered.
    if not kwargs:
        return {}

    # Resolve the signature once instead of once per key.
    accepted_names = inspect.signature(func).parameters
    return {
        name: value
        for name, value in kwargs.items() if name in accepted_names
    }

fastvideo.pipelines.stages.DenoisingStage.prepare_sta_param ¶

prepare_sta_param(batch: ForwardBatch, fastvideo_args: FastVideoArgs)

Prepare Sliding Tile Attention (STA) parameters and settings.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Source code in fastvideo/pipelines/stages/denoising.py

def prepare_sta_param(self, batch: ForwardBatch,
                      fastvideo_args: FastVideoArgs):
    """
    Prepare Sliding Tile Attention (STA) parameters and settings.

    Builds the STA configuration for ``fastvideo_args.STA_mode`` through
    ``configure_sta`` and stores it in ``batch.STA_param``. Also resets
    ``batch.mask_search_final_result_pos`` and
    ``batch.mask_search_final_result_neg`` to one empty list per timestep.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Raises:
        ValueError: If ``batch.timesteps`` is None, if inference mode is
            selected but ``FASTVIDEO_ATTENTION_CONFIG`` is not set, or if
            ``STA_mode`` is not one of the supported modes.
        NotImplementedError: If a search/tuning mode is requested for a
            resolution other than 69x768x1280.
    """
    # TODO(kevin): STA mask search, currently only support Wan2.1 with 69x768x1280
    from fastvideo.attention.backends.STA_configuration import configure_sta
    STA_mode = fastvideo_args.STA_mode
    skip_time_steps = fastvideo_args.skip_time_steps
    if batch.timesteps is None:
        raise ValueError("Timesteps must be provided")
    timesteps_num = batch.timesteps.shape[0]

    logger.info("STA_mode: %s", STA_mode)
    # NOTE(review): this compares against a plain string while every other
    # check uses STA_Mode members; it only works if STA_Mode members compare
    # equal to their string values — confirm.
    if (batch.num_frames, batch.height,
            batch.width) != (69, 768, 1280) and STA_mode != "STA_inference":
        raise NotImplementedError(
            "STA mask search/tuning is not supported for this resolution")

    # Mask candidates are only needed (and only defined) for the search and
    # tuning modes.
    if STA_mode == STA_Mode.STA_SEARCHING or STA_mode == STA_Mode.STA_TUNING or STA_mode == STA_Mode.STA_TUNING_CFG:
        size = (batch.width, batch.height)
        if size == (1280, 768):
            # TODO: make it configurable
            sparse_mask_candidates_searching = [
                "3, 1, 10", "1, 5, 7", "3, 3, 3", "1, 6, 5", "1, 3, 10",
                "3, 6, 1"
            ]
            sparse_mask_candidates_tuning = [
                "3, 1, 10", "1, 5, 7", "3, 3, 3", "1, 6, 5", "1, 3, 10",
                "3, 6, 1"
            ]
            full_mask = ["3,6,10"]
        else:
            raise NotImplementedError(
                "STA mask search is not supported for this resolution")
    layer_num = self.transformer.config.num_layers
    # specific for HunyuanVideo
    if hasattr(self.transformer.config, "num_single_layers"):
        layer_num += self.transformer.config.num_single_layers
    head_num = self.transformer.config.num_attention_heads

    if STA_mode == STA_Mode.STA_SEARCHING:
        STA_param = configure_sta(
            mode=STA_Mode.STA_SEARCHING,
            layer_num=layer_num,
            head_num=head_num,
            time_step_num=timesteps_num,
            mask_candidates=sparse_mask_candidates_searching +
            full_mask,  # last is full mask; Can add more sparse masks while keep last one as full mask
        )
    elif STA_mode == STA_Mode.STA_TUNING:
        STA_param = configure_sta(
            mode=STA_Mode.STA_TUNING,
            layer_num=layer_num,
            head_num=head_num,
            time_step_num=timesteps_num,
            mask_search_files_path=
            f'output/mask_search_result_pos_{size[0]}x{size[1]}/',
            mask_candidates=sparse_mask_candidates_tuning,
            full_attention_mask=[int(x) for x in full_mask[0].split(',')],
            skip_time_steps=
            skip_time_steps,  # presumably the number of initial steps kept at full attention — confirm
            save_dir=
            f'output/mask_search_strategy_{size[0]}x{size[1]}/',  # Custom save directory
            timesteps=timesteps_num)
    elif STA_mode == STA_Mode.STA_TUNING_CFG:
        STA_param = configure_sta(
            mode=STA_Mode.STA_TUNING_CFG,
            layer_num=layer_num,
            head_num=head_num,
            time_step_num=timesteps_num,
            mask_search_files_path_pos=
            f'output/mask_search_result_pos_{size[0]}x{size[1]}/',
            mask_search_files_path_neg=
            f'output/mask_search_result_neg_{size[0]}x{size[1]}/',
            mask_candidates=sparse_mask_candidates_tuning,
            full_attention_mask=[int(x) for x in full_mask[0].split(',')],
            skip_time_steps=skip_time_steps,
            save_dir=f'output/mask_search_strategy_{size[0]}x{size[1]}/',
            timesteps=timesteps_num)
    elif STA_mode == STA_Mode.STA_INFERENCE:
        import fastvideo.envs as envs
        config_file = envs.FASTVIDEO_ATTENTION_CONFIG
        if config_file is None:
            raise ValueError("FASTVIDEO_ATTENTION_CONFIG is not set")
        STA_param = configure_sta(mode=STA_Mode.STA_INFERENCE,
                                  layer_num=layer_num,
                                  head_num=head_num,
                                  time_step_num=timesteps_num,
                                  load_path=config_file)
    else:
        # Without this branch an unrecognised mode would leave STA_param
        # unbound and fail below with an opaque UnboundLocalError.
        raise ValueError(f"Unsupported STA_mode: {STA_mode}")

    batch.STA_param = STA_param
    # One independent result list per timestep for both CFG branches.
    batch.mask_search_final_result_pos = [[] for _ in range(timesteps_num)]
    batch.mask_search_final_result_neg = [[] for _ in range(timesteps_num)]

fastvideo.pipelines.stages.DenoisingStage.progress_bar ¶

progress_bar(iterable: Iterable | None = None, total: int | None = None) -> tqdm

Create a progress bar for the denoising process.

Parameters:

Name	Type	Description	Default
`iterable`	`Iterable \| None`	The iterable to iterate over.	`None`
`total`	`int \| None`	The total number of items.	`None`

Returns:

Type	Description
`tqdm`	A tqdm progress bar.

Source code in fastvideo/pipelines/stages/denoising.py

def progress_bar(self,
                 iterable: Iterable | None = None,
                 total: int | None = None) -> tqdm:
    """
    Create a progress bar for the denoising process.

    The bar is only active on local rank 0; every other rank receives a
    disabled tqdm instance.

    Args:
        iterable: The iterable to iterate over.
        total: The total number of items.

    Returns:
        A tqdm progress bar.
    """
    is_primary_rank = get_world_group().local_rank == 0
    return tqdm(iterable=iterable, total=total, disable=not is_primary_rank)

fastvideo.pipelines.stages.DenoisingStage.rescale_noise_cfg ¶

rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0) -> Tensor

Rescale noise prediction according to guidance_rescale.

Based on findings of "Common Diffusion Noise Schedules and Sample Steps are Flawed" (https://arxiv.org/pdf/2305.08891.pdf), Section 3.4.

Parameters:

Name	Description	Default
`noise_cfg`	The noise prediction with guidance.	required
`noise_pred_text`	The text-conditioned noise prediction.	required
`guidance_rescale`	The guidance rescale factor.	`0.0`

Returns:

Type	Description
`Tensor`	The rescaled noise prediction.

Source code in fastvideo/pipelines/stages/denoising.py

def rescale_noise_cfg(self,
                      noise_cfg,
                      noise_pred_text,
                      guidance_rescale=0.0) -> torch.Tensor:
    """
    Rescale noise prediction according to guidance_rescale.

    Based on findings of "Common Diffusion Noise Schedules and Sample Steps are Flawed"
    (https://arxiv.org/pdf/2305.08891.pdf), Section 3.4.

    Args:
        noise_cfg: The noise prediction with guidance.
        noise_pred_text: The text-conditioned noise prediction.
        guidance_rescale: The guidance rescale factor. ``0.0`` returns the
            guided prediction's values unchanged.

    Returns:
        The rescaled noise prediction.
    """

    def _per_sample_std(tensor):
        # Standard deviation over every dimension except the leading one.
        non_leading_dims = list(range(1, tensor.ndim))
        return tensor.std(dim=non_leading_dims, keepdim=True)

    text_std = _per_sample_std(noise_pred_text)
    cfg_std = _per_sample_std(noise_cfg)

    # Match the guided prediction's spread to the text-conditioned one
    # (fixes overexposure).
    rescaled = noise_cfg * (text_std / cfg_std)

    # Interpolate between the rescaled and the original guided prediction.
    blended = (guidance_rescale * rescaled +
               (1 - guidance_rescale) * noise_cfg)
    return blended

fastvideo.pipelines.stages.DenoisingStage.save_sta_search_results ¶

save_sta_search_results(batch: ForwardBatch)

Save the STA mask search results.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required

Source code in fastvideo/pipelines/stages/denoising.py

def save_sta_search_results(self, batch: ForwardBatch):
    """
    Save the STA mask search results.

    Writes the positive and then the negative mask search results of
    ``batch`` through ``save_mask_search_results``. A result set is skipped
    when it is None or when ``batch.prompt`` is None.

    Args:
        batch: The current batch information.

    Raises:
        NotImplementedError: If the batch resolution is not 1280x768.
    """
    size = (batch.width, batch.height)
    if size != (1280, 768):
        raise NotImplementedError(
            "STA mask search is not supported for this resolution")

    # TODO: make it configurable
    sparse_mask_candidates_searching = [
        "3, 1, 10", "1, 5, 7", "3, 3, 3", "1, 6, 5", "1, 3, 10",
        "3, 6, 1"
    ]

    from fastvideo.attention.backends.STA_configuration import save_mask_search_results

    # (batch attribute holding the results, directory they are written to)
    targets = (
        ("mask_search_final_result_pos",
         f'output/mask_search_result_pos_{size[0]}x{size[1]}/'),
        ("mask_search_final_result_neg",
         f'output/mask_search_result_neg_{size[0]}x{size[1]}/'),
    )
    for attr_name, output_dir in targets:
        # Read the attribute lazily so the negative results are only looked
        # up after the positive ones have been handled.
        layer_results = getattr(batch, attr_name)
        if layer_results is None or batch.prompt is None:
            continue
        save_mask_search_results(
            [dict(layer_data) for layer_data in layer_results],
            prompt=str(batch.prompt),
            mask_strategies=sparse_mask_candidates_searching,
            output_dir=output_dir,
        )

fastvideo.pipelines.stages.DenoisingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify denoising stage inputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify denoising stage inputs.

    Args:
        batch: The batch whose fields are checked.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult with one check per inspected batch field.
    """

    def _negative_embeds_ok(value):
        # Negative prompt embeddings are only required under CFG.
        if not batch.do_classifier_free_guidance:
            return True
        return V.list_not_empty(value)

    result = VerificationResult()
    # (field name, value, validator or list of validators), in check order.
    checks = (
        ("timesteps", batch.timesteps, [V.is_tensor, V.min_dims(1)]),
        ("latents", batch.latents, [V.is_tensor, V.with_dims(5)]),
        ("prompt_embeds", batch.prompt_embeds, V.list_not_empty),
        ("image_embeds", batch.image_embeds, V.is_list),
        ("image_latent", batch.image_latent, V.none_or_tensor_with_dims(5)),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, V.positive_float),
        ("eta", batch.eta, V.non_negative_float),
        ("generator", batch.generator, V.generator_or_list_generators),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        ("negative_prompt_embeds", batch.negative_prompt_embeds,
         _negative_embeds_ok),
    )
    for field_name, field_value, validators in checks:
        result.add_check(field_name, field_value, validators)
    return result

fastvideo.pipelines.stages.DenoisingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify denoising stage outputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify denoising stage outputs.

    Args:
        batch: The batch produced by the stage.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult holding the check on ``batch.latents``.
    """
    outcome = VerificationResult()
    # The latents must be a tensor with exactly five dimensions.
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, latent_validators)
    return outcome

fastvideo.pipelines.stages.DmdDenoisingStage ¶

DmdDenoisingStage(transformer, scheduler)

Bases: DenoisingStage

Denoising stage for DMD.

Source code in fastvideo/pipelines/stages/denoising.py

def __init__(self, transformer, scheduler) -> None:
    """
    Set up the DMD denoising stage.

    Args:
        transformer: The denoising model, forwarded to the base class.
        scheduler: Forwarded to the base class, but the stage's scheduler
            is then replaced by a fresh FlowMatchEulerDiscreteScheduler.
    """
    super().__init__(transformer=transformer, scheduler=scheduler)
    # The scheduler passed in is overwritten with a fixed shift=8.0 scheduler.
    self.scheduler = FlowMatchEulerDiscreteScheduler(shift=8.0)

Functions¶

fastvideo.pipelines.stages.DmdDenoisingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Run the denoising loop.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with denoised latents.

Source code in fastvideo/pipelines/stages/denoising.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run the DMD denoising loop.

    The loop iterates over
    ``fastvideo_args.pipeline_config.dmd_denoising_steps``. ``batch.timesteps``
    must be present, but it is only used to derive the warmup-step count
    for progress reporting. On every step the transformer predicts noise,
    the prediction is converted to a video estimate with
    ``pred_noise_to_pred_video`` and, except on the last step, the estimate
    is re-noised to the next timestep with ``self.scheduler.add_noise``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with denoised latents.

    Raises:
        ValueError: If ``batch.timesteps`` is None.
    """
    # Setup precision and autocast settings
    # TODO(will): make the precision configurable for inference
    # target_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]
    target_dtype = torch.bfloat16
    autocast_enabled = (target_dtype != torch.float32
                        ) and not fastvideo_args.disable_autocast

    # Get timesteps and calculate warmup steps
    timesteps = batch.timesteps

    # TODO(will): remove this once we add input/output validation for stages
    if timesteps is None:
        raise ValueError("Timesteps must be provided")
    num_inference_steps = batch.num_inference_steps
    num_warmup_steps = len(
        timesteps) - num_inference_steps * self.scheduler.order

    # Prepare image latents and embeddings for I2V generation
    image_embeds = batch.image_embeds
    if len(image_embeds) > 0:
        assert torch.isnan(image_embeds[0]).sum() == 0
        image_embeds = [
            image_embed.to(target_dtype) for image_embed in image_embeds
        ]

    image_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_image": image_embeds,
            "mask_strategy": dict_to_3d_list(
                None, t_max=50, l_max=60, h_max=24)
        },
    )

    pos_cond_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_2": batch.clip_embedding_pos,
            "encoder_attention_mask": batch.prompt_attention_mask,
        },
    )

    # Prepare STA parameters
    if st_attn_available and self.attn_backend == SlidingTileAttentionBackend:
        self.prepare_sta_param(batch, fastvideo_args)

    # Get latents and embeddings
    assert batch.latents is not None, "latents must be provided"
    latents = batch.latents

    video_raw_latent_shape = latents.shape
    prompt_embeds = batch.prompt_embeds
    assert not torch.isnan(
        prompt_embeds[0]).any(), "prompt_embeds contains nan"
    # From here on the loop uses the configured DMD steps, not
    # batch.timesteps.
    timesteps = torch.tensor(
        fastvideo_args.pipeline_config.dmd_denoising_steps,
        dtype=torch.long,
        device=get_local_torch_device())

    # Run denoising loop
    with self.progress_bar(total=len(timesteps)) as progress_bar:
        for i, t in enumerate(timesteps):
            # Skip if interrupted
            if hasattr(self, 'interrupt') and self.interrupt:
                continue
            # Keep an unmodified copy of the current latents; it is the
            # noisy input handed to pred_noise_to_pred_video below.
            noise_latents = latents.clone()
            latent_model_input = latents.to(target_dtype)

            # Append the conditioning image latent for I2V
            if batch.image_latent is not None:
                latent_model_input = torch.cat([
                    latent_model_input,
                    batch.image_latent.permute(0, 2, 1, 3, 4)
                ],
                                               dim=2).to(target_dtype)
            assert not torch.isnan(
                latent_model_input).any(), "latent_model_input contains nan"

            # Prepare inputs for transformer
            t_expand = t.repeat(latent_model_input.shape[0])
            guidance_expand = (
                torch.tensor(
                    [fastvideo_args.pipeline_config.embedded_cfg_scale] *
                    latent_model_input.shape[0],
                    dtype=torch.float32,
                    device=get_local_torch_device(),
                ).to(target_dtype) *
                1000.0 if fastvideo_args.pipeline_config.embedded_cfg_scale
                is not None else None)

            # Predict noise residual
            with torch.autocast(device_type="cuda",
                                dtype=target_dtype,
                                enabled=autocast_enabled):
                if (vsa_available and self.attn_backend
                        == VideoSparseAttentionBackend):
                    self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls(
                    )

                    if self.attn_metadata_builder_cls is not None:
                        self.attn_metadata_builder = self.attn_metadata_builder_cls(
                        )
                        # TODO(will): clean this up
                        attn_metadata = self.attn_metadata_builder.build(  # type: ignore
                            current_timestep=i,  # type: ignore
                            raw_latent_shape=batch.
                            raw_latent_shape[2:5],  # type: ignore
                            patch_size=fastvideo_args.
                            pipeline_config.  # type: ignore
                            dit_config.patch_size,  # type: ignore
                            STA_param=batch.STA_param,  # type: ignore
                            VSA_sparsity=fastvideo_args.
                            VSA_sparsity,  # type: ignore
                            device=get_local_torch_device(),  # type: ignore
                        )  # type: ignore
                        assert attn_metadata is not None, "attn_metadata cannot be None"
                    else:
                        attn_metadata = None
                else:
                    attn_metadata = None

                batch.is_cfg_negative = False
                with set_forward_context(
                        current_timestep=i,
                        attn_metadata=attn_metadata,
                        forward_batch=batch,
                        # fastvideo_args=fastvideo_args
                ):
                    # Run transformer. Dims 1 and 2 are swapped on the way
                    # in and swapped back on the way out.
                    pred_noise = self.transformer(
                        latent_model_input.permute(0, 2, 1, 3, 4),
                        prompt_embeds,
                        t_expand,
                        guidance=guidance_expand,
                        **image_kwargs,
                        **pos_cond_kwargs,
                    ).permute(0, 2, 1, 3, 4)

                # The helper receives tensors with the first two dims
                # flattened; the original leading shape is restored after.
                pred_video = pred_noise_to_pred_video(
                    pred_noise=pred_noise.flatten(0, 1),
                    noise_input_latent=noise_latents.flatten(0, 1),
                    timestep=t_expand,
                    scheduler=self.scheduler).unflatten(
                        0, pred_noise.shape[:2])

                if i < len(timesteps) - 1:
                    # Re-noise the prediction to the next DMD timestep.
                    next_timestep = timesteps[i + 1] * torch.ones(
                        [1], dtype=torch.long, device=pred_video.device)
                    noise = torch.randn(video_raw_latent_shape,
                                        dtype=pred_video.dtype,
                                        generator=batch.generator[0]).to(
                                            self.device)
                    latents = self.scheduler.add_noise(
                        pred_video.flatten(0, 1), noise.flatten(0, 1),
                        next_timestep).unflatten(0, pred_video.shape[:2])
                else:
                    latents = pred_video

                # Update progress bar. The None guard has to protect the
                # last-step case as well, so it is evaluated first.
                is_last_step = i == len(timesteps) - 1
                past_warmup = ((i + 1) > num_warmup_steps
                               and (i + 1) % self.scheduler.order == 0)
                if progress_bar is not None and (is_last_step
                                                 or past_warmup):
                    progress_bar.update()

    # Swap dims 1 and 2 so the stored latents use the same layout that was
    # fed to the transformer inside the loop.
    latents = latents.permute(0, 2, 1, 3, 4)
    # Update batch with final latents
    batch.latents = latents

    return batch

fastvideo.pipelines.stages.EncodingStage ¶

EncodingStage(vae: ParallelTiledVAE)

Bases: PipelineStage

Stage for encoding pixel space representations into latent space.

This stage handles the encoding of pixel-space video/images into latent representations for further processing in the diffusion pipeline.

Source code in fastvideo/pipelines/stages/encoding.py

def __init__(self, vae: ParallelTiledVAE) -> None:
    """
    Store the VAE used by this stage.

    Args:
        vae: The VAE instance kept on ``self.vae``.
    """
    self.vae: ParallelTiledVAE = vae

Functions¶

fastvideo.pipelines.stages.EncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode pixel space representations into latent space.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded latents.

Source code in fastvideo/pipelines/stages/encoding.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode pixel space representations into latent space.

    ``batch.latents`` is rescaled with ``x * 2 - 1``, clamped to
    ``[-1, 1]``, run through ``self.vae.encode`` and replaced by the
    ``mean`` of the encoder output.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with encoded latents.
    """
    assert batch.latents is not None and isinstance(batch.latents,
                                                    torch.Tensor)

    device = get_local_torch_device()
    pipeline_cfg = fastvideo_args.pipeline_config
    self.vae = self.vae.to(device)

    # Autocast is used only for non-fp32 VAE precision, and only when the
    # caller has not disabled it.
    vae_dtype = PRECISION_TO_TYPE[pipeline_cfg.vae_precision]
    use_autocast = not (vae_dtype == torch.float32
                        or fastvideo_args.disable_autocast)

    # Rescale to [-1, 1] (reverse of decoding normalization) and move the
    # result to the local device.
    pixels = (batch.latents * 2.0 - 1.0).clamp(-1, 1).to(device)

    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=use_autocast):
        if pipeline_cfg.vae_tiling:
            self.vae.enable_tiling()
        if not use_autocast:
            # Without autocast the input is cast to the VAE dtype by hand.
            pixels = pixels.to(vae_dtype)
        encoded = self.vae.encode(pixels).mean

    batch.latents = encoded

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()
    if fastvideo_args.vae_cpu_offload:
        self.vae.to("cpu")

    return batch

fastvideo.pipelines.stages.EncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage inputs.

Source code in fastvideo/pipelines/stages/encoding.py

@torch.no_grad()
def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify encoding stage inputs.

    ``batch.latents`` carries the input video/images for VAE encoding,
    laid out as [batch_size, channels, frames, height, width]; it is checked
    with ``V.is_tensor`` and ``V.with_dims(5)``.
    """
    input_validators = [V.is_tensor, V.with_dims(5)]
    outcome = VerificationResult()
    outcome.add_check("latents", batch.latents, input_validators)
    return outcome

fastvideo.pipelines.stages.EncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage outputs.

Source code in fastvideo/pipelines/stages/encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify encoding stage outputs.

    ``batch.latents`` holds the encoded latents, laid out as
    [batch_size, channels, frames, height_latents, width_latents]; it is
    checked with ``V.is_tensor`` and ``V.with_dims(5)``.
    """
    output_validators = [V.is_tensor, V.with_dims(5)]
    outcome = VerificationResult()
    outcome.add_check("latents", batch.latents, output_validators)
    return outcome

fastvideo.pipelines.stages.Hy15ImageEncodingStage ¶

Hy15ImageEncodingStage(image_encoder, image_processor)

Bases: ImageEncodingStage

Stage for encoding image prompts into embeddings for HunyuanVideo1.5 models.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, image_encoder, image_processor) -> None:
    """
    Initialize the image encoding stage.

    Args:
        image_encoder: The image encoder, stored on ``self.image_encoder``.
        image_processor: The image processor, stored on
            ``self.image_processor``.
    """
    super().__init__()
    self.image_processor = image_processor
    self.image_encoder = image_encoder

Functions¶

fastvideo.pipelines.stages.Hy15ImageEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the prompt into image encoder hidden states.

Source code in fastvideo/pipelines/stages/image_encoding.py

def forward(self, batch: ForwardBatch,
            fastvideo_args: FastVideoArgs) -> ForwardBatch:
    """
    Fill in placeholder image conditioning on the batch.

    When ``batch.pil_image`` is None, ``batch.image_embeds`` is set to a
    single all-zero tensor of shape (1, 729, 1152). In every case
    ``batch.video_latent`` is set to an all-zero tensor shaped like
    ``batch.raw_latent_shape`` with dimension 1 forced to 1.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not read here).

    Returns:
        The same batch, updated in place.
    """
    device = get_local_torch_device()

    if batch.pil_image is None:
        placeholder_embeds = torch.zeros(1, 729, 1152, device=device)
        batch.image_embeds = [placeholder_embeds]

    # Copy the raw latent shape and collapse dimension 1 to size 1.
    zero_shape = list(batch.raw_latent_shape)
    zero_shape[1] = 1
    batch.video_latent = torch.zeros(tuple(zero_shape), device=device)
    return batch

fastvideo.pipelines.stages.Hy15ImageEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify image encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify image encoding stage inputs.

    No checks are registered here; a fresh ``VerificationResult`` is
    returned as-is.
    """
    return VerificationResult()

fastvideo.pipelines.stages.ImageEncodingStage ¶

ImageEncodingStage(image_encoder, image_processor)

Bases: PipelineStage

Stage for encoding image prompts into embeddings for diffusion models.

This stage handles the encoding of image prompts into the embedding space expected by the diffusion model.

Initialize the image encoding stage.

Parameters:

Name	Type	Description	Default
`image_encoder`		The image encoder whose `last_hidden_state` output is used as the image embedding.	required
`image_processor`		The processor that converts the input image into tensors for the image encoder.	required

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, image_encoder, image_processor) -> None:
    """
    Initialize the image encoding stage.

    Args:
        image_encoder: The image encoder, stored on ``self.image_encoder``.
        image_processor: The image processor, stored on
            ``self.image_processor``.
    """
    super().__init__()
    self.image_processor = image_processor
    self.image_encoder = image_encoder

Functions¶

fastvideo.pipelines.stages.ImageEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the prompt into image encoder hidden states.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded prompt embeddings.

Source code in fastvideo/pipelines/stages/image_encoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode ``batch.pil_image`` into image encoder hidden states.

    The image is run through ``self.image_processor`` and then
    ``self.image_encoder``; the encoder's ``last_hidden_state`` is appended
    to ``batch.image_embeds``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with the new image embedding appended.
    """
    device = get_local_torch_device()
    self.image_encoder = self.image_encoder.to(device)

    processed = self.image_processor(images=batch.pil_image,
                                     return_tensors="pt")
    encoder_inputs = processed.to(device)

    with set_forward_context(current_timestep=0, attn_metadata=None):
        hidden_states = self.image_encoder(
            **encoder_inputs).last_hidden_state

    batch.image_embeds.append(hidden_states)

    # Move the encoder back to the CPU when offloading is requested.
    if fastvideo_args.image_encoder_cpu_offload:
        self.image_encoder.to('cpu')

    return batch

fastvideo.pipelines.stages.ImageEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify image encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify image encoding stage inputs.

    Checks ``batch.pil_image`` with ``V.not_none`` and
    ``batch.image_embeds`` with ``V.is_list``.
    """
    outcome = VerificationResult()
    planned_checks = (
        ("pil_image", batch.pil_image, V.not_none),
        ("image_embeds", batch.image_embeds, V.is_list),
    )
    for field_name, value, validator in planned_checks:
        outcome.add_check(field_name, value, validator)
    return outcome

fastvideo.pipelines.stages.ImageEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify image encoding stage outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify image encoding stage outputs.

    Checks ``batch.image_embeds`` with ``V.list_of_tensors_dims(3)``.
    """
    embeds_validator = V.list_of_tensors_dims(3)
    outcome = VerificationResult()
    outcome.add_check("image_embeds", batch.image_embeds, embeds_validator)
    return outcome

fastvideo.pipelines.stages.ImageVAEEncodingStage ¶

ImageVAEEncodingStage(vae: ParallelTiledVAE)

Bases: PipelineStage

Stage for encoding image pixel representations into latent space.

This stage handles the encoding of image pixel representations into the final input format (e.g., latents) for image-to-video generation.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, vae: ParallelTiledVAE) -> None:
    """
    Store the VAE used by this stage.

    Args:
        vae: The VAE instance kept on ``self.vae``.
    """
    self.vae: ParallelTiledVAE = vae

Functions¶

fastvideo.pipelines.stages.ImageVAEEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode pixel representations into latent space.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode pixel representations into latent space.

    The conditioning image is preprocessed, padded with zero frames up to
    ``num_frames`` and encoded with ``self.vae``. In PREPROCESS mode the
    encoder output's ``mean`` is stored on ``batch.image_latent``. In
    INFERENCE mode the latent is obtained through ``self.retrieve_latents``
    and concatenated (along dim 1) behind a first-frame mask.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with encoded outputs.

    Raises:
        ValueError: If ``fastvideo_args.mode`` is neither INFERENCE nor
            PREPROCESS, or if no generator is available in INFERENCE mode.
    """
    assert batch.pil_image is not None
    if fastvideo_args.mode == ExecutionMode.INFERENCE:
        assert batch.pil_image is not None and isinstance(
            batch.pil_image, PIL.Image.Image)
        assert batch.height is not None and isinstance(batch.height, int)
        assert batch.width is not None and isinstance(batch.width, int)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, int)
        height = batch.height
        width = batch.width
        num_frames = batch.num_frames
    elif fastvideo_args.mode == ExecutionMode.PREPROCESS:
        assert batch.pil_image is not None and isinstance(
            batch.pil_image, torch.Tensor)
        assert batch.height is not None and isinstance(batch.height, list)
        assert batch.width is not None and isinstance(batch.width, list)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, list)
        # In PREPROCESS mode the sizes arrive as lists; only the first
        # entry is used.
        num_frames = batch.num_frames[0]
        height = batch.height[0]
        width = batch.width[0]
    else:
        # Without this branch height/width/num_frames would be unbound and
        # the failure would surface later as an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported execution mode for image VAE encoding: "
            f"{fastvideo_args.mode}")

    self.vae = self.vae.to(get_local_torch_device())

    # Process single image for I2V
    latent_height = height // self.vae.spatial_compression_ratio
    latent_width = width // self.vae.spatial_compression_ratio
    image = batch.pil_image
    image = self.preprocess(
        image,
        vae_scale_factor=self.vae.spatial_compression_ratio,
        height=height,
        width=width).to(get_local_torch_device(), dtype=torch.float32)

    # (B, C, H, W) -> (B, C, 1, H, W)
    image = image.unsqueeze(2)

    # Pad the single frame with zero frames up to num_frames along dim 2.
    video_condition = torch.cat([
        image,
        image.new_zeros(image.shape[0], image.shape[1], num_frames - 1,
                        image.shape[3], image.shape[4])
    ],
                                dim=2)
    video_condition = video_condition.to(device=get_local_torch_device(),
                                         dtype=torch.float32)

    # Setup VAE precision
    vae_dtype = PRECISION_TO_TYPE[
        fastvideo_args.pipeline_config.vae_precision]
    vae_autocast_enabled = (
        vae_dtype != torch.float32) and not fastvideo_args.disable_autocast

    # Encode Image
    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=vae_autocast_enabled):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        # if fastvideo_args.vae_sp:
        #     self.vae.enable_parallel()
        if not vae_autocast_enabled:
            # Without autocast the input is cast to the VAE dtype by hand.
            video_condition = video_condition.to(vae_dtype)
        encoder_output = self.vae.encode(video_condition)

    if fastvideo_args.mode == ExecutionMode.PREPROCESS:
        latent_condition = encoder_output.mean
    else:
        generator = batch.generator
        if generator is None:
            raise ValueError("Generator must be provided")
        latent_condition = self.retrieve_latents(encoder_output, generator)

    # Apply shifting if needed
    if (hasattr(self.vae, "shift_factor")
            and self.vae.shift_factor is not None):
        if isinstance(self.vae.shift_factor, torch.Tensor):
            latent_condition -= self.vae.shift_factor.to(
                latent_condition.device, latent_condition.dtype)
        else:
            latent_condition -= self.vae.shift_factor

    if isinstance(self.vae.scaling_factor, torch.Tensor):
        latent_condition = latent_condition * self.vae.scaling_factor.to(
            latent_condition.device, latent_condition.dtype)
    else:
        latent_condition = latent_condition * self.vae.scaling_factor

    if fastvideo_args.mode == ExecutionMode.PREPROCESS:
        batch.image_latent = latent_condition
    else:
        # Build a mask that is 1 for the first frame and 0 for all others.
        mask_lat_size = torch.ones(1, 1, num_frames, latent_height,
                                   latent_width)
        mask_lat_size[:, :, list(range(1, num_frames))] = 0
        # Repeat the first-frame mask temporal_compression_ratio times so
        # the frame axis can be regrouped by that ratio below.
        first_frame_mask = mask_lat_size[:, :, 0:1]
        first_frame_mask = torch.repeat_interleave(
            first_frame_mask,
            dim=2,
            repeats=self.vae.temporal_compression_ratio)
        mask_lat_size = torch.concat(
            [first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
        mask_lat_size = mask_lat_size.view(
            1, -1, self.vae.temporal_compression_ratio, latent_height,
            latent_width)
        mask_lat_size = mask_lat_size.transpose(1, 2)
        mask_lat_size = mask_lat_size.to(latent_condition.device)

        # The mask goes in front of the latent along dim 1.
        batch.image_latent = torch.concat([mask_lat_size, latent_condition],
                                          dim=1)

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    # The VAE is always moved back to the CPU here.
    self.vae.to("cpu")

    return batch

fastvideo.pipelines.stages.ImageVAEEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify encoding stage inputs.

    ``batch.generator`` is always checked. ``height``, ``width`` and
    ``num_frames`` are checked with ``V.list_not_empty`` in PREPROCESS
    mode and with ``V.positive_int`` otherwise.
    """
    outcome = VerificationResult()
    outcome.add_check("generator", batch.generator,
                      V.generator_or_list_generators)

    # The size fields are lists in PREPROCESS mode and plain ints otherwise,
    # so the validator depends on the mode.
    preprocessing = fastvideo_args.mode == ExecutionMode.PREPROCESS
    size_validator = V.list_not_empty if preprocessing else V.positive_int
    for field_name in ("height", "width", "num_frames"):
        outcome.add_check(field_name, getattr(batch, field_name),
                          size_validator)
    return outcome

fastvideo.pipelines.stages.ImageVAEEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify encoding stage outputs.

    Checks ``batch.image_latent`` with ``V.is_tensor`` and
    ``V.with_dims(5)``.
    """
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome = VerificationResult()
    outcome.add_check("image_latent", batch.image_latent, latent_validators)
    return outcome

fastvideo.pipelines.stages.InputValidationStage ¶

Bases: PipelineStage

Stage for validating and preparing inputs for diffusion pipelines.

This stage validates that all required inputs are present and properly formatted before proceeding with the diffusion process.

Functions¶

fastvideo.pipelines.stages.InputValidationStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Validate and prepare inputs.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The validated batch information.

Source code in fastvideo/pipelines/stages/input_validation.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Validate and prepare inputs.

    Besides validating prompts, sizes, step count and guidance scale, this
    method also loads the conditioning image (``batch.image_path``) and the
    control video (``batch.video_path``), and validates the optional
    action-control tensors.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The validated batch information.

    Raises:
        ValueError: If a required input is missing or malformed.
    """

    self._generate_seeds(batch, fastvideo_args)

    # Ensure prompt is properly formatted
    if batch.prompt is None and batch.prompt_embeds is None:
        raise ValueError(
            "Either `prompt` or `prompt_embeds` must be provided")

    # Ensure negative prompt is properly formatted if using classifier-free guidance
    if (batch.do_classifier_free_guidance and batch.negative_prompt is None
            and batch.negative_prompt_embeds is None):
        raise ValueError(
            "For classifier-free guidance, either `negative_prompt` or "
            "`negative_prompt_embeds` must be provided")

    # Validate height and width
    if batch.height is None or batch.width is None:
        raise ValueError(
            "Height and width must be provided. Please set `height` and `width`."
        )
    if batch.height % 8 != 0 or batch.width % 8 != 0:
        raise ValueError(
            f"Height and width must be divisible by 8 but are {batch.height} and {batch.width}."
        )

    # Validate number of inference steps
    if batch.num_inference_steps <= 0:
        raise ValueError(
            f"Number of inference steps must be positive, but got {batch.num_inference_steps}"
        )

    # Validate guidance scale if using classifier-free guidance
    if batch.do_classifier_free_guidance and batch.guidance_scale <= 0:
        raise ValueError(
            f"Guidance scale must be positive, but got {batch.guidance_scale}"
        )

    # for i2v, get image from image_path
    # @TODO(Wei) hard-coded for wan2.2 5b ti2v for now. Should put this in image_encoding stage
    if batch.image_path is not None:
        if batch.image_path.endswith(".mp4"):
            # For a video file, the first loaded frame is the image.
            image = load_video(batch.image_path)[0]
        else:
            image = load_image(batch.image_path)
        batch.pil_image = image

    # further processing for ti2v task
    if (fastvideo_args.pipeline_config.ti2v_task
            or fastvideo_args.pipeline_config.is_causal
        ) and batch.pil_image is not None:
        img = batch.pil_image
        ih, iw = img.height, img.width

        pipeline_class_name = type(fastvideo_args.pipeline_config).__name__
        if 'MatrixGame' in pipeline_class_name or 'MatrixCausal' in pipeline_class_name:
            # Matrix-Game configs resize straight to the requested size.
            oh, ow = batch.height, batch.width
            img = img.resize((ow, oh), Image.LANCZOS)
        else:
            # Standard Wan logic
            patch_size = fastvideo_args.pipeline_config.dit_config.arch_config.patch_size
            vae_stride = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_spatial
            dh, dw = patch_size[1] * vae_stride, patch_size[2] * vae_stride
            max_area = 480 * 832
            ow, oh = best_output_size(iw, ih, dw, dh, max_area)

            # Scale so the image covers the target in both dimensions.
            scale = max(ow / iw, oh / ih)
            img = img.resize((round(iw * scale), round(ih * scale)),
                             Image.LANCZOS)

            # center-crop
            x1 = (img.width - ow) // 2
            y1 = (img.height - oh) // 2
            img = img.crop((x1, y1, x1 + ow, y1 + oh))

        assert img.width == ow and img.height == oh
        logger.info("final processed img height: %s, img width: %s",
                    img.height, img.width)

        # to tensor: rescale with (x - 0.5) / 0.5, then add a dimension at
        # index 1 and a leading batch dimension.
        img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(
            self.device).unsqueeze(1)
        img = img.unsqueeze(0)
        batch.height = oh
        batch.width = ow
        batch.pil_image = img

    # for v2v, get control video from video path
    if batch.video_path is not None:
        pil_images, original_fps = load_video(batch.video_path,
                                              return_fps=True)
        logger.info("Loaded video with %s frames, original FPS: %s",
                    len(pil_images), original_fps)

        # Get target parameters from batch
        target_fps = batch.fps
        target_num_frames = batch.num_frames
        target_height = batch.height
        target_width = batch.width

        if target_fps is not None and original_fps is not None:
            # Subsample by an integer stride; never go below 1.
            frame_skip = max(1, int(original_fps // target_fps))
            if frame_skip > 1:
                pil_images = pil_images[::frame_skip]
                effective_fps = original_fps / frame_skip
                logger.info(
                    "Resampled video from %.1f fps to %.1f fps (skip=%s)",
                    original_fps, effective_fps, frame_skip)

        # Limit to target number of frames
        if target_num_frames is not None and len(
                pil_images) > target_num_frames:
            # Record the count before truncating so the log reports the
            # real total rather than the truncated length.
            total_frames = len(pil_images)
            pil_images = pil_images[:target_num_frames]
            logger.info("Limited video to %s frames (from %s total)",
                        target_num_frames, total_frames)

        # Resize each PIL image to target dimensions
        resized_images = [
            resize(pil_img,
                   target_height,
                   target_width,
                   resize_mode="default",
                   resample="lanczos") for pil_img in pil_images
        ]

        # Convert PIL images to numpy array
        video_numpy = pil_to_numpy(resized_images)
        video_numpy = normalize(video_numpy)
        video_tensor = numpy_to_pt(video_numpy)

        # Rearrange to [C, T, H, W] and add batch dimension -> [B, C, T, H, W]
        input_video = video_tensor.permute(1, 0, 2, 3).unsqueeze(0)

        batch.video_latent = input_video

    # Validate action control inputs (Matrix-Game)
    if batch.mouse_cond is not None:
        if batch.mouse_cond.dim() != 3 or batch.mouse_cond.shape[-1] != 2:
            raise ValueError(
                f"mouse_cond must have shape (B, T, 2), but got {batch.mouse_cond.shape}"
            )
        logger.info("Action control: mouse_cond validated - shape %s",
                    batch.mouse_cond.shape)

    if batch.keyboard_cond is not None:
        if batch.keyboard_cond.dim() != 3:
            raise ValueError(
                f"keyboard_cond must have 3 dimensions (B, T, K), but got {batch.keyboard_cond.dim()}"
            )
        keyboard_dim = batch.keyboard_cond.shape[-1]
        if keyboard_dim not in {2, 4, 6, 7}:
            raise ValueError(
                f"keyboard_cond last dimension must be 2, 4, 6, or 7, but got {keyboard_dim}"
            )
        logger.info(
            "Action control: keyboard_cond validated - shape %s (dim=%d)",
            batch.keyboard_cond.shape, keyboard_dim)

    if batch.grid_sizes is not None:
        if not isinstance(batch.grid_sizes, list | tuple | torch.Tensor):
            raise ValueError("grid_sizes must be a list, tuple, or tensor")
        if isinstance(batch.grid_sizes, torch.Tensor):
            if batch.grid_sizes.numel() != 3:
                raise ValueError(
                    "grid_sizes must have 3 elements [F, H, W]")
        else:
            if len(batch.grid_sizes) != 3:
                raise ValueError(
                    "grid_sizes must have 3 elements [F, H, W]")
        logger.info("Action control: grid_sizes validated - %s",
                    batch.grid_sizes)

    return batch

fastvideo.pipelines.stages.InputValidationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify input validation stage inputs.

Source code in fastvideo/pipelines/stages/input_validation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Register the checks applied to the request before input validation runs.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted by these checks).

    Returns:
        A VerificationResult on which every check below has been registered
        via add_check, in the order listed.
    """
    verification = VerificationResult()

    # Either a textual prompt or a non-empty list of embeddings is acceptable.
    prompt_source_ok = lambda _: (V.string_or_list_strings(batch.prompt) or V.
                                  list_not_empty(batch.prompt_embeds))
    # guidance_scale only has to be a positive float when CFG is enabled.
    guidance_ok = lambda x: (not batch.do_classifier_free_guidance or V.
                             positive_float(x))

    checks = (
        ("seed", batch.seed, [V.not_none, V.positive_int]),
        ("num_videos_per_prompt", batch.num_videos_per_prompt,
         V.positive_int),
        ("prompt_or_embeds", None, prompt_source_ok),
        ("height", batch.height, V.positive_int),
        ("width", batch.width, V.positive_int),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, guidance_ok),
    )
    for field_name, value, validators in checks:
        verification.add_check(field_name, value, validators)
    return verification

fastvideo.pipelines.stages.InputValidationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify input validation stage outputs.

Source code in fastvideo/pipelines/stages/input_validation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Register the checks applied to the batch after input validation has run.

    Args:
        batch: The batch returned by the stage.
        fastvideo_args: The inference arguments (not consulted by these checks).

    Returns:
        A VerificationResult with the "seeds" and "generator" checks
        registered.
    """
    outcome = VerificationResult()
    expectations = (
        ("seeds", batch.seeds, V.list_not_empty),
        ("generator", batch.generator, V.generator_or_list_generators),
    )
    for field_name, value, validator in expectations:
        outcome.add_check(field_name, value, validator)
    return outcome

fastvideo.pipelines.stages.LatentPreparationStage ¶

LatentPreparationStage(scheduler, transformer, use_btchw_layout: bool = False)

Bases: PipelineStage

Stage for preparing initial latent variables for the diffusion process.

This stage handles the preparation of the initial latent variables that will be denoised during the diffusion process.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self,
             scheduler,
             transformer,
             use_btchw_layout: bool = False) -> None:
    """
    Keep references to the components used when preparing latents.

    Args:
        scheduler: Scheduler object; forward() scales latents by its
            init_noise_sigma attribute when that attribute exists.
        transformer: Model whose num_channels_latents sizes the latent
            tensor; its hidden_size and parameter dtype are used when no
            prompt embeddings are available.
        use_btchw_layout: If True, forward() creates latents laid out as
            (batch, frames, channels, height, width) instead of
            (batch, channels, frames, height, width).
    """
    super().__init__()
    self.use_btchw_layout = use_btchw_layout
    self.transformer = transformer
    self.scheduler = scheduler

Functions¶

fastvideo.pipelines.stages.LatentPreparationStage.adjust_video_length ¶

adjust_video_length(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> int

Adjust video length based on VAE version.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`int`	The number of latent frames corresponding to the requested video length.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def adjust_video_length(self, batch: ForwardBatch,
                        fastvideo_args: FastVideoArgs) -> int:
    """
    Convert the requested frame count into a latent-space frame count.

    Args:
        batch: The current batch information; only num_frames is read.
        fastvideo_args: The inference arguments; the VAE config under
            pipeline_config selects which formula is used.

    Returns:
        The number of latent frames, as an int.
    """
    vae_config = fastvideo_args.pipeline_config.vae_config
    requested_frames = batch.num_frames

    if not vae_config.use_temporal_scaling_frames:
        # stepvideo only
        return int(requested_frames // 17 * 3)

    compression = vae_config.arch_config.temporal_compression_ratio
    return int((requested_frames - 1) // compression + 1)

fastvideo.pipelines.stages.LatentPreparationStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Prepare initial latent variables for the diffusion process.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with prepared latent variables.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Create (or adopt) the starting latents and record their shape on the batch.

    When the batch carries no prompt embeddings, the batch size is taken from
    keyboard_cond, then mouse_cond, then the first image embedding (falling
    back to 1); a zero-length placeholder embedding is installed,
    negative_prompt_embeds is reset to an empty list and classifier-free
    guidance is switched off.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The same batch, with latents and raw_latent_shape populated.

    Raises:
        ValueError: If height or width is None, or if a list of generators
            does not match the effective batch size.
    """
    # Adjust video length based on VAE version if needed
    latent_num_frames = (self.adjust_video_length(batch, fastvideo_args)
                         if hasattr(self, 'adjust_video_length') else None)

    # Determine batch size; fall back to action/image inputs when no text encoder is present
    has_prompt_embeds = bool(batch.prompt_embeds)
    if has_prompt_embeds:
        if isinstance(batch.prompt, list):
            batch_size = len(batch.prompt)
        elif batch.prompt is not None:
            batch_size = 1
        else:
            batch_size = batch.prompt_embeds[0].shape[0]
    elif batch.keyboard_cond is not None:
        batch_size = batch.keyboard_cond.shape[0]
    elif batch.mouse_cond is not None:
        batch_size = batch.mouse_cond.shape[0]
    elif batch.image_embeds:
        batch_size = batch.image_embeds[0].shape[0]
    else:
        batch_size = 1

    # Adjust batch size for number of videos per prompt
    batch_size *= batch.num_videos_per_prompt

    device = get_local_torch_device()
    if not has_prompt_embeds:
        # Create a dummy zero-length text embedding to satisfy downstream checks.
        # Matrix-Game models have text_dim=0 and ignore encoder_hidden_states.
        placeholder = torch.zeros(
            batch_size,
            0,
            self.transformer.hidden_size,
            device=device,
            dtype=next(self.transformer.parameters()).dtype)
        batch.prompt_embeds = [placeholder]
        batch.negative_prompt_embeds = []
        batch.do_classifier_free_guidance = False

    # Get required parameters
    dtype = batch.prompt_embeds[0].dtype
    generator = batch.generator
    latents = batch.latents
    num_frames = latent_num_frames if latent_num_frames is not None else batch.num_frames
    height, width = batch.height, batch.width

    # TODO(will): remove this once we add input/output validation for stages
    if height is None or width is None:
        raise ValueError("Height and width must be provided")

    # Calculate latent shape. raw_latent_shape is always channel-first
    # (B, C, T, H, W); the tensor itself may be created frame-first.
    arch_config = fastvideo_args.pipeline_config.vae_config.arch_config
    latent_height = height // arch_config.spatial_compression_ratio
    latent_width = width // arch_config.spatial_compression_ratio
    num_channels = self.transformer.num_channels_latents
    bcthw_shape: tuple[int, ...] | None = (batch_size, num_channels,
                                           num_frames, latent_height,
                                           latent_width)
    if self.use_btchw_layout:
        shape = (batch_size, num_frames, num_channels, latent_height,
                 latent_width)
    else:
        shape = bcthw_shape

    # Validate generator if it's a list
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    # Generate or use provided latents
    if latents is None:
        latents = randn_tensor(
            shape,
            generator=generator,
            device=device,
            dtype=dtype,
        )
        scale_by_sigma = True
    else:
        # Pre-initialized latents:
        # - For LongCat refine (refine_from or stage1_video present), we should not re-scale by init_noise_sigma.
        # - For other models, keep the original behavior.
        latents = latents.to(device)
        scale_by_sigma = (batch.refine_from is None
                          and batch.stage1_video is None)

    if scale_by_sigma and hasattr(self.scheduler, "init_noise_sigma"):
        latents = latents * self.scheduler.init_noise_sigma

    # Update batch with prepared latents
    batch.latents = latents
    batch.raw_latent_shape = bcthw_shape

    return batch

fastvideo.pipelines.stages.LatentPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage inputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Register the checks applied to the batch before latent preparation.

    Prompt embeddings are optional here: the "prompt_or_embeds" check also
    passes when prompt_embeds is empty, and the "prompt_embeds" check is only
    registered when embeddings are present.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted by these checks).

    Returns:
        A VerificationResult with the checks registered via add_check.
    """
    report = VerificationResult()

    prompt_source_ok = lambda _: (V.string_or_list_strings(batch.prompt) or
                                  not batch.prompt_embeds or V.list_not_empty(
                                      batch.prompt_embeds))
    report.add_check("prompt_or_embeds", None, prompt_source_ok)

    if batch.prompt_embeds:
        report.add_check("prompt_embeds", batch.prompt_embeds,
                         V.list_of_tensors)

    per_field_validators = (
        ("num_videos_per_prompt", V.positive_int),
        ("generator", V.generator_or_list_generators),
        ("num_frames", V.positive_int),
        ("height", V.positive_int),
        ("width", V.positive_int),
        ("latents", V.none_or_tensor),
    )
    for field_name, validator in per_field_validators:
        report.add_check(field_name, getattr(batch, field_name), validator)
    return report

fastvideo.pipelines.stages.LatentPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage outputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Register the checks applied to the batch after latent preparation.

    Args:
        batch: The batch returned by the stage.
        fastvideo_args: The inference arguments (not consulted by these checks).

    Returns:
        A VerificationResult requiring latents to be a 5-dimensional tensor
        and raw_latent_shape to be a tuple.
    """
    report = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    report.add_check("latents", batch.latents, latent_validators)
    report.add_check("raw_latent_shape", batch.raw_latent_shape, V.is_tuple)
    return report

fastvideo.pipelines.stages.PipelineStage ¶

Bases: ABC

Abstract base class for all pipeline stages.

A pipeline stage represents a discrete step in the diffusion process that can be composed with other stages to create a complete pipeline. Each stage is responsible for a specific part of the process, such as prompt encoding, latent preparation, etc.

Attributes¶

fastvideo.pipelines.stages.PipelineStage.device property ¶

device: device

Get the device for this stage.

Functions¶

fastvideo.pipelines.stages.PipelineStage.__call__ ¶

__call__(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Execute the stage's processing on the batch with optional verification and logging. Should not be overridden by subclasses.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The updated batch information after this stage's processing.

Source code in fastvideo/pipelines/stages/base.py

def __call__(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run this stage on the batch, with optional verification and timing logs.
    Should not be overridden by subclasses.

    Verification runs only when fastvideo_args has a truthy
    enable_stage_verification attribute. Timing logs are emitted only when
    envs.FASTVIDEO_STAGE_LOGGING is truthy. Any exception is logged and then
    re-raised unchanged.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The updated batch information after this stage's processing.
    """
    stage_name = self.__class__.__name__

    # Check if verification is enabled (simple approach for prototype)
    verification_on = getattr(fastvideo_args, 'enable_stage_verification',
                              False)

    # Pre-execution input verification
    if verification_on:
        try:
            self._run_verification(
                self.verify_input(batch, fastvideo_args), stage_name,
                "input")
        except Exception as e:
            logger.error("Input verification failed for %s: %s", stage_name,
                         str(e))
            raise

    # Execute the actual stage logic
    if not envs.FASTVIDEO_STAGE_LOGGING:
        # Direct execution (current behavior)
        result = self.forward(batch, fastvideo_args)
    else:
        logger.info("[%s] Starting execution", stage_name)
        started_at = time.perf_counter()

        try:
            result = self.forward(batch, fastvideo_args)
            elapsed = time.perf_counter() - started_at
            logger.info("[%s] Execution completed in %s ms", stage_name,
                        elapsed * 1000)
            batch.logging_info.add_stage_execution_time(stage_name, elapsed)
        except Exception as e:
            elapsed = time.perf_counter() - started_at
            logger.error("[%s] Error during execution after %s ms: %s",
                         stage_name, elapsed * 1000, e)
            logger.error("[%s] Traceback: %s", stage_name,
                         traceback.format_exc())
            raise

    # Post-execution output verification
    if verification_on:
        try:
            self._run_verification(
                self.verify_output(result, fastvideo_args), stage_name,
                "output")
        except Exception as e:
            logger.error("Output verification failed for %s: %s",
                         stage_name, str(e))
            raise

    return result

fastvideo.pipelines.stages.PipelineStage.forward abstractmethod ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Forward pass of the stage's processing.

This method should be implemented by subclasses to provide the forward processing logic for the stage.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The updated batch information after this stage's processing.

Source code in fastvideo/pipelines/stages/base.py

@abstractmethod
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Perform this stage's processing on the batch.

    Concrete stages must override this method; the base implementation
    always raises.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The updated batch information after this stage's processing.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()

fastvideo.pipelines.stages.PipelineStage.set_logging ¶

set_logging(enable: bool)

Enable or disable logging for this stage.

Parameters:

Name	Type	Description	Default
`enable`	`bool`	Whether to enable logging.	required

Source code in fastvideo/pipelines/stages/base.py

def set_logging(self, enable: bool):
    """
    Record the per-stage logging preference.

    The flag is stored on the instance as _enable_logging.

    Args:
        enable: True to turn logging on for this stage, False to turn it off.
    """
    self._enable_logging = enable

fastvideo.pipelines.stages.PipelineStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify the input for the stage.

Example

from fastvideo.pipelines.stages.validators import V, VerificationResult

def verify_input(self, batch, fastvideo_args):
    result = VerificationResult()
    result.add_check("height", batch.height, V.positive_int_divisible(8))
    result.add_check("width", batch.width, V.positive_int_divisible(8))
    result.add_check("image_latent", batch.image_latent, V.is_tensor)
    return result

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`VerificationResult`	A VerificationResult containing the verification status.

Source code in fastvideo/pipelines/stages/base.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Describe the checks to run on the batch before the stage executes.

    The base implementation registers no checks; stages override it to add
    their own.

    Example:
        from fastvideo.pipelines.stages.validators import V, VerificationResult

        def verify_input(self, batch, fastvideo_args):
            result = VerificationResult()
            result.add_check("height", batch.height, V.positive_int_divisible(8))
            result.add_check("width", batch.width, V.positive_int_divisible(8))
            result.add_check("image_latent", batch.image_latent, V.is_tensor)
            return result

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        A VerificationResult containing the verification status.
    """
    # Default implementation - no verification
    no_checks = VerificationResult()
    return no_checks

fastvideo.pipelines.stages.PipelineStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify the output for the stage.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`VerificationResult`	A VerificationResult containing the verification status.

Source code in fastvideo/pipelines/stages/base.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Describe the checks to run on the batch after the stage executes.

    The base implementation registers no checks; stages override it to add
    their own.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        A VerificationResult containing the verification status.
    """
    # Default implementation - no verification
    no_checks = VerificationResult()
    return no_checks

fastvideo.pipelines.stages.RefImageEncodingStage ¶

RefImageEncodingStage(image_encoder, image_processor)

Bases: ImageEncodingStage

Stage for encoding reference image prompts into embeddings for Wan2.1 Control models.

This stage extends ImageEncodingStage with specialized preprocessing for reference images.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, image_encoder, image_processor) -> None:
    """
    Initialize the image encoding stage.

    Args:
        image_encoder: Model that turns processed image inputs into hidden
            states.
        image_processor: Callable that converts images into the encoder's
            input tensors.
    """
    super().__init__()
    self.image_encoder = image_encoder
    self.image_processor = image_processor

Functions¶

fastvideo.pipelines.stages.RefImageEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the reference image into image encoder hidden states.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded image embeddings.

Source code in fastvideo/pipelines/stages/image_encoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode the reference image into image encoder hidden states.

    If the batch has no pil_image, a default image is encoded instead and,
    after appending, every entry of batch.image_embeds is replaced by zeros
    of the same shape.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with the new embedding appended to image_embeds.
    """
    device = get_local_torch_device()
    self.image_encoder = self.image_encoder.to(device)

    has_reference = batch.pil_image is not None
    reference = batch.pil_image if has_reference else create_default_image()

    # Preprocess reference image for CLIP encoder
    clip_ready = preprocess_reference_image_for_clip(reference, device)
    encoder_inputs = self.image_processor(images=clip_ready,
                                          return_tensors="pt").to(device)

    with set_forward_context(current_timestep=0, attn_metadata=None):
        hidden_states = self.image_encoder(**encoder_inputs).last_hidden_state
    batch.image_embeds.append(hidden_states)

    if not has_reference:
        # No reference image: zero out all stored embeddings, not only the
        # one appended above.
        batch.image_embeds = [
            torch.zeros_like(embed) for embed in batch.image_embeds
        ]

    return batch

fastvideo.pipelines.stages.StepvideoPromptEncodingStage ¶

StepvideoPromptEncodingStage(stepllm, clip)

Bases: PipelineStage

Stage for encoding prompts using the remote caption API.

This stage applies the magic string transformations and calls the remote caption service asynchronously to get:

- primary prompt embeddings,
- an attention mask,
- and a clip embedding.

Source code in fastvideo/pipelines/stages/stepvideo_encoding.py

def __init__(self, stepllm, clip) -> None:
    """
    Keep references to the two encoder objects used by this stage.

    Args:
        stepllm: Stored as self.stepllm.
        clip: Stored as self.clip.
    """
    super().__init__()
    # Disabled alternative: self.caption_client = caption_client
    # (This should have a call_caption(prompts: List[str]) method.)
    self.clip = clip
    self.stepllm = stepllm

Functions¶

fastvideo.pipelines.stages.StepvideoPromptEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify stepvideo encoding stage inputs.

Source code in fastvideo/pipelines/stages/stepvideo_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Register the single input check for the stepvideo encoding stage.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted by this check).

    Returns:
        A VerificationResult with the "prompt" check (V.string_not_empty)
        registered.
    """
    report = VerificationResult()
    prompt_value = batch.prompt
    report.add_check("prompt", prompt_value, V.string_not_empty)
    return report

fastvideo.pipelines.stages.StepvideoPromptEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify stepvideo encoding stage outputs.

Source code in fastvideo/pipelines/stages/stepvideo_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Register the output checks for the stepvideo encoding stage.

    Every listed field must be a tensor with the stated number of dimensions.

    Args:
        batch: The batch returned by the stage.
        fastvideo_args: The inference arguments (not consulted by these checks).

    Returns:
        A VerificationResult with one check per field registered.
    """
    # (field name, value, required number of tensor dimensions)
    expected_dims = (
        ("prompt_embeds", batch.prompt_embeds, 3),
        ("negative_prompt_embeds", batch.negative_prompt_embeds, 3),
        ("prompt_attention_mask", batch.prompt_attention_mask, 2),
        ("negative_attention_mask", batch.negative_attention_mask, 2),
        ("clip_embedding_pos", batch.clip_embedding_pos, 2),
        ("clip_embedding_neg", batch.clip_embedding_neg, 2),
    )
    report = VerificationResult()
    for field_name, value, ndim in expected_dims:
        report.add_check(field_name, value, [V.is_tensor, V.with_dims(ndim)])
    return report

fastvideo.pipelines.stages.TextEncodingStage ¶

TextEncodingStage(text_encoders, tokenizers)

Bases: PipelineStage

Stage for encoding text prompts into embeddings for diffusion models.

This stage handles the encoding of text prompts into the embedding space expected by the diffusion model.

Initialize the prompt encoding stage.

Parameters:

Name	Type	Description	Default
`text_encoders`		Text encoders, indexed in parallel with `tokenizers`.	required
`tokenizers`		Tokenizers, one per text encoder.	required

Source code in fastvideo/pipelines/stages/text_encoding.py

def __init__(self, text_encoders, tokenizers) -> None:
    """
    Initialize the prompt encoding stage.

    Args:
        text_encoders: Text encoders, indexed in parallel with tokenizers.
        tokenizers: Tokenizers, one per text encoder.
    """
    super().__init__()
    self.text_encoders = text_encoders
    self.tokenizers = tokenizers

Functions¶

fastvideo.pipelines.stages.TextEncodingStage.encode_text ¶

encode_text(text: str | list[str], fastvideo_args: FastVideoArgs, encoder_index: int | list[int] | None = None, return_attention_mask: bool = False, return_type: str = 'list', device: device | str | None = None, dtype: dtype | None = None, max_length: int | None = None, truncation: bool | None = None, padding: bool | str | None = None)

Encode plain text using selected text encoder(s) and return embeddings.

Parameters:

Name	Type	Description	Default
`text`	`str \| list[str]`	A single string or a list of strings to encode.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments providing pipeline config, including tokenizer and encoder settings, preprocess and postprocess functions.	required
`encoder_index`	`int \| list[int] \| None`	Encoder selector by index. Accepts an int or list of ints.	`None`
`return_attention_mask`	`bool`	If True, also return attention masks for each selected encoder.	`False`
`return_type`	`str`	"list" (default) returns a list aligned with selection; "dict" returns a dict keyed by encoder index as a string; "stack" stacks along a new first dimension (requires matching shapes).	`'list'`
`device`	`device \| str \| None`	Optional device override for inputs; defaults to local torch device.	`None`
`dtype`	`dtype \| None`	Optional dtype to cast returned embeddings to.	`None`
`max_length`	`int \| None`	Optional per-call tokenizer override.	`None`
`truncation`	`bool \| None`	Optional per-call tokenizer override.	`None`
`padding`	`bool \| str \| None`	Optional per-call tokenizer override.	`None`

Returns:

Type	Description
	Depending on return_type and return_attention_mask:
	list: List[Tensor] or (List[Tensor], List[Tensor])
	dict: Dict[str, Tensor] or (Dict[str, Tensor], Dict[str, Tensor])
	stack: Tensor of shape [num_encoders, ...] or a tuple with stacked attention masks

Source code in fastvideo/pipelines/stages/text_encoding.py

@torch.no_grad()
def encode_text(
    self,
    text: str | list[str],
    fastvideo_args: FastVideoArgs,
    encoder_index: int | list[int] | None = None,
    return_attention_mask: bool = False,
    return_type: str = "list",  # one of: "list", "dict", "stack"
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
    max_length: int | None = None,
    truncation: bool | None = None,
    padding: bool | str | None = None,
):
    """
    Encode plain text using selected text encoder(s) and return embeddings.

    Args:
        text: A single string or a list of strings to encode.
        fastvideo_args: The inference arguments providing pipeline config,
            including tokenizer and encoder settings, preprocess and postprocess
            functions.
        encoder_index: Encoder selector by index. Accepts an int or list of ints.
        return_attention_mask: If True, also return attention masks for each
            selected encoder.
        return_type: "list" (default) returns a list aligned with selection;
            "dict" returns a dict keyed by encoder index as a string; "stack" stacks along a
            new first dimension (requires matching shapes).
        device: Optional device override for inputs; defaults to local torch device.
        dtype: Optional dtype to cast returned embeddings to.
        max_length: Optional per-call tokenizer override.
        truncation: Optional per-call tokenizer override.
        padding: Optional per-call tokenizer override.

    Returns:
        Depending on return_type and return_attention_mask:
        - list: List[Tensor] or (List[Tensor], List[Tensor])
        - dict: Dict[str, Tensor] or (Dict[str, Tensor], Dict[str, Tensor])
        - stack: Tensor of shape [num_encoders, ...] or a tuple with stacked
          attention masks
    """

    assert len(self.tokenizers) == len(self.text_encoders)
    assert len(self.text_encoders) == len(
        fastvideo_args.pipeline_config.text_encoder_configs)

    # Resolve selection into indices
    encoder_cfgs = fastvideo_args.pipeline_config.text_encoder_configs
    if encoder_index is None:
        indices: list[int] = [0]
    elif isinstance(encoder_index, int):
        indices = [encoder_index]
    else:
        indices = list(encoder_index)
    # validate range
    num_encoders = len(self.text_encoders)
    for idx in indices:
        if idx < 0 or idx >= num_encoders:
            raise IndexError(
                f"encoder index {idx} out of range [0, {num_encoders-1}]")

    # Validate indices are within range
    num_encoders = len(self.text_encoders)

    # Normalize input to list[str]
    assert isinstance(text, str | list)
    if isinstance(text, str):
        texts: list[str] = [text]
    else:
        texts = text

    embeds_list: list[torch.Tensor] = []
    attn_masks_list: list[torch.Tensor] = []

    preprocess_funcs = fastvideo_args.pipeline_config.preprocess_text_funcs
    postprocess_funcs = fastvideo_args.pipeline_config.postprocess_text_funcs
    encoder_cfgs = fastvideo_args.pipeline_config.text_encoder_configs

    if return_type not in ("list", "dict", "stack"):
        raise ValueError(
            f"Invalid return_type '{return_type}'. Expected one of: 'list', 'dict', 'stack'"
        )

    target_device = device if device is not None else get_local_torch_device(
    )

    for i in indices:
        tokenizer = self.tokenizers[i]
        text_encoder = self.text_encoders[i]
        encoder_config = encoder_cfgs[i]
        preprocess_func = preprocess_funcs[i]
        postprocess_func = postprocess_funcs[i]

        tok_kwargs = dict(encoder_config.tokenizer_kwargs)
        if max_length is not None:
            tok_kwargs["max_length"] = max_length
        elif hasattr(fastvideo_args.pipeline_config,
                     "text_encoder_max_lengths"):
            tok_kwargs[
                "max_length"] = fastvideo_args.pipeline_config.text_encoder_max_lengths[
                    i]

        if truncation is not None:
            tok_kwargs["truncation"] = truncation
        if padding is not None:
            tok_kwargs["padding"] = padding

        processed_texts: list[str] = []
        for prompt_str in texts:
            processed_text = preprocess_func(prompt_str)
            if processed_text is not None:
                processed_texts.append(processed_text)
            else:
                # Assuming batch_size = 1
                prompt_embeds = torch.zeros((1, tok_kwargs["max_length"],
                                             encoder_config.hidden_size),
                                            device=target_device)
                attention_mask = torch.zeros((1, tok_kwargs["max_length"]),
                                             device=target_device,
                                             dtype=torch.int64)
                embeds_list.append(prompt_embeds)
                attn_masks_list.append(attention_mask)
                return self.return_embeds(embeds_list, attn_masks_list,
                                          return_type,
                                          return_attention_mask, indices)

        if encoder_config.is_chat_model:
            text_inputs = tokenizer.apply_chat_template(
                processed_texts, **tok_kwargs).to(target_device)
        else:
            text_inputs = tokenizer(processed_texts,
                                    **tok_kwargs).to(target_device)

        input_ids = text_inputs["input_ids"]
        attention_mask = text_inputs["attention_mask"]

        with set_forward_context(current_timestep=0, attn_metadata=None):
            outputs = text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
            )

        try:
            prompt_embeds = postprocess_func(outputs)
        except Exception:
            prompt_embeds, attention_mask = postprocess_func(
                outputs, attention_mask)

        if dtype is not None:
            prompt_embeds = prompt_embeds.to(dtype=dtype)
        embeds_list.append(prompt_embeds)
        if return_attention_mask:
            attn_masks_list.append(attention_mask)

    return self.return_embeds(embeds_list, attn_masks_list, return_type,
                              return_attention_mask, indices)

fastvideo.pipelines.stages.TextEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the prompt into text encoder hidden states.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded prompt embeddings.

Source code in fastvideo/pipelines/stages/text_encoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run every configured text encoder over the prompt(s) and store the results.

    The positive prompt is always encoded. The negative prompt is encoded only
    when ``batch.do_classifier_free_guidance`` is truthy. Results are appended
    to the lists already present on ``batch``; attention masks are appended
    only when the corresponding mask list on ``batch`` is not ``None``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The same ``batch`` object, with encoded prompt embeddings appended.
    """
    # One tokenizer and one encoder config are expected per text encoder.
    assert len(self.tokenizers) == len(self.text_encoders)
    assert len(self.text_encoders) == len(
        fastvideo_args.pipeline_config.text_encoder_configs)

    assert batch.prompt is not None
    positive_prompt: str | list[str] = batch.prompt
    encoder_indices: list[int] = list(range(len(self.text_encoders)))

    # Positive prompt: one embedding (and one mask) per encoder.
    positive_embeds, positive_masks = self.encode_text(
        positive_prompt,
        fastvideo_args,
        encoder_index=encoder_indices,
        return_attention_mask=True,
    )
    batch.prompt_embeds.extend(positive_embeds)
    if batch.prompt_attention_mask is not None:
        batch.prompt_attention_mask.extend(positive_masks)

    # Without classifier-free guidance there is nothing more to encode.
    if not batch.do_classifier_free_guidance:
        return batch

    # Negative prompt: same encoders, same call shape.
    assert isinstance(batch.negative_prompt, str)
    negative_embeds, negative_masks = self.encode_text(
        batch.negative_prompt,
        fastvideo_args,
        encoder_index=encoder_indices,
        return_attention_mask=True,
    )

    assert batch.negative_prompt_embeds is not None
    batch.negative_prompt_embeds.extend(negative_embeds)
    if batch.negative_attention_mask is not None:
        batch.negative_attention_mask.extend(negative_masks)

    return batch

fastvideo.pipelines.stages.TextEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify text encoding stage inputs.

Source code in fastvideo/pipelines/stages/text_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the batch fields the text encoding stage reads before it runs."""
    report = VerificationResult()
    # NOTE: no check is registered for batch.negative_prompt here; the
    # check that used to cover it is disabled.
    input_checks = (
        ("prompt", batch.prompt, V.string_or_list_strings),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        ("prompt_embeds", batch.prompt_embeds, V.is_list),
        ("negative_prompt_embeds", batch.negative_prompt_embeds,
         V.none_or_list),
    )
    for field_name, field_value, validator in input_checks:
        report.add_check(field_name, field_value, validator)
    return report

fastvideo.pipelines.stages.TextEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify text encoding stage outputs.

Source code in fastvideo/pipelines/stages/text_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the embeddings produced by the text encoding stage."""
    report = VerificationResult()
    output_checks = (
        # Positive embeddings are always validated.
        ("prompt_embeds", batch.prompt_embeds,
         V.list_of_tensors_min_dims(2)),
        # Negative embeddings are only validated when CFG is enabled.
        ("negative_prompt_embeds", batch.negative_prompt_embeds,
         lambda x: not batch.do_classifier_free_guidance or V.
         list_of_tensors_with_min_dims(x, 2)),
    )
    for field_name, field_value, validator in output_checks:
        report.add_check(field_name, field_value, validator)
    return report

fastvideo.pipelines.stages.TimestepPreparationStage ¶

TimestepPreparationStage(scheduler)

Bases: PipelineStage

Stage for preparing timesteps for the diffusion process.

This stage handles the preparation of the timestep sequence that will be used during the diffusion process.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def __init__(self, scheduler) -> None:
    """Store the scheduler whose ``set_timesteps`` is called by ``forward``."""
    self.scheduler = scheduler

Functions¶

fastvideo.pipelines.stages.TimestepPreparationStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Prepare timesteps for the diffusion process.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with prepared timesteps.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Configure the scheduler's timestep schedule and store it on the batch.

    The schedule comes from exactly one of three sources, in this order of
    precedence: custom ``batch.timesteps``, custom ``batch.sigmas``, or
    ``batch.num_inference_steps``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with ``timesteps`` replaced by ``scheduler.timesteps``.

    Raises:
        ValueError: If both ``timesteps`` and ``sigmas`` are set, or if the
            scheduler's ``set_timesteps`` lacks the needed keyword.
    """
    scheduler = self.scheduler
    device = get_local_torch_device()
    step_count = batch.num_inference_steps
    custom_timesteps = batch.timesteps
    custom_sigmas = batch.sigmas
    token_count = batch.n_tokens

    def _set_timesteps_accepts(param_name: str) -> bool:
        # Evaluated lazily so the signature is only inspected when needed.
        return param_name in inspect.signature(
            scheduler.set_timesteps).parameters

    # Forward n_tokens only to schedulers that declare it.
    optional_kwargs = {}
    if token_count is not None and _set_timesteps_accepts("n_tokens"):
        optional_kwargs["n_tokens"] = token_count

    if custom_timesteps is not None and custom_sigmas is not None:
        raise ValueError(
            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
        )

    if custom_timesteps is not None:
        if not _set_timesteps_accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        # Tensors are moved to CPU first (for numpy conversion in scheduler).
        on_host = (custom_timesteps.cpu() if isinstance(
            custom_timesteps, torch.Tensor) else custom_timesteps)
        scheduler.set_timesteps(timesteps=on_host,
                                device=device,
                                **optional_kwargs)
    elif custom_sigmas is not None:
        if not _set_timesteps_accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=custom_sigmas,
                                device=device,
                                **optional_kwargs)
    else:
        scheduler.set_timesteps(step_count,
                                device=device,
                                **optional_kwargs)

    # Every branch ends by publishing the scheduler's schedule on the batch.
    batch.timesteps = scheduler.timesteps
    return batch

fastvideo.pipelines.stages.TimestepPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify timestep preparation stage inputs.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the batch fields the timestep preparation stage reads."""
    report = VerificationResult()
    input_checks = (
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("timesteps", batch.timesteps, V.none_or_tensor),
        ("sigmas", batch.sigmas, V.none_or_list),
        ("n_tokens", batch.n_tokens, V.none_or_positive_int),
    )
    for field_name, field_value, validator in input_checks:
        report.add_check(field_name, field_value, validator)
    return report

fastvideo.pipelines.stages.TimestepPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify timestep preparation stage outputs.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check that the stage left a 1-D tensor in ``batch.timesteps``."""
    report = VerificationResult()
    timestep_validators = [V.is_tensor, V.with_dims(1)]
    report.add_check("timesteps", batch.timesteps, timestep_validators)
    return report

fastvideo.pipelines.stages.VideoVAEEncodingStage ¶

VideoVAEEncodingStage(vae: ParallelTiledVAE)

Bases: ImageVAEEncodingStage

Stage for encoding video pixel representations into latent space.

This stage handles the encoding of video pixel representations for video-to-video generation and control. Inherits from ImageVAEEncodingStage to reuse common functionality.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, vae: ParallelTiledVAE) -> None:
    """Keep a reference to the VAE that ``forward`` uses for encoding."""
    self.vae: ParallelTiledVAE = vae

Functions¶

fastvideo.pipelines.stages.VideoVAEEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode video pixel representations into latent space.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode video pixel representations into latent space.

    Reads ``batch.video_latent`` as the input video, encodes it with the
    stage's VAE, applies the VAE's optional ``shift_factor`` and its
    ``scaling_factor``, and writes the result back to
    ``batch.video_latent``. The VAE is moved to the local device for the
    encode and back to CPU before returning.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with encoded outputs.

    Raises:
        ValueError: If ``fastvideo_args.mode`` is neither INFERENCE nor
            PREPROCESS, or if ``batch.generator`` is ``None``.
    """
    assert batch.video_latent is not None, "Video latent input is required for VideoVAEEncodingStage"

    # INFERENCE carries scalar dimensions; PREPROCESS carries per-sample
    # lists, of which only the first entry is used here.
    if fastvideo_args.mode == ExecutionMode.INFERENCE:
        assert batch.height is not None and isinstance(batch.height, int)
        assert batch.width is not None and isinstance(batch.width, int)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, int)
        height = batch.height
        width = batch.width
        num_frames = batch.num_frames
    elif fastvideo_args.mode == ExecutionMode.PREPROCESS:
        assert batch.height is not None and isinstance(batch.height, list)
        assert batch.width is not None and isinstance(batch.width, list)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, list)
        num_frames = batch.num_frames[0]
        height = batch.height[0]
        width = batch.width[0]
    else:
        # Previously fell through with height/width/num_frames unbound and
        # failed later with an opaque UnboundLocalError.
        raise ValueError(
            "VideoVAEEncodingStage supports only INFERENCE and PREPROCESS "
            f"execution modes, got {fastvideo_args.mode!r}")

    # Fail fast: the generator is required further down, so reject a
    # missing one before moving the VAE and running the encode.
    generator = batch.generator
    if generator is None:
        raise ValueError("Generator must be provided")

    self.vae = self.vae.to(get_local_torch_device())

    # Prepare video tensor from control video
    video_condition = self._prepare_control_video_tensor(
        batch.video_latent, num_frames, height,
        width).to(get_local_torch_device(), dtype=torch.float32)

    # Setup VAE precision: autocast is used only for non-fp32 precisions
    # and only when it has not been disabled via the arguments.
    vae_dtype = PRECISION_TO_TYPE[
        fastvideo_args.pipeline_config.vae_precision]
    vae_autocast_enabled = (
        vae_dtype != torch.float32) and not fastvideo_args.disable_autocast

    # Encode control video
    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=vae_autocast_enabled):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        if not vae_autocast_enabled:
            # Without autocast the input must be cast to the VAE dtype.
            video_condition = video_condition.to(vae_dtype)
        encoder_output = self.vae.encode(video_condition)

    latent_condition = self.retrieve_latents(encoder_output, generator)

    # Optional shift, applied in place before scaling.
    if (hasattr(self.vae, "shift_factor")
            and self.vae.shift_factor is not None):
        if isinstance(self.vae.shift_factor, torch.Tensor):
            latent_condition -= self.vae.shift_factor.to(
                latent_condition.device, latent_condition.dtype)
        else:
            latent_condition -= self.vae.shift_factor

    # Tensor-valued factors are moved to the latent's device/dtype first.
    if isinstance(self.vae.scaling_factor, torch.Tensor):
        latent_condition = latent_condition * self.vae.scaling_factor.to(
            latent_condition.device, latent_condition.dtype)
    else:
        latent_condition = latent_condition * self.vae.scaling_factor

    batch.video_latent = latent_condition

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    self.vae.to("cpu")

    return batch

fastvideo.pipelines.stages.VideoVAEEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify video encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the batch fields the video VAE encoding stage reads."""
    report = VerificationResult()
    report.add_check("video_latent", batch.video_latent, V.not_none)
    report.add_check("generator", batch.generator,
                     V.generator_or_list_generators)

    # PREPROCESS expects non-empty lists; any other mode expects positive ints.
    if fastvideo_args.mode == ExecutionMode.PREPROCESS:
        dimension_validator = V.list_not_empty
    else:
        dimension_validator = V.positive_int
    for field_name in ("height", "width", "num_frames"):
        report.add_check(field_name, getattr(batch, field_name),
                         dimension_validator)
    return report

fastvideo.pipelines.stages.VideoVAEEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify video encoding stage outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check that the stage left a 5-D tensor in ``batch.video_latent``."""
    report = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    report.add_check("video_latent", batch.video_latent, latent_validators)
    return report

Modules¶

fastvideo.pipelines.stages.base ¶

Base classes for pipeline stages.

This module defines the abstract base classes for pipeline stages that can be composed to create complete diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.base.PipelineStage ¶

Bases: ABC

Abstract base class for all pipeline stages.

A pipeline stage represents a discrete step in the diffusion process that can be composed with other stages to create a complete pipeline. Each stage is responsible for a specific part of the process, such as prompt encoding, latent preparation, etc.

Attributes¶

fastvideo.pipelines.stages.base.PipelineStage.device property ¶

device: device

Get the device for this stage.

Functions¶

fastvideo.pipelines.stages.base.PipelineStage.__call__ ¶

__call__(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Execute the stage's processing on the batch with optional verification and logging. Should not be overridden by subclasses.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The updated batch information after this stage's processing.

Source code in fastvideo/pipelines/stages/base.py

def __call__(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run this stage on ``batch``, optionally wrapped in verification and timing.

    Verification runs only when ``fastvideo_args.enable_stage_verification``
    is truthy (missing attribute counts as disabled). Timing and per-stage
    log lines are emitted only when ``envs.FASTVIDEO_STAGE_LOGGING`` is set.
    Should not be overridden by subclasses.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The updated batch information after this stage's processing.
    """
    name = self.__class__.__name__
    verify = getattr(fastvideo_args, 'enable_stage_verification', False)

    # Input checks happen before any stage work is done.
    if verify:
        try:
            checked_input = self.verify_input(batch, fastvideo_args)
            self._run_verification(checked_input, name, "input")
        except Exception as err:
            logger.error("Input verification failed for %s: %s", name,
                         str(err))
            raise

    if not envs.FASTVIDEO_STAGE_LOGGING:
        # Plain path: no timing, no extra log lines.
        output = self.forward(batch, fastvideo_args)
    else:
        logger.info("[%s] Starting execution", name)
        started = time.perf_counter()

        try:
            output = self.forward(batch, fastvideo_args)
            elapsed = time.perf_counter() - started
            logger.info("[%s] Execution completed in %s ms", name,
                        elapsed * 1000)
            # The timing is recorded on the incoming batch object.
            batch.logging_info.add_stage_execution_time(name, elapsed)
        except Exception as err:
            elapsed = time.perf_counter() - started
            logger.error("[%s] Error during execution after %s ms: %s",
                         name, elapsed * 1000, err)
            logger.error("[%s] Traceback: %s", name,
                         traceback.format_exc())
            raise

    # Output checks run against whatever forward() returned.
    if verify:
        try:
            checked_output = self.verify_output(output, fastvideo_args)
            self._run_verification(checked_output, name, "output")
        except Exception as err:
            logger.error("Output verification failed for %s: %s", name,
                         str(err))
            raise

    return output

fastvideo.pipelines.stages.base.PipelineStage.forward abstractmethod ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Forward pass of the stage's processing.

This method should be implemented by subclasses to provide the forward processing logic for the stage.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The updated batch information after this stage's processing.

Source code in fastvideo/pipelines/stages/base.py

@abstractmethod
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Perform this stage's processing on the batch.

    Concrete stages override this method with their actual logic; the base
    implementation only raises.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The updated batch information after this stage's processing.

    Raises:
        NotImplementedError: Always, when the base implementation is called.
    """
    raise NotImplementedError()

fastvideo.pipelines.stages.base.PipelineStage.set_logging ¶

set_logging(enable: bool)

Enable or disable logging for this stage.

Parameters:

Name	Type	Description	Default
`enable`	`bool`	Whether to enable logging.	required

Source code in fastvideo/pipelines/stages/base.py

def set_logging(self, enable: bool):
    """
    Enable or disable logging for this stage.

    Only records the flag on the instance as ``_enable_logging``.
    NOTE(review): the ``__call__`` shown for this class gates its logging on
    ``envs.FASTVIDEO_STAGE_LOGGING`` and does not read this flag; confirm
    where ``_enable_logging`` is consumed.

    Args:
        enable: Whether to enable logging.
    """
    self._enable_logging = enable

fastvideo.pipelines.stages.base.PipelineStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify the input for the stage.

Example

from fastvideo.pipelines.stages.validators import V, VerificationResult

    def verify_input(self, batch, fastvideo_args):
        result = VerificationResult()
        result.add_check("height", batch.height, V.positive_int_divisible(8))
        result.add_check("width", batch.width, V.positive_int_divisible(8))
        result.add_check("image_latent", batch.image_latent, V.is_tensor)
        return result

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`VerificationResult`	A VerificationResult containing the verification status.

Source code in fastvideo/pipelines/stages/base.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Validate the batch before the stage runs.

    The base implementation registers no checks and returns an empty
    result. Subclasses override it to add their own checks.

    Example:
        from fastvideo.pipelines.stages.validators import V, VerificationResult

        def verify_input(self, batch, fastvideo_args):
            result = VerificationResult()
            result.add_check("height", batch.height, V.positive_int_divisible(8))
            result.add_check("width", batch.width, V.positive_int_divisible(8))
            result.add_check("image_latent", batch.image_latent, V.is_tensor)
            return result

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        A VerificationResult containing the verification status.

    """
    # No checks by default.
    empty_report = VerificationResult()
    return empty_report

fastvideo.pipelines.stages.base.PipelineStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify the output for the stage.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`VerificationResult`	A VerificationResult containing the verification status.

Source code in fastvideo/pipelines/stages/base.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Validate the batch after the stage has run.

    The base implementation registers no checks and returns an empty
    result. Subclasses override it to add their own checks.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        A VerificationResult containing the verification status.
    """
    # No checks by default.
    empty_report = VerificationResult()
    return empty_report

fastvideo.pipelines.stages.base.StageVerificationError ¶

Bases: Exception

Exception raised when stage verification fails.

Functions¶

fastvideo.pipelines.stages.causal_denoising ¶

Classes¶

fastvideo.pipelines.stages.causal_denoising.CausalDMDDenosingStage ¶

CausalDMDDenosingStage(transformer, scheduler, transformer_2=None, vae=None)

Bases: DenoisingStage

Denoising stage for causal diffusion.

Source code in fastvideo/pipelines/stages/causal_denoising.py

def __init__(self,
             transformer,
             scheduler,
             transformer_2=None,
             vae=None) -> None:
    """
    Set up the causal denoising stage.

    Args:
        transformer: Primary transformer. Must expose ``blocks`` and
            ``config.arch_config`` (both are read below).
        scheduler: Forwarded to the parent constructor.
        transformer_2: Optional second transformer, also forwarded to the
            parent constructor.
        vae: Optional VAE stored on the stage; not used in this constructor.
    """
    super().__init__(transformer, scheduler, transformer_2)
    # Keep direct references on this stage, independent of whatever the
    # parent constructor stores. No cache state is created here.
    self.transformer = transformer
    self.transformer_2 = transformer_2
    self.vae = vae
    # Model-dependent constants (aligned with causal_inference.py assumptions)
    self.num_transformer_blocks = len(self.transformer.blocks)
    self.num_frames_per_block = self.transformer.config.arch_config.num_frames_per_block
    self.sliding_window_num_frames = self.transformer.config.arch_config.sliding_window_num_frames

    # Best effort: fall back to -1 both when ``model`` has no
    # ``local_attn_size`` and when accessing ``transformer.model`` fails.
    try:
        self.local_attn_size = getattr(self.transformer.model,
                                       "local_attn_size",
                                       -1)  # type: ignore
    except Exception:
        self.local_attn_size = -1

Functions¶

fastvideo.pipelines.stages.causal_denoising.CausalDMDDenosingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify denoising stage inputs.

Source code in fastvideo/pipelines/stages/causal_denoising.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the batch fields required by the causal denoising stage."""
    report = VerificationResult()
    input_checks = (
        ("latents", batch.latents, [V.is_tensor, V.with_dims(5)]),
        ("prompt_embeds", batch.prompt_embeds, V.list_not_empty),
        ("image_embeds", batch.image_embeds, V.is_list),
        ("image_latent", batch.image_latent, V.none_or_tensor_with_dims(5)),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, V.positive_float),
        ("eta", batch.eta, V.non_negative_float),
        ("generator", batch.generator, V.generator_or_list_generators),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        # Negative embeddings only need to be non-empty when CFG is enabled.
        ("negative_prompt_embeds", batch.negative_prompt_embeds, lambda x:
         not batch.do_classifier_free_guidance or V.list_not_empty(x)),
    )
    for field_name, field_value, validator in input_checks:
        report.add_check(field_name, field_value, validator)
    return report

Functions¶

fastvideo.pipelines.stages.conditioning ¶

Conditioning stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.conditioning.ConditioningStage ¶

Bases: PipelineStage

Stage for applying conditioning to the diffusion process.

This stage handles the application of conditioning, such as classifier-free guidance, to the diffusion process.

Functions¶

fastvideo.pipelines.stages.conditioning.ConditioningStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Apply conditioning to the diffusion process.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with applied conditioning.

Source code in fastvideo/pipelines/stages/conditioning.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Apply conditioning to the diffusion process.

    This stage is currently a pass-through: the batch is returned unchanged
    whether or not classifier-free guidance is enabled.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The same ``batch`` object, unmodified.
    """
    # TODO: implement classifier-free-guidance conditioning.
    # An earlier draft lived below an unconditional return and never ran.
    # Its intent was, when CFG is enabled, to require negative prompt
    # embeddings and concatenate the negative and positive prompt
    # embeddings (and attention masks, primary and secondary) negative-first.
    # It was removed as dead code; reintroduce it deliberately when needed.
    return batch

fastvideo.pipelines.stages.conditioning.ConditioningStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify conditioning stage inputs.

Source code in fastvideo/pipelines/stages/conditioning.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check the batch fields the conditioning stage depends on.

    Side effect: when ``batch.prompt_embeds`` is empty or ``None``, no checks
    are registered and ``batch.do_classifier_free_guidance`` is set to
    ``False``.
    """
    report = VerificationResult()
    if batch.prompt_embeds:
        report.add_check("do_classifier_free_guidance",
                         batch.do_classifier_free_guidance, V.bool_value)
        report.add_check("guidance_scale", batch.guidance_scale,
                         V.positive_float)
        # Matrix-Game allow empty prompt
        # embeddings when CFG isn't enabled.
        if batch.do_classifier_free_guidance or batch.prompt_embeds:
            report.add_check("prompt_embeds", batch.prompt_embeds,
                             V.list_not_empty)
            report.add_check(
                "negative_prompt_embeds", batch.negative_prompt_embeds,
                lambda x: not batch.do_classifier_free_guidance or V.
                list_not_empty(x))
    else:
        # No text encoder/prompt embeddings: skip checks and effectively
        # disable CFG.
        batch.do_classifier_free_guidance = False
    return report

fastvideo.pipelines.stages.conditioning.ConditioningStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify conditioning stage outputs.

Source code in fastvideo/pipelines/stages/conditioning.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check the conditioning stage's output.

    Side effect: when ``batch.prompt_embeds`` is ``None`` or empty, no checks
    are registered and ``batch.do_classifier_free_guidance`` is set to
    ``False``.
    """
    report = VerificationResult()
    embeds_missing = batch.prompt_embeds is None or not batch.prompt_embeds
    if embeds_missing:
        batch.do_classifier_free_guidance = False
    elif batch.do_classifier_free_guidance or batch.prompt_embeds:
        report.add_check("prompt_embeds", batch.prompt_embeds,
                         V.list_not_empty)
    return report

Functions¶

fastvideo.pipelines.stages.decoding ¶

Decoding stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.decoding.DecodingStage ¶

DecodingStage(vae, pipeline=None)

Bases: PipelineStage

Stage for decoding latent representations into pixel space.

This stage handles the decoding of latent representations into the final output format (e.g., pixel values).

Source code in fastvideo/pipelines/stages/decoding.py

def __init__(self, vae, pipeline=None) -> None:
    """
    Store the VAE and an optional weak reference to the owning pipeline.

    Args:
        vae: The VAE this stage uses for decoding.
        pipeline: Optional pipeline object. It is held via ``weakref.ref``
            so this stage does not keep the pipeline alive; ``self.pipeline``
            is ``None`` when no pipeline is given.
    """
    self.vae: ParallelTiledVAE = vae
    # Test against None rather than truthiness, so a pipeline object that
    # happens to be falsy (e.g. defines __len__/__bool__) is still referenced.
    self.pipeline = weakref.ref(pipeline) if pipeline is not None else None

Functions¶

fastvideo.pipelines.stages.decoding.DecodingStage.decode ¶

decode(latents: Tensor, fastvideo_args: FastVideoArgs) -> Tensor

Decode latent representations into pixel space using VAE.

Parameters:

Name	Type	Description	Default
`latents`	`Tensor`	Input latent tensor with shape (batch, channels, frames, height_latents, width_latents)	required
`fastvideo_args`	`FastVideoArgs`	Configuration containing: - disable_autocast: Whether to disable automatic mixed precision (default: False) - pipeline_config.vae_precision: VAE computation precision ("fp32", "fp16", "bf16") - pipeline_config.vae_tiling: Whether to enable VAE tiling for memory efficiency	required

Returns:

Type	Description
`Tensor`	Decoded video tensor with shape (batch, channels, frames, height, width),
`Tensor`	normalized to [0, 1] range and moved to CPU as float32

Source code in fastvideo/pipelines/stages/decoding.py

@torch.no_grad()
def decode(self, latents: torch.Tensor,
           fastvideo_args: FastVideoArgs) -> torch.Tensor:
    """
    Run the VAE decoder on a latent tensor and return pixels in [0, 1].

    The VAE and the latents are moved to the local torch device, the latent
    normalization is undone (per-channel mean/std from ``vae.config`` when
    both are present, otherwise ``scaling_factor`` / ``shift_factor`` on the
    VAE), the VAE decodes under autocast, and the result is mapped with
    ``x / 2 + 0.5`` and clamped to [0, 1].

    Args:
        latents: Input latent tensor with shape
            (batch, channels, frames, height_latents, width_latents)
        fastvideo_args: Configuration; this method reads:
            - disable_autocast: Whether to disable automatic mixed precision
            - pipeline_config.vae_precision: key into PRECISION_TO_TYPE
            - pipeline_config.vae_tiling: Whether to enable VAE tiling

    Returns:
        Decoded tensor clamped to the [0, 1] range. No CPU transfer or
        float32 cast is performed in this method.
    """
    device = get_local_torch_device()
    self.vae = self.vae.to(device)
    latents = latents.to(device)

    # Autocast is only useful for reduced precision and can be disabled
    # globally through the arguments.
    vae_dtype = PRECISION_TO_TYPE[
        fastvideo_args.pipeline_config.vae_precision]
    use_autocast = (not fastvideo_args.disable_autocast
                    and vae_dtype != torch.float32)

    vae_config = self.vae.config
    if (hasattr(vae_config, 'latents_mean')
            and hasattr(vae_config, 'latents_std')):
        # Denormalization for MatrixGame VAE: z = z * std + mean.
        # The statistics are broadcast over every axis except axis 1.
        stat_kwargs = dict(device=latents.device, dtype=latents.dtype)
        mean = torch.tensor(vae_config.latents_mean,
                            **stat_kwargs).view(1, -1, 1, 1, 1)
        std = torch.tensor(vae_config.latents_std,
                           **stat_kwargs).view(1, -1, 1, 1, 1)
        latents = latents * std + mean
    elif hasattr(self.vae, 'scaling_factor'):
        # Standard VAE scaling; tensor factors are aligned with the latents.
        scale = self.vae.scaling_factor
        if isinstance(scale, torch.Tensor):
            scale = scale.to(latents.device, latents.dtype)
        latents = latents / scale

        # Optional shift, applied in place on the freshly scaled tensor.
        shift = getattr(self.vae, "shift_factor", None)
        if shift is not None:
            if isinstance(shift, torch.Tensor):
                shift = shift.to(latents.device, latents.dtype)
            latents += shift

    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=use_autocast):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        # NOTE: the vae_sp / enable_parallel() path is currently disabled.
        if not use_autocast:
            # Without autocast the input must already be in the VAE dtype.
            latents = latents.to(vae_dtype)
        decoded = self.vae.decode(latents)

    # Map the decoder output to [0, 1] via x / 2 + 0.5, then clamp.
    return (decoded / 2 + 0.5).clamp(0, 1)

fastvideo.pipelines.stages.decoding.DecodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Decode latent representations into pixel space.

This method processes the batch through the VAE decoder, converting latent representations to pixel-space video/images. It also optionally decodes trajectory latents for visualization purposes.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch containing: - latents: Tensor to decode (batch, channels, frames, height_latents, width_latents) - return_trajectory_decoded (optional): Flag to decode trajectory latents - trajectory_latents (optional): Latents at different timesteps - trajectory_timesteps (optional): Corresponding timesteps	required
`fastvideo_args`	`FastVideoArgs`	Configuration containing: - output_type: "latent" to skip decoding, otherwise decode to pixels - vae_cpu_offload: Whether to offload VAE to CPU after decoding - model_loaded: Track VAE loading state - model_paths: Path to VAE model if loading needed	required

Returns:

Type	Description
`ForwardBatch`	Modified batch with: - output: Decoded frames (batch, channels, frames, height, width) as CPU float32 - trajectory_decoded (if requested): List of decoded frames per timestep

Source code in fastvideo/pipelines/stages/decoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Decode latent representations into pixel space.

    Loads the VAE on demand, decodes ``batch.latents`` (unless the output
    type is "latent"), optionally decodes every trajectory latent, crops
    LongCat refinement padding when the batch carries the relevant
    attributes, and finally offloads / frees the VAE when configured.

    Args:
        batch: The current batch containing:
            - latents: Tensor to decode (batch, channels, frames, height_latents, width_latents)
            - return_trajectory_decoded (optional): Flag to decode trajectory latents
            - trajectory_latents (optional): Latents at different timesteps
            - trajectory_timesteps (optional): Corresponding timesteps
        fastvideo_args: Configuration containing:
            - output_type: "latent" to skip decoding, otherwise decode to pixels
            - vae_cpu_offload: Whether to offload VAE to CPU after decoding
            - model_loaded: Track VAE loading state
            - model_paths: Path to VAE model if loading needed

    Returns:
        Modified batch with:
            - output: frames converted to CPU float32
            - trajectory_decoded (if requested): List of decoded frames per timestep
    """
    # The pipeline is held as a weak reference; resolve it once up front.
    pipeline = None if not self.pipeline else self.pipeline()

    # Lazy VAE load (used for memory constrained devices).
    if not fastvideo_args.model_loaded["vae"]:
        self.vae = VAELoader().load(fastvideo_args.model_paths["vae"],
                                    fastvideo_args)
        if pipeline:
            pipeline.add_module("vae", self.vae)
        fastvideo_args.model_loaded["vae"] = True

    keep_latents = fastvideo_args.output_type == "latent"
    frames = (batch.latents if keep_latents else self.decode(
        batch.latents, fastvideo_args))

    # Optionally decode every intermediate latent of the trajectory.
    if batch.return_trajectory_decoded:
        batch.trajectory_decoded = []
        assert batch.trajectory_latents is not None, "batch should have trajectory latents"
        num_steps = batch.trajectory_latents.shape[1]
        for step_idx in range(num_steps):
            # batch.trajectory_latents is [batch_size, timesteps, channels, frames, height, width]
            step_latent = batch.trajectory_latents[:, step_idx, :, :, :, :]
            step_timestep = batch.trajectory_timesteps[step_idx]
            logger.info("decoding trajectory latent for timestep: %s",
                        step_timestep)
            step_frames = self.decode(step_latent, fastvideo_args)
            batch.trajectory_decoded.append(step_frames.cpu().float())

    # Convert to CPU float32 for compatibility
    frames = frames.cpu().float()

    # Crop padding if this is a LongCat refinement
    has_crop_info = hasattr(batch, 'num_cond_frames_added') and hasattr(
        batch, 'new_frame_size_before_padding')
    if has_crop_info:
        crop_start = batch.num_cond_frames_added
        crop_len = batch.new_frame_size_before_padding
        if crop_start > 0 or frames.shape[2] != crop_len:
            crop_stop = crop_start + crop_len
            # frames is [B, C, T, H, W], crop temporal dimension
            frames = frames[:, :, crop_start:crop_stop, :, :]
            logger.info(
                "Cropped LongCat refinement padding: %s:%s, final shape: %s",
                crop_start, crop_stop, frames.shape)

    # Publish the result on the batch.
    batch.output = frames

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    if fastvideo_args.vae_cpu_offload:
        self.vae.to("cpu")

    # On MPS the VAE is dropped entirely and marked as not loaded, so the
    # next call reloads it.
    if torch.backends.mps.is_available():
        del self.vae
        if pipeline is not None and "vae" in pipeline.modules:
            del pipeline.modules["vae"]
        fastvideo_args.model_loaded["vae"] = False

    return batch

fastvideo.pipelines.stages.decoding.DecodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify decoding stage inputs.

Source code in fastvideo/pipelines/stages/decoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify decoding stage inputs.

    Registers a single check: ``batch.latents`` must satisfy ``V.is_tensor``
    and ``V.with_dims(5)``.

    Args:
        batch: Batch whose ``latents`` are checked.
        fastvideo_args: Not used by this method.

    Returns:
        The ``VerificationResult`` holding the registered check.
    """
    checks = VerificationResult()
    # Denoised latents for VAE decoding: [batch_size, channels, frames, height_latents, width_latents]
    latent_validators = [V.is_tensor, V.with_dims(5)]
    checks.add_check("latents", batch.latents, latent_validators)
    return checks

fastvideo.pipelines.stages.decoding.DecodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify decoding stage outputs.

Source code in fastvideo/pipelines/stages/decoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify decoding stage outputs.

    Registers a single check: ``batch.output`` must satisfy ``V.is_tensor``
    and ``V.with_dims(5)``.

    Args:
        batch: Batch whose ``output`` is checked.
        fastvideo_args: Not used by this method.

    Returns:
        The ``VerificationResult`` holding the registered check.
    """
    checks = VerificationResult()
    # Decoded video/images: [batch_size, channels, frames, height, width]
    output_validators = [V.is_tensor, V.with_dims(5)]
    checks.add_check("output", batch.output, output_validators)
    return checks

Functions¶

fastvideo.pipelines.stages.denoising ¶

Denoising stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.denoising.CosmosDenoisingStage ¶

CosmosDenoisingStage(transformer, scheduler, pipeline=None)

Bases: DenoisingStage

Denoising stage for Cosmos models using FlowMatchEulerDiscreteScheduler.

Source code in fastvideo/pipelines/stages/denoising.py

def __init__(self, transformer, scheduler, pipeline=None) -> None:
    """Forward the constructor arguments to the parent stage initializer.

    Args:
        transformer: Passed through as the parent's ``transformer``.
        scheduler: Passed through as the parent's ``scheduler``.
        pipeline: Optional owning pipeline, passed through unchanged.
    """
    super().__init__(transformer=transformer,
                     scheduler=scheduler,
                     pipeline=pipeline)

Functions¶

fastvideo.pipelines.stages.denoising.CosmosDenoisingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify Cosmos denoising stage inputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify Cosmos denoising stage inputs.

    Registers checks for ``latents``, ``prompt_embeds``,
    ``num_inference_steps``, ``guidance_scale``,
    ``do_classifier_free_guidance`` and ``negative_prompt_embeds``. The
    negative embeddings only have to be non-empty when classifier-free
    guidance is enabled on the batch.

    Args:
        batch: Batch whose fields are checked.
        fastvideo_args: Not used by this method.

    Returns:
        The ``VerificationResult`` holding all registered checks.
    """

    def _negative_embeds_ok(embeds):
        # Without CFG the negative embeddings are never consulted.
        if not batch.do_classifier_free_guidance:
            return True
        return V.list_not_empty(embeds)

    planned_checks = (
        ("latents", batch.latents, [V.is_tensor, V.with_dims(5)]),
        ("prompt_embeds", batch.prompt_embeds, V.list_not_empty),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, V.positive_float),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        ("negative_prompt_embeds", batch.negative_prompt_embeds,
         _negative_embeds_ok),
    )

    outcome = VerificationResult()
    for field_name, field_value, validator in planned_checks:
        outcome.add_check(field_name, field_value, validator)
    return outcome

fastvideo.pipelines.stages.denoising.CosmosDenoisingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify Cosmos denoising stage outputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Verify Cosmos denoising stage outputs.

    Registers a single check: ``batch.latents`` must satisfy ``V.is_tensor``
    and ``V.with_dims(5)``.

    Args:
        batch: Batch whose ``latents`` are checked.
        fastvideo_args: Not used by this method.

    Returns:
        The ``VerificationResult`` holding the registered check.
    """
    outcome = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, latent_validators)
    return outcome

fastvideo.pipelines.stages.denoising.DenoisingStage ¶

DenoisingStage(transformer, scheduler, pipeline=None, transformer_2=None, vae=None)

Bases: PipelineStage

Stage for running the denoising loop in diffusion pipelines.

This stage handles the iterative denoising process that transforms the initial noise into the final output.

Source code in fastvideo/pipelines/stages/denoising.py

def __init__(self,
             transformer,
             scheduler,
             pipeline=None,
             transformer_2=None,
             vae=None) -> None:
    """Store the stage's models and pick an attention backend.

    Args:
        transformer: Main transformer; its ``hidden_size`` and
            ``num_attention_heads`` determine the attention head size.
        scheduler: Scheduler stored on the stage.
        pipeline: Optional owning pipeline; only a weak reference is kept.
        transformer_2: Optional second transformer stored on the stage.
        vae: Optional VAE stored on the stage.
    """
    super().__init__()
    self.transformer = transformer
    self.transformer_2 = transformer_2
    self.scheduler = scheduler
    self.vae = vae
    if pipeline:
        # Weak reference so the stage does not keep the pipeline alive.
        self.pipeline = weakref.ref(pipeline)
    else:
        self.pipeline = None

    head_size = (self.transformer.hidden_size //
                 self.transformer.num_attention_heads)
    # Backends offered to the selector (hack).
    candidate_backends = (
        AttentionBackendEnum.SLIDING_TILE_ATTN,
        AttentionBackendEnum.VIDEO_SPARSE_ATTN,
        AttentionBackendEnum.VMOBA_ATTN,
        AttentionBackendEnum.FLASH_ATTN,
        AttentionBackendEnum.TORCH_SDPA,
        AttentionBackendEnum.SAGE_ATTN_THREE,
    )
    self.attn_backend = get_attn_backend(
        head_size=head_size,
        dtype=torch.float16,  # TODO(will): hack
        supported_attention_backends=candidate_backends,
    )

Functions¶

fastvideo.pipelines.stages.denoising.DenoisingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Run the denoising loop.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with denoised latents.

Source code in fastvideo/pipelines/stages/denoising.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run the denoising loop.

    For every timestep in ``batch.timesteps`` the selected transformer
    predicts the noise (with a second, negative-prompt pass when
    classifier-free guidance is enabled) and ``self.scheduler.step``
    advances the latents. The transformer is loaded on demand when
    ``fastvideo_args.model_loaded["transformer"]`` is false.

    Args:
        batch: The current batch information. Read here, among others:
            timesteps, num_inference_steps, latents, prompt / negative
            prompt embeddings, image / video latents, guidance settings and
            the trajectory flags.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with denoised latents in ``batch.latents``. When
        ``batch.return_trajectory_latents`` is set, ``trajectory_latents``
        and ``trajectory_timesteps`` are filled in as CPU tensors.

    Raises:
        ValueError: If ``batch.timesteps`` is ``None``.
    """
    # The pipeline is held as a weak reference; resolve it once up front.
    pipeline = self.pipeline() if self.pipeline else None
    if not fastvideo_args.model_loaded["transformer"]:
        loader = TransformerLoader()
        self.transformer = loader.load(
            fastvideo_args.model_paths["transformer"], fastvideo_args)
        if pipeline:
            pipeline.add_module("transformer", self.transformer)
        fastvideo_args.model_loaded["transformer"] = True

    # Prepare extra step kwargs for scheduler (only those its step accepts)
    extra_step_kwargs = self.prepare_extra_func_kwargs(
        self.scheduler.step,
        {
            "generator": batch.generator,
            "eta": batch.eta
        },
    )

    # Setup precision and autocast settings
    # TODO(will): make the precision configurable for inference
    # target_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]
    target_dtype = torch.bfloat16
    autocast_enabled = (target_dtype != torch.float32
                        ) and not fastvideo_args.disable_autocast

    # Get timesteps and calculate warmup steps
    timesteps = batch.timesteps
    # TODO(will): remove this once we add input/output validation for stages
    if timesteps is None:
        raise ValueError("Timesteps must be provided")
    num_inference_steps = batch.num_inference_steps
    num_warmup_steps = len(
        timesteps) - num_inference_steps * self.scheduler.order

    # Prepare image latents and embeddings for I2V generation
    image_embeds = batch.image_embeds
    if len(image_embeds) > 0:
        assert not torch.isnan(
            image_embeds[0]).any(), "image_embeds contains nan"
        image_embeds = [
            image_embed.to(target_dtype) for image_embed in image_embeds
        ]

    # Each kwargs group below is filtered down to what the transformer's
    # forward signature actually accepts.
    image_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_image": image_embeds,
            "mask_strategy": dict_to_3d_list(
                None, t_max=50, l_max=60, h_max=24)
        },
    )

    pos_cond_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_2": batch.clip_embedding_pos,
            "encoder_attention_mask": batch.prompt_attention_mask,
        },
    )

    neg_cond_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_2": batch.clip_embedding_neg,
            "encoder_attention_mask": batch.negative_attention_mask,
        },
    )

    action_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "mouse_cond": batch.mouse_cond,
            "keyboard_cond": batch.keyboard_cond,
        },
    )

    # Prepare STA parameters
    if st_attn_available and self.attn_backend == SlidingTileAttentionBackend:
        self.prepare_sta_param(batch, fastvideo_args)

    # Get latents and embeddings
    latents = batch.latents
    prompt_embeds = batch.prompt_embeds
    assert not torch.isnan(
        prompt_embeds[0]).any(), "prompt_embeds contains nan"
    if batch.do_classifier_free_guidance:
        neg_prompt_embeds = batch.negative_prompt_embeds
        assert neg_prompt_embeds is not None
        assert not torch.isnan(
            neg_prompt_embeds[0]).any(), "neg_prompt_embeds contains nan"

    # (Wan2.2) Calculate timestep to switch from high noise expert to low noise expert
    boundary_ratio = fastvideo_args.pipeline_config.dit_config.boundary_ratio
    if batch.boundary_ratio is not None:
        logger.info("Overriding boundary ratio from %s to %s",
                    boundary_ratio, batch.boundary_ratio)
        boundary_ratio = batch.boundary_ratio

    if boundary_ratio is not None:
        boundary_timestep = boundary_ratio * self.scheduler.num_train_timesteps
    else:
        boundary_timestep = None
    latent_model_input = latents.to(target_dtype)
    assert latent_model_input.shape[0] == 1, "only support batch size 1"

    if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
        # TI2V directly replaces the first frame of the latent with
        # the image latent instead of appending along the channel dim
        assert batch.image_latent is None, "TI2V task should not have image latents"
        assert self.vae is not None, "VAE is not provided for TI2V task"
        z = self.vae.encode(batch.pil_image).mean.float()
        if (hasattr(self.vae, "shift_factor")
                and self.vae.shift_factor is not None):
            if isinstance(self.vae.shift_factor, torch.Tensor):
                z -= self.vae.shift_factor.to(z.device, z.dtype)
            else:
                z -= self.vae.shift_factor

        if isinstance(self.vae.scaling_factor, torch.Tensor):
            z = z * self.vae.scaling_factor.to(z.device, z.dtype)
        else:
            z = z * self.vae.scaling_factor

        latent_model_input = latent_model_input.squeeze(0)
        _, mask2 = masks_like([latent_model_input], zero=True)

        # Blend: where the mask is 0 take the encoded image, elsewhere keep
        # the current latent.
        latent_model_input = (1. -
                              mask2[0]) * z + mask2[0] * latent_model_input
        # latent_model_input = latent_model_input.unsqueeze(0)
        latent_model_input = latent_model_input.to(get_local_torch_device())
        latents = latent_model_input
        F = batch.num_frames
        temporal_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_temporal
        spatial_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_spatial
        patch_size = fastvideo_args.pipeline_config.dit_config.arch_config.patch_size
        seq_len = ((F - 1) // temporal_scale +
                   1) * (batch.height // spatial_scale) * (
                       batch.width // spatial_scale) // (patch_size[1] *
                                                         patch_size[2])

    # Initialize lists for ODE trajectory
    trajectory_timesteps: list[torch.Tensor] = []
    trajectory_latents: list[torch.Tensor] = []

    # Run denoising loop
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # Skip if interrupted
            if hasattr(self, 'interrupt') and self.interrupt:
                continue

            if boundary_timestep is None or t >= boundary_timestep:
                # High-noise (or only) expert; move the other one off the
                # GPU first when DiT CPU offload is enabled.
                if (fastvideo_args.dit_cpu_offload
                        and self.transformer_2 is not None and next(
                            self.transformer_2.parameters()).device.type
                        == 'cuda'):
                    self.transformer_2.to('cpu')
                current_model = self.transformer
                current_guidance_scale = batch.guidance_scale
            else:
                # low-noise stage in wan2.2
                if fastvideo_args.dit_cpu_offload and next(
                        self.transformer.parameters(
                        )).device.type == 'cuda':
                    self.transformer.to('cpu')
                current_model = self.transformer_2
                current_guidance_scale = batch.guidance_scale_2
            assert current_model is not None, "current_model is None"

            # Expand latents for V2V/I2V
            latent_model_input = latents.to(target_dtype)
            if batch.video_latent is not None:
                latent_model_input = torch.cat([
                    latent_model_input, batch.video_latent,
                    torch.zeros_like(latents)
                ],
                                               dim=1).to(target_dtype)
            elif batch.image_latent is not None:
                assert not fastvideo_args.pipeline_config.ti2v_task, "image latents should not be provided for TI2V task"
                latent_model_input = torch.cat(
                    [latent_model_input, batch.image_latent],
                    dim=1).to(target_dtype)

            assert not torch.isnan(
                latent_model_input).any(), "latent_model_input contains nan"
            if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
                # Per-token timesteps: masked positions get 0, the rest t,
                # padded with t up to seq_len.
                timestep = torch.stack([t]).to(get_local_torch_device())
                temp_ts = (mask2[0][0][:, ::2, ::2] * timestep).flatten()
                temp_ts = torch.cat([
                    temp_ts,
                    temp_ts.new_ones(seq_len - temp_ts.size(0)) * timestep
                ])
                timestep = temp_ts.unsqueeze(0)
                t_expand = timestep.repeat(latent_model_input.shape[0], 1)
            else:
                t_expand = t.repeat(latent_model_input.shape[0])

            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t)

            # Prepare inputs for transformer
            guidance_expand = (
                torch.tensor(
                    [fastvideo_args.pipeline_config.embedded_cfg_scale] *
                    latent_model_input.shape[0],
                    dtype=torch.float32,
                    device=get_local_torch_device(),
                ).to(target_dtype) *
                1000.0 if fastvideo_args.pipeline_config.embedded_cfg_scale
                is not None else None)

            # Predict noise residual
            with torch.autocast(device_type="cuda",
                                dtype=target_dtype,
                                enabled=autocast_enabled):
                if (st_attn_available
                        and self.attn_backend == SlidingTileAttentionBackend
                    ) or (vsa_available and self.attn_backend
                          == VideoSparseAttentionBackend):
                    self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls(
                    )

                    if self.attn_metadata_builder_cls is not None:
                        self.attn_metadata_builder = self.attn_metadata_builder_cls(
                        )
                        # TODO(will): clean this up
                        attn_metadata = self.attn_metadata_builder.build(  # type: ignore
                            current_timestep=i,  # type: ignore
                            raw_latent_shape=batch.
                            raw_latent_shape[2:5],  # type: ignore
                            patch_size=fastvideo_args.
                            pipeline_config.  # type: ignore
                            dit_config.patch_size,  # type: ignore
                            STA_param=batch.STA_param,  # type: ignore
                            VSA_sparsity=fastvideo_args.
                            VSA_sparsity,  # type: ignore
                            device=get_local_torch_device(),
                        )
                        assert attn_metadata is not None, "attn_metadata cannot be None"
                    else:
                        attn_metadata = None
                elif (vmoba_attn_available
                      and self.attn_backend == VMOBAAttentionBackend):
                    self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls(
                    )
                    if self.attn_metadata_builder_cls is not None:
                        self.attn_metadata_builder = self.attn_metadata_builder_cls(
                        )
                        # Prepare V-MoBA parameters from config
                        moba_params = fastvideo_args.moba_config.copy()
                        moba_params.update({
                            "current_timestep":
                            i,
                            "raw_latent_shape":
                            batch.raw_latent_shape[2:5],
                            "patch_size":
                            fastvideo_args.pipeline_config.dit_config.
                            patch_size,
                            "device":
                            get_local_torch_device(),
                        })
                        attn_metadata = self.attn_metadata_builder.build(
                            **moba_params)
                        assert attn_metadata is not None, "attn_metadata cannot be None"
                    else:
                        attn_metadata = None
                else:
                    attn_metadata = None
                # TODO(will): finalize the interface. vLLM uses this to
                # support torch dynamo compilation. They pass in
                # attn_metadata, vllm_config, and num_tokens. We can pass in
                # fastvideo_args or training_args, and attn_metadata.
                batch.is_cfg_negative = False
                with set_forward_context(
                        current_timestep=i,
                        attn_metadata=attn_metadata,
                        forward_batch=batch,
                        # fastvideo_args=fastvideo_args
                ):
                    # Run transformer
                    noise_pred = current_model(
                        latent_model_input,
                        prompt_embeds,
                        t_expand,
                        guidance=guidance_expand,
                        **image_kwargs,
                        **pos_cond_kwargs,
                        **action_kwargs,
                    )

                if batch.do_classifier_free_guidance:
                    # Second pass with the negative prompt for CFG.
                    batch.is_cfg_negative = True
                    with set_forward_context(
                            current_timestep=i,
                            attn_metadata=attn_metadata,
                            forward_batch=batch,
                    ):
                        noise_pred_uncond = current_model(
                            latent_model_input,
                            neg_prompt_embeds,
                            t_expand,
                            guidance=guidance_expand,
                            **image_kwargs,
                            **neg_cond_kwargs,
                            **action_kwargs,
                        )

                    noise_pred_text = noise_pred
                    noise_pred = noise_pred_uncond + current_guidance_scale * (
                        noise_pred_text - noise_pred_uncond)

                    # Apply guidance rescale if needed
                    if batch.guidance_rescale > 0.0:
                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                        noise_pred = self.rescale_noise_cfg(
                            noise_pred,
                            noise_pred_text,
                            guidance_rescale=batch.guidance_rescale,
                        )
                # Compute the previous noisy sample
                latents = self.scheduler.step(noise_pred,
                                              t,
                                              latents,
                                              **extra_step_kwargs,
                                              return_dict=False)[0]
                if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
                    # Re-impose the encoded image on the masked positions
                    # after every scheduler step.
                    latents = latents.squeeze(0)
                    latents = (1. - mask2[0]) * z + mask2[0] * latents
                    # latents = latents.unsqueeze(0)

            # save trajectory latents if needed
            if batch.return_trajectory_latents:
                trajectory_timesteps.append(t)
                trajectory_latents.append(latents)

            # Update progress bar. The None guard must cover the whole
            # condition: previously it only protected the second operand,
            # so the final step called update() on a None progress bar.
            is_last_step = i == len(timesteps) - 1
            is_scheduler_step = ((i + 1) > num_warmup_steps
                                 and (i + 1) % self.scheduler.order == 0)
            if progress_bar is not None and (is_last_step
                                             or is_scheduler_step):
                progress_bar.update()

    # Stack the recorded trajectory (if any) and hand it back on CPU.
    if trajectory_latents:
        trajectory_tensor = torch.stack(trajectory_latents, dim=1)
        trajectory_timesteps_tensor = torch.stack(trajectory_timesteps,
                                                  dim=0)
        batch.trajectory_timesteps = trajectory_timesteps_tensor.cpu()
        batch.trajectory_latents = trajectory_tensor.cpu()

    # Update batch with final latents
    batch.latents = latents

    # Save STA mask search results if needed
    if st_attn_available and self.attn_backend == SlidingTileAttentionBackend and fastvideo_args.STA_mode == STA_Mode.STA_SEARCHING:
        self.save_sta_search_results(batch)

    # deallocate transformer if on mps
    if torch.backends.mps.is_available():
        logger.info("Memory before deallocating transformer: %s",
                    torch.mps.current_allocated_memory())
        del self.transformer
        if pipeline is not None and "transformer" in pipeline.modules:
            del pipeline.modules["transformer"]
        fastvideo_args.model_loaded["transformer"] = False
        logger.info("Memory after deallocating transformer: %s",
                    torch.mps.current_allocated_memory())

    return batch

fastvideo.pipelines.stages.denoising.DenoisingStage.prepare_extra_func_kwargs ¶

prepare_extra_func_kwargs(func, kwargs) -> dict[str, Any]

Prepare extra kwargs for the scheduler step / denoise step.

Parameters:

Name	Type	Description	Default
`func`		The function to prepare kwargs for.	required
`kwargs`		The kwargs to prepare.	required

Returns:

Type	Description
`dict[str, Any]`	The prepared kwargs.

Source code in fastvideo/pipelines/stages/denoising.py

def prepare_extra_func_kwargs(self, func, kwargs) -> dict[str, Any]:
    """
    Prepare extra kwargs for the scheduler step / denoise step.

    Only the entries of ``kwargs`` whose key is a parameter name in the
    signature of ``func`` are kept; every other entry is dropped.

    Args:
        func: The function to prepare kwargs for.
        kwargs: The kwargs to prepare.

    Returns:
        The prepared kwargs.
    """
    if not kwargs:
        # Nothing to filter, so skip signature introspection entirely.
        return {}
    # The signature is the same for every key: inspect it once rather than
    # once per candidate kwarg.
    accepted = inspect.signature(func).parameters
    return {k: v for k, v in kwargs.items() if k in accepted}

fastvideo.pipelines.stages.denoising.DenoisingStage.prepare_sta_param ¶

prepare_sta_param(batch: ForwardBatch, fastvideo_args: FastVideoArgs)

Prepare Sliding Tile Attention (STA) parameters and settings.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Source code in fastvideo/pipelines/stages/denoising.py

def prepare_sta_param(self, batch: ForwardBatch,
                      fastvideo_args: FastVideoArgs):
    """
    Prepare Sliding Tile Attention (STA) parameters and settings.

    Builds the STA configuration for the current ``STA_mode`` through
    ``configure_sta`` and stores it on ``batch.STA_param``. It also resets
    ``batch.mask_search_final_result_pos`` / ``_neg`` to one empty list per
    timestep.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Raises:
        ValueError: If ``batch.timesteps`` is missing, if
            ``FASTVIDEO_ATTENTION_CONFIG`` is unset in inference mode, or if
            ``STA_mode`` is not one of the handled modes.
        NotImplementedError: If mask search/tuning is requested for an
            unsupported resolution.
    """
    # TODO(kevin): STA mask search, currently only support Wan2.1 with 69x768x1280
    from fastvideo.attention.backends.STA_configuration import configure_sta
    STA_mode = fastvideo_args.STA_mode
    skip_time_steps = fastvideo_args.skip_time_steps
    if batch.timesteps is None:
        raise ValueError("Timesteps must be provided")
    timesteps_num = batch.timesteps.shape[0]

    logger.info("STA_mode: %s", STA_mode)
    # NOTE(review): this compares against the raw string "STA_inference"
    # while the rest of the method uses STA_Mode members -- confirm that
    # STA_Mode compares equal to its string value.
    if (batch.num_frames, batch.height,
            batch.width) != (69, 768, 1280) and STA_mode != "STA_inference":
        raise NotImplementedError(
            "STA mask search/tuning is not supported for this resolution")

    if STA_mode == STA_Mode.STA_SEARCHING or STA_mode == STA_Mode.STA_TUNING or STA_mode == STA_Mode.STA_TUNING_CFG:
        size = (batch.width, batch.height)
        if size == (1280, 768):
            # TODO: make it configurable
            sparse_mask_candidates_searching = [
                "3, 1, 10", "1, 5, 7", "3, 3, 3", "1, 6, 5", "1, 3, 10",
                "3, 6, 1"
            ]
            sparse_mask_candidates_tuning = [
                "3, 1, 10", "1, 5, 7", "3, 3, 3", "1, 6, 5", "1, 3, 10",
                "3, 6, 1"
            ]
            full_mask = ["3,6,10"]
        else:
            raise NotImplementedError(
                "STA mask search is not supported for this resolution")
    layer_num = self.transformer.config.num_layers
    # specific for HunyuanVideo
    if hasattr(self.transformer.config, "num_single_layers"):
        layer_num += self.transformer.config.num_single_layers
    head_num = self.transformer.config.num_attention_heads

    if STA_mode == STA_Mode.STA_SEARCHING:
        STA_param = configure_sta(
            mode=STA_Mode.STA_SEARCHING,
            layer_num=layer_num,
            head_num=head_num,
            time_step_num=timesteps_num,
            mask_candidates=sparse_mask_candidates_searching +
            full_mask,  # last is full mask; Can add more sparse masks while keep last one as full mask
        )
    elif STA_mode == STA_Mode.STA_TUNING:
        STA_param = configure_sta(
            mode=STA_Mode.STA_TUNING,
            layer_num=layer_num,
            head_num=head_num,
            time_step_num=timesteps_num,
            mask_search_files_path=
            f'output/mask_search_result_pos_{size[0]}x{size[1]}/',
            mask_candidates=sparse_mask_candidates_tuning,
            full_attention_mask=[int(x) for x in full_mask[0].split(',')],
            skip_time_steps=
            skip_time_steps,  # taken from fastvideo_args.skip_time_steps (not a fixed count)
            save_dir=
            f'output/mask_search_strategy_{size[0]}x{size[1]}/',  # Custom save directory
            timesteps=timesteps_num)
    elif STA_mode == STA_Mode.STA_TUNING_CFG:
        STA_param = configure_sta(
            mode=STA_Mode.STA_TUNING_CFG,
            layer_num=layer_num,
            head_num=head_num,
            time_step_num=timesteps_num,
            mask_search_files_path_pos=
            f'output/mask_search_result_pos_{size[0]}x{size[1]}/',
            mask_search_files_path_neg=
            f'output/mask_search_result_neg_{size[0]}x{size[1]}/',
            mask_candidates=sparse_mask_candidates_tuning,
            full_attention_mask=[int(x) for x in full_mask[0].split(',')],
            skip_time_steps=skip_time_steps,
            save_dir=f'output/mask_search_strategy_{size[0]}x{size[1]}/',
            timesteps=timesteps_num)
    elif STA_mode == STA_Mode.STA_INFERENCE:
        import fastvideo.envs as envs
        config_file = envs.FASTVIDEO_ATTENTION_CONFIG
        if config_file is None:
            raise ValueError("FASTVIDEO_ATTENTION_CONFIG is not set")
        STA_param = configure_sta(mode=STA_Mode.STA_INFERENCE,
                                  layer_num=layer_num,
                                  head_num=head_num,
                                  time_step_num=timesteps_num,
                                  load_path=config_file)
    else:
        # Without this branch an unhandled mode would reach the assignment
        # below with STA_param unbound and fail with UnboundLocalError.
        raise ValueError(f"Unsupported STA_mode: {STA_mode}")

    batch.STA_param = STA_param
    batch.mask_search_final_result_pos = [[] for _ in range(timesteps_num)]
    batch.mask_search_final_result_neg = [[] for _ in range(timesteps_num)]

fastvideo.pipelines.stages.denoising.DenoisingStage.progress_bar ¶

progress_bar(iterable: Iterable | None = None, total: int | None = None) -> tqdm

Create a progress bar for the denoising process.

Parameters:

Name	Type	Description	Default
`iterable`	`Iterable \| None`	The iterable to iterate over.	`None`
`total`	`int \| None`	The total number of items.	`None`

Returns:

Type	Description
`tqdm`	A tqdm progress bar.

Source code in fastvideo/pipelines/stages/denoising.py

def progress_bar(self,
                 iterable: Iterable | None = None,
                 total: int | None = None) -> tqdm:
    """
    Build the tqdm progress bar used by the denoising loop.

    The bar is only active on local rank 0; every other rank receives a
    disabled bar so that a single bar is rendered per node.

    Args:
        iterable: The iterable to iterate over.
        total: The total number of items.

    Returns:
        A tqdm progress bar.
    """
    on_primary_rank = get_world_group().local_rank == 0
    if not on_primary_rank:
        return tqdm(iterable=iterable, total=total, disable=True)
    return tqdm(iterable=iterable, total=total)

fastvideo.pipelines.stages.denoising.DenoisingStage.rescale_noise_cfg ¶

rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0) -> Tensor

Rescale noise prediction according to guidance_rescale.

Based on findings of "Common Diffusion Noise Schedules and Sample Steps are Flawed" (https://arxiv.org/pdf/2305.08891.pdf), Section 3.4.

Parameters:

Name	Description	Default
`noise_cfg`	The noise prediction with guidance.	required
`noise_pred_text`	The text-conditioned noise prediction.	required
`guidance_rescale`	The guidance rescale factor.	`0.0`

Returns:

Type	Description
`Tensor`	The rescaled noise prediction.

Source code in fastvideo/pipelines/stages/denoising.py

def rescale_noise_cfg(self,
                      noise_cfg,
                      noise_pred_text,
                      guidance_rescale=0.0) -> torch.Tensor:
    """
    Rescale a guided noise prediction according to ``guidance_rescale``.

    Follows Section 3.4 of "Common Diffusion Noise Schedules and Sample
    Steps are Flawed" (https://arxiv.org/pdf/2305.08891.pdf).

    Args:
        noise_cfg: The noise prediction with guidance.
        noise_pred_text: The text-conditioned noise prediction.
        guidance_rescale: The guidance rescale factor.

    Returns:
        The rescaled noise prediction.
    """
    # Standard deviation per sample, reduced over every dim except dim 0.
    text_reduce_dims = list(range(1, noise_pred_text.ndim))
    cfg_reduce_dims = list(range(1, noise_cfg.ndim))
    sigma_text = noise_pred_text.std(dim=text_reduce_dims, keepdim=True)
    sigma_cfg = noise_cfg.std(dim=cfg_reduce_dims, keepdim=True)

    # Match the guided prediction's spread to the text-conditioned one
    # (fixes overexposure).
    rescaled = noise_cfg * (sigma_text / sigma_cfg)

    # Blend the rescaled and the original guided prediction.
    return (guidance_rescale * rescaled +
            (1 - guidance_rescale) * noise_cfg)

fastvideo.pipelines.stages.denoising.DenoisingStage.save_sta_search_results ¶

save_sta_search_results(batch: ForwardBatch)

Save the STA mask search results.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required

Source code in fastvideo/pipelines/stages/denoising.py

def save_sta_search_results(self, batch: ForwardBatch):
    """
    Persist the STA mask search results collected on the batch.

    The positive and negative result lists are written, in that order, via
    ``save_mask_search_results``. A list is skipped when it is ``None`` or
    when ``batch.prompt`` is ``None``.

    Args:
        batch: The current batch information.

    Raises:
        NotImplementedError: If the batch is not 1280x768 (width x height).
    """
    size = (batch.width, batch.height)
    if size != (1280, 768):
        raise NotImplementedError(
            "STA mask search is not supported for this resolution")

    # TODO: make it configurable
    sparse_mask_candidates_searching = [
        "3, 1, 10", "1, 5, 7", "3, 3, 3", "1, 6, 5", "1, 3, 10", "3, 6, 1"
    ]

    from fastvideo.attention.backends.STA_configuration import save_mask_search_results

    def _dump(per_step_results, output_dir):
        # Write one result list, unless it or the prompt is missing.
        if per_step_results is None or batch.prompt is None:
            return
        save_mask_search_results(
            [dict(layer_data) for layer_data in per_step_results],
            prompt=str(batch.prompt),
            mask_strategies=sparse_mask_candidates_searching,
            output_dir=output_dir)

    _dump(batch.mask_search_final_result_pos,
          f'output/mask_search_result_pos_{size[0]}x{size[1]}/')
    _dump(batch.mask_search_final_result_neg,
          f'output/mask_search_result_neg_{size[0]}x{size[1]}/')

fastvideo.pipelines.stages.denoising.DenoisingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify denoising stage inputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify denoising stage inputs.

    Registers one check per batch field the denoising loop consumes and
    returns the collected result.
    """
    outcome = VerificationResult()
    # (field name, value, validator(s)) -- registered in this order.
    field_checks = [
        ("timesteps", batch.timesteps, [V.is_tensor, V.min_dims(1)]),
        ("latents", batch.latents, [V.is_tensor, V.with_dims(5)]),
        ("prompt_embeds", batch.prompt_embeds, V.list_not_empty),
        ("image_embeds", batch.image_embeds, V.is_list),
        ("image_latent", batch.image_latent, V.none_or_tensor_with_dims(5)),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, V.positive_float),
        ("eta", batch.eta, V.non_negative_float),
        ("generator", batch.generator, V.generator_or_list_generators),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        # Negative embeddings are only required when CFG is enabled.
        ("negative_prompt_embeds", batch.negative_prompt_embeds, lambda x:
         not batch.do_classifier_free_guidance or V.list_not_empty(x)),
    ]
    for field_name, value, validators in field_checks:
        outcome.add_check(field_name, value, validators)
    return outcome

fastvideo.pipelines.stages.denoising.DenoisingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify denoising stage outputs.

Source code in fastvideo/pipelines/stages/denoising.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify denoising stage outputs.

    The only check is that ``batch.latents`` is a 5-dimensional tensor.
    """
    outcome = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, latent_validators)
    return outcome

fastvideo.pipelines.stages.denoising.DmdDenoisingStage ¶

DmdDenoisingStage(transformer, scheduler)

Bases: DenoisingStage

Denoising stage for DMD.

Source code in fastvideo/pipelines/stages/denoising.py

def __init__(self, transformer, scheduler) -> None:
    """
    Initialize the DMD denoising stage.

    Args:
        transformer: Forwarded unchanged to the parent constructor.
        scheduler: Forwarded to the parent constructor. Note that
            ``self.scheduler`` is reassigned right afterwards, so this
            argument does not determine the scheduler used by this stage.
    """
    super().__init__(transformer, scheduler)
    # Unconditionally use a flow-matching Euler scheduler with shift=8.0,
    # regardless of the `scheduler` argument.
    self.scheduler = FlowMatchEulerDiscreteScheduler(shift=8.0)

Functions¶

fastvideo.pipelines.stages.denoising.DmdDenoisingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Run the denoising loop.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with denoised latents.

Source code in fastvideo/pipelines/stages/denoising.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run the denoising loop.

    The loop does not iterate over ``batch.timesteps``: that tensor is only
    checked for presence and used to derive ``num_warmup_steps``. The steps
    actually executed come from
    ``fastvideo_args.pipeline_config.dmd_denoising_steps``.

    Each step runs the transformer, converts its output to a video
    prediction with ``pred_noise_to_pred_video`` and, except on the last
    step, re-noises that prediction to the next timestep with
    ``self.scheduler.add_noise``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with denoised latents.

    Raises:
        ValueError: If ``batch.timesteps`` is None.
    """
    # Setup precision and autocast settings
    # TODO(will): make the precision configurable for inference
    # target_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]
    target_dtype = torch.bfloat16
    autocast_enabled = (target_dtype != torch.float32
                        ) and not fastvideo_args.disable_autocast

    # Get timesteps and calculate warmup steps
    timesteps = batch.timesteps

    # TODO(will): remove this once we add input/output validation for stages
    if timesteps is None:
        raise ValueError("Timesteps must be provided")
    num_inference_steps = batch.num_inference_steps
    # NOTE(review): computed from len(batch.timesteps), but `timesteps` is
    # replaced by the DMD step list further down before the loop runs.
    num_warmup_steps = len(
        timesteps) - num_inference_steps * self.scheduler.order

    # Prepare image latents and embeddings for I2V generation
    image_embeds = batch.image_embeds
    if len(image_embeds) > 0:
        assert torch.isnan(image_embeds[0]).sum() == 0
        image_embeds = [
            image_embed.to(target_dtype) for image_embed in image_embeds
        ]

    # prepare_extra_func_kwargs keeps only the keys that
    # self.transformer.forward accepts as parameters.
    image_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_image": image_embeds,
            "mask_strategy": dict_to_3d_list(
                None, t_max=50, l_max=60, h_max=24)
        },
    )

    pos_cond_kwargs = self.prepare_extra_func_kwargs(
        self.transformer.forward,
        {
            "encoder_hidden_states_2": batch.clip_embedding_pos,
            "encoder_attention_mask": batch.prompt_attention_mask,
        },
    )

    # Prepare STA parameters
    if st_attn_available and self.attn_backend == SlidingTileAttentionBackend:
        self.prepare_sta_param(batch, fastvideo_args)

    # Get latents and embeddings
    assert batch.latents is not None, "latents must be provided"
    latents = batch.latents

    # Shape used for the fresh noise drawn after every non-final step.
    video_raw_latent_shape = latents.shape
    prompt_embeds = batch.prompt_embeds
    assert not torch.isnan(
        prompt_embeds[0]).any(), "prompt_embeds contains nan"
    # From here on `timesteps` is the configured DMD step list, not
    # batch.timesteps.
    timesteps = torch.tensor(
        fastvideo_args.pipeline_config.dmd_denoising_steps,
        dtype=torch.long,
        device=get_local_torch_device())

    # Run denoising loop
    with self.progress_bar(total=len(timesteps)) as progress_bar:
        for i, t in enumerate(timesteps):
            # Skip if interrupted
            # NOTE(review): `continue` (not `break`) means every remaining
            # iteration is visited and skipped individually, and the
            # progress bar is not advanced for them.
            if hasattr(self, 'interrupt') and self.interrupt:
                continue
            # Expand latents for I2V
            # Copy of the current latents, passed as the noisy input to
            # pred_noise_to_pred_video below.
            noise_latents = latents.clone()
            latent_model_input = latents.to(target_dtype)

            if batch.image_latent is not None:
                latent_model_input = torch.cat([
                    latent_model_input,
                    batch.image_latent.permute(0, 2, 1, 3, 4)
                ],
                                               dim=2).to(target_dtype)
            assert not torch.isnan(
                latent_model_input).any(), "latent_model_input contains nan"

            # Prepare inputs for transformer
            t_expand = t.repeat(latent_model_input.shape[0])
            # None when embedded_cfg_scale is unset; otherwise one entry per
            # element of dim 0, scaled by 1000.
            guidance_expand = (
                torch.tensor(
                    [fastvideo_args.pipeline_config.embedded_cfg_scale] *
                    latent_model_input.shape[0],
                    dtype=torch.float32,
                    device=get_local_torch_device(),
                ).to(target_dtype) *
                1000.0 if fastvideo_args.pipeline_config.embedded_cfg_scale
                is not None else None)

            # Predict noise residual
            # NOTE(review): autocast device_type is hard-coded to "cuda".
            with torch.autocast(device_type="cuda",
                                dtype=target_dtype,
                                enabled=autocast_enabled):
                # Attention metadata is only built for the video sparse
                # attention backend; otherwise it stays None.
                if (vsa_available and self.attn_backend
                        == VideoSparseAttentionBackend):
                    self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls(
                    )

                    if self.attn_metadata_builder_cls is not None:
                        self.attn_metadata_builder = self.attn_metadata_builder_cls(
                        )
                        # TODO(will): clean this up
                        attn_metadata = self.attn_metadata_builder.build(  # type: ignore
                            current_timestep=i,  # type: ignore
                            raw_latent_shape=batch.
                            raw_latent_shape[2:5],  # type: ignore
                            patch_size=fastvideo_args.
                            pipeline_config.  # type: ignore
                            dit_config.patch_size,  # type: ignore
                            STA_param=batch.STA_param,  # type: ignore
                            VSA_sparsity=fastvideo_args.
                            VSA_sparsity,  # type: ignore
                            device=get_local_torch_device(),  # type: ignore
                        )  # type: ignore
                        assert attn_metadata is not None, "attn_metadata cannot be None"
                    else:
                        attn_metadata = None
                else:
                    attn_metadata = None

                batch.is_cfg_negative = False
                with set_forward_context(
                        current_timestep=i,
                        attn_metadata=attn_metadata,
                        forward_batch=batch,
                        # fastvideo_args=fastvideo_args
                ):
                    # Run transformer
                    # Dims 1 and 2 are swapped on the way in and swapped
                    # back on the way out.
                    pred_noise = self.transformer(
                        latent_model_input.permute(0, 2, 1, 3, 4),
                        prompt_embeds,
                        t_expand,
                        guidance=guidance_expand,
                        **image_kwargs,
                        **pos_cond_kwargs,
                    ).permute(0, 2, 1, 3, 4)

                # The helper is called on tensors with dims 0 and 1 merged;
                # the result is split back to the original leading shape.
                pred_video = pred_noise_to_pred_video(
                    pred_noise=pred_noise.flatten(0, 1),
                    noise_input_latent=noise_latents.flatten(0, 1),
                    timestep=t_expand,
                    scheduler=self.scheduler).unflatten(
                        0, pred_noise.shape[:2])

                if i < len(timesteps) - 1:
                    # Not the last step: re-noise the prediction to the
                    # next timestep.
                    next_timestep = timesteps[i + 1] * torch.ones(
                        [1], dtype=torch.long, device=pred_video.device)
                    # NOTE(review): torch.randn gets no `device` argument,
                    # so the noise is created with batch.generator[0] and
                    # then moved to self.device -- confirm the generator's
                    # device matches.
                    noise = torch.randn(video_raw_latent_shape,
                                        dtype=pred_video.dtype,
                                        generator=batch.generator[0]).to(
                                            self.device)
                    latents = self.scheduler.add_noise(
                        pred_video.flatten(0, 1), noise.flatten(0, 1),
                        next_timestep).unflatten(0, pred_video.shape[:2])
                else:
                    # Last step: the prediction itself is the result.
                    latents = pred_video

                # Update progress bar
                if i == len(timesteps) - 1 or (
                    (i + 1) > num_warmup_steps and
                    (i + 1) % self.scheduler.order == 0
                        and progress_bar is not None):
                    progress_bar.update()

    # Gather results if using sequence parallelism
    # NOTE(review): no gather is performed here; the code only swaps dims 1
    # and 2 of the final latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    # Update batch with final latents
    batch.latents = latents

    return batch

Functions¶

fastvideo.pipelines.stages.encoding ¶

Encoding stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.encoding.EncodingStage ¶

EncodingStage(vae: ParallelTiledVAE)

Bases: PipelineStage

Stage for encoding pixel space representations into latent space.

This stage handles the encoding of pixel-space video/images into latent representations for further processing in the diffusion pipeline.

Source code in fastvideo/pipelines/stages/encoding.py

def __init__(self, vae: ParallelTiledVAE) -> None:
    """
    Initialize the encoding stage.

    Args:
        vae: The VAE stored on the stage as ``self.vae``.
    """
    # NOTE(review): the parent constructor is not called here -- confirm
    # that PipelineStage needs no initialization.
    self.vae: ParallelTiledVAE = vae

Functions¶

fastvideo.pipelines.stages.encoding.EncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode pixel space representations into latent space.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded latents.

Source code in fastvideo/pipelines/stages/encoding.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode the pixel-space tensor in ``batch.latents`` with the VAE.

    The input is mapped with ``x * 2 - 1``, clamped to [-1, 1], encoded,
    and the mean of the returned distribution is written back to
    ``batch.latents``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with encoded latents.
    """
    assert batch.latents is not None and isinstance(batch.latents,
                                                    torch.Tensor)

    self.vae = self.vae.to(get_local_torch_device())

    # VAE precision; autocast is used unless the VAE runs in float32 or
    # autocast has been disabled globally.
    precision_name = fastvideo_args.pipeline_config.vae_precision
    vae_dtype = PRECISION_TO_TYPE[precision_name]
    use_autocast = not (vae_dtype == torch.float32
                        or fastvideo_args.disable_autocast)

    # Reverse of the decoding normalization, then move to the local device.
    pixels = batch.latents * 2.0 - 1.0
    pixels = pixels.clamp(-1, 1)
    pixels = pixels.to(get_local_torch_device())

    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=use_autocast):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        if not use_autocast:
            # Without autocast the input must be cast explicitly.
            pixels = pixels.to(vae_dtype)
        encoded = self.vae.encode(pixels).mean

    batch.latents = encoded

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    if fastvideo_args.vae_cpu_offload:
        self.vae.to("cpu")

    return batch

fastvideo.pipelines.stages.encoding.EncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage inputs.

Source code in fastvideo/pipelines/stages/encoding.py

@torch.no_grad()
def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify encoding stage inputs.

    The only check is that ``batch.latents`` is a 5-dimensional tensor.
    """
    outcome = VerificationResult()
    # Input video/images for VAE encoding: [batch_size, channels, frames, height, width]
    pixel_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, pixel_validators)
    return outcome

fastvideo.pipelines.stages.encoding.EncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage outputs.

Source code in fastvideo/pipelines/stages/encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify encoding stage outputs.

    The only check is that ``batch.latents`` is a 5-dimensional tensor.
    """
    outcome = VerificationResult()
    # Encoded latents: [batch_size, channels, frames, height_latents, width_latents]
    latent_validators = [V.is_tensor, V.with_dims(5)]
    outcome.add_check("latents", batch.latents, latent_validators)
    return outcome

Functions¶

fastvideo.pipelines.stages.image_encoding ¶

Image and video encoding stages for diffusion pipelines.

This module contains implementations of encoding stages for diffusion pipelines:

- ImageEncodingStage: Encodes images using image encoders (e.g., CLIP).
- RefImageEncodingStage: Encodes the reference image for the Wan2.1 control pipeline.
- ImageVAEEncodingStage: Encodes images to latent space using a VAE for I2V generation.
- VideoVAEEncodingStage: Encodes videos to latent space using a VAE for V2V and control tasks.

Classes¶

fastvideo.pipelines.stages.image_encoding.Hy15ImageEncodingStage ¶

Hy15ImageEncodingStage(image_encoder, image_processor)

Bases: ImageEncodingStage

Stage for encoding image prompts into embeddings for HunyuanVideo1.5 models.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, image_encoder, image_processor) -> None:
    """
    Initialize the image encoding stage.

    Args:
        image_encoder: Stored on the stage as ``self.image_encoder``.
        image_processor: Stored on the stage as ``self.image_processor``.
    """
    super().__init__()
    self.image_processor = image_processor
    self.image_encoder = image_encoder

Functions¶

fastvideo.pipelines.stages.image_encoding.Hy15ImageEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the prompt into image encoder hidden states.

Source code in fastvideo/pipelines/stages/image_encoding.py

def forward(self, batch: ForwardBatch,
            fastvideo_args: FastVideoArgs) -> ForwardBatch:
    """
    Fill in placeholder image conditioning on the batch.

    When ``batch.pil_image`` is None, ``batch.image_embeds`` is replaced by
    a single all-zero tensor of shape (1, 729, 1152). In every case
    ``batch.video_latent`` is set to zeros shaped like
    ``batch.raw_latent_shape`` with dim 1 forced to 1.
    """
    no_image_given = batch.pil_image is None
    if no_image_given:
        zero_embeds = torch.zeros(1,
                                  729,
                                  1152,
                                  device=get_local_torch_device())
        batch.image_embeds = [zero_embeds]

    # Same shape as the raw latents, except that dim 1 is collapsed to 1.
    latent_shape = [*batch.raw_latent_shape]
    latent_shape[1] = 1
    batch.video_latent = torch.zeros(tuple(latent_shape),
                                     device=get_local_torch_device())
    return batch

fastvideo.pipelines.stages.image_encoding.Hy15ImageEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify image encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify image encoding stage inputs.

    No checks are registered: an empty ``VerificationResult`` is returned
    regardless of the batch contents.
    """
    return VerificationResult()

fastvideo.pipelines.stages.image_encoding.ImageEncodingStage ¶

ImageEncodingStage(image_encoder, image_processor)

Bases: PipelineStage

Stage for encoding image prompts into embeddings for diffusion models.

This stage handles the encoding of image prompts into the embedding space expected by the diffusion model.

Initialize the prompt encoding stage.

Parameters:

Name	Type	Description	Default
`image_encoder`		The image encoder stored on the stage.	required
`image_processor`		The image processor stored on the stage.	required

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, image_encoder, image_processor) -> None:
    """
    Initialize the image encoding stage.

    Args:
        image_encoder: Stored on the stage as ``self.image_encoder``.
        image_processor: Stored on the stage as ``self.image_processor``.
    """
    super().__init__()
    self.image_processor = image_processor
    self.image_encoder = image_encoder

Functions¶

fastvideo.pipelines.stages.image_encoding.ImageEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the prompt into image encoder hidden states.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded prompt embeddings.

Source code in fastvideo/pipelines/stages/image_encoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode ``batch.pil_image`` into image encoder hidden states.

    The image is run through the image processor and the image encoder,
    and the encoder's ``last_hidden_state`` is appended to
    ``batch.image_embeds``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with the image embeddings appended.
    """
    self.image_encoder = self.image_encoder.to(get_local_torch_device())

    processed = self.image_processor(images=batch.pil_image,
                                     return_tensors="pt")
    processed = processed.to(get_local_torch_device())

    with set_forward_context(current_timestep=0, attn_metadata=None):
        hidden_states = self.image_encoder(**processed).last_hidden_state

    batch.image_embeds.append(hidden_states)

    # Optionally move the encoder back to the CPU once it has been used.
    if fastvideo_args.image_encoder_cpu_offload:
        self.image_encoder.to('cpu')

    return batch

fastvideo.pipelines.stages.image_encoding.ImageEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify image encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify image encoding stage inputs.

    Requires ``batch.pil_image`` to be set and ``batch.image_embeds`` to be
    a list.
    """
    outcome = VerificationResult()
    field_checks = (
        ("pil_image", batch.pil_image, V.not_none),
        ("image_embeds", batch.image_embeds, V.is_list),
    )
    for field_name, value, validator in field_checks:
        outcome.add_check(field_name, value, validator)
    return outcome

fastvideo.pipelines.stages.image_encoding.ImageEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify image encoding stage outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Verify image encoding stage outputs.

    Checks ``batch.image_embeds`` with ``V.list_of_tensors_dims(3)``.
    """
    outcome = VerificationResult()
    embeds_validator = V.list_of_tensors_dims(3)
    outcome.add_check("image_embeds", batch.image_embeds, embeds_validator)
    return outcome

fastvideo.pipelines.stages.image_encoding.ImageVAEEncodingStage ¶

ImageVAEEncodingStage(vae: ParallelTiledVAE)

Bases: PipelineStage

Stage for encoding image pixel representations into latent space.

This stage handles the encoding of image pixel representations into the final input format (e.g., latents) for image-to-video generation.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, vae: ParallelTiledVAE) -> None:
    """
    Initialize the image VAE encoding stage.

    Args:
        vae: The VAE stored on the stage as ``self.vae``.
    """
    # NOTE(review): the parent constructor is not called here -- confirm
    # that PipelineStage needs no initialization.
    self.vae: ParallelTiledVAE = vae

Functions¶

fastvideo.pipelines.stages.image_encoding.ImageVAEEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode pixel representations into latent space.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode pixel representations into latent space.

    The conditioning image is preprocessed, used as the first frame of an
    otherwise zero-filled clip of ``num_frames`` frames, and encoded by the
    VAE. The resulting latent is shifted (when the VAE defines a
    ``shift_factor``) and multiplied by ``scaling_factor`` before being
    stored in ``batch.image_latent``. In inference mode a first-frame mask
    is concatenated in front of the latent along dim 1.

    Args:
        batch: The current batch information. ``pil_image``, ``height``,
            ``width`` and ``num_frames`` must be set; inference mode also
            requires ``generator``.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with encoded outputs.

    Raises:
        ValueError: If ``fastvideo_args.mode`` is neither INFERENCE nor
            PREPROCESS, or if no generator is set in inference mode.
    """
    assert batch.pil_image is not None
    mode = fastvideo_args.mode
    if mode == ExecutionMode.INFERENCE:
        # Inference operates on a single PIL image with scalar dimensions.
        assert isinstance(batch.pil_image, PIL.Image.Image)
        assert batch.height is not None and isinstance(batch.height, int)
        assert batch.width is not None and isinstance(batch.width, int)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, int)
        height = batch.height
        width = batch.width
        num_frames = batch.num_frames
    elif mode == ExecutionMode.PREPROCESS:
        # Preprocessing receives a tensor image and per-sample lists; only
        # the first entry of each list is used.
        assert isinstance(batch.pil_image, torch.Tensor)
        assert batch.height is not None and isinstance(batch.height, list)
        assert batch.width is not None and isinstance(batch.width, list)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, list)
        num_frames = batch.num_frames[0]
        height = batch.height[0]
        width = batch.width[0]
    else:
        # Fail early with a clear message instead of hitting an unbound
        # local further down, after the VAE has already been moved.
        raise ValueError(
            f"ImageVAEEncodingStage does not support execution mode {mode}")

    self.vae = self.vae.to(get_local_torch_device())

    # Process single image for I2V
    latent_height = height // self.vae.spatial_compression_ratio
    latent_width = width // self.vae.spatial_compression_ratio
    image = batch.pil_image
    image = self.preprocess(
        image,
        vae_scale_factor=self.vae.spatial_compression_ratio,
        height=height,
        width=width).to(get_local_torch_device(), dtype=torch.float32)

    # (B, C, H, W) -> (B, C, 1, H, W)
    image = image.unsqueeze(2)

    # First frame is the image; the remaining num_frames - 1 frames are zeros.
    video_condition = torch.cat([
        image,
        image.new_zeros(image.shape[0], image.shape[1], num_frames - 1,
                        image.shape[3], image.shape[4])
    ],
                                dim=2)
    video_condition = video_condition.to(device=get_local_torch_device(),
                                         dtype=torch.float32)

    # Setup VAE precision
    vae_dtype = PRECISION_TO_TYPE[
        fastvideo_args.pipeline_config.vae_precision]
    vae_autocast_enabled = (
        vae_dtype != torch.float32) and not fastvideo_args.disable_autocast

    # Encode Image
    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=vae_autocast_enabled):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        # if fastvideo_args.vae_sp:
        #     self.vae.enable_parallel()
        if not vae_autocast_enabled:
            # Without autocast the input is cast to the VAE dtype explicitly.
            video_condition = video_condition.to(vae_dtype)
        encoder_output = self.vae.encode(video_condition)

    if mode == ExecutionMode.PREPROCESS:
        # Preprocessing takes the mean of the encoder output and needs no
        # generator.
        latent_condition = encoder_output.mean
    else:
        generator = batch.generator
        if generator is None:
            raise ValueError("Generator must be provided")
        latent_condition = self.retrieve_latents(encoder_output, generator)

    # Apply shifting if needed
    if (hasattr(self.vae, "shift_factor")
            and self.vae.shift_factor is not None):
        if isinstance(self.vae.shift_factor, torch.Tensor):
            latent_condition -= self.vae.shift_factor.to(
                latent_condition.device, latent_condition.dtype)
        else:
            latent_condition -= self.vae.shift_factor

    if isinstance(self.vae.scaling_factor, torch.Tensor):
        latent_condition = latent_condition * self.vae.scaling_factor.to(
            latent_condition.device, latent_condition.dtype)
    else:
        latent_condition = latent_condition * self.vae.scaling_factor

    if mode == ExecutionMode.PREPROCESS:
        batch.image_latent = latent_condition
    else:
        # Build a mask that is 1 for the first frame and 0 for all others.
        mask_lat_size = torch.ones(1, 1, num_frames, latent_height,
                                   latent_width)
        mask_lat_size[:, :, 1:] = 0
        # Repeat the first-frame mask temporal_compression_ratio times,
        # then regroup the frame axis into chunks of that size.
        first_frame_mask = mask_lat_size[:, :, 0:1]
        first_frame_mask = torch.repeat_interleave(
            first_frame_mask,
            dim=2,
            repeats=self.vae.temporal_compression_ratio)
        mask_lat_size = torch.concat(
            [first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
        mask_lat_size = mask_lat_size.view(
            1, -1, self.vae.temporal_compression_ratio, latent_height,
            latent_width)
        mask_lat_size = mask_lat_size.transpose(1, 2)
        mask_lat_size = mask_lat_size.to(latent_condition.device)

        batch.image_latent = torch.concat([mask_lat_size, latent_condition],
                                          dim=1)

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    self.vae.to("cpu")

    return batch

fastvideo.pipelines.stages.image_encoding.ImageVAEEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the generator and size fields read by the VAE encoding stage.

    In PREPROCESS mode ``height``, ``width`` and ``num_frames`` are checked
    with ``V.list_not_empty``; in every other mode with ``V.positive_int``.
    """
    preprocessing = fastvideo_args.mode == ExecutionMode.PREPROCESS
    size_validator = V.list_not_empty if preprocessing else V.positive_int

    checks = VerificationResult()
    checks.add_check("generator", batch.generator,
                     V.generator_or_list_generators)
    for field in ("height", "width", "num_frames"):
        checks.add_check(field, getattr(batch, field), size_validator)
    return checks

fastvideo.pipelines.stages.image_encoding.ImageVAEEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify encoding stage outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check that ``image_latent`` is a tensor with five dimensions."""
    report = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    report.add_check("image_latent", batch.image_latent, latent_validators)
    return report

fastvideo.pipelines.stages.image_encoding.RefImageEncodingStage ¶

RefImageEncodingStage(image_encoder, image_processor)

Bases: ImageEncodingStage

Stage for encoding reference image prompts into embeddings for Wan2.1 Control models.

This stage extends ImageEncodingStage with specialized preprocessing for reference images.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, image_encoder, image_processor) -> None:
    """
    Initialize the reference image encoding stage.

    Args:
        image_encoder: The encoder model applied to the processed image.
        image_processor: The processor that turns the image into encoder
            inputs.
    """
    super().__init__()
    self.image_processor, self.image_encoder = image_processor, image_encoder

Functions¶

fastvideo.pipelines.stages.image_encoding.RefImageEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the reference image into image encoder hidden states.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with the encoded image embeddings appended to `image_embeds`.

Source code in fastvideo/pipelines/stages/image_encoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode the reference image into image encoder hidden states.

    When ``batch.pil_image`` is missing, a default image is encoded instead
    and every entry of ``batch.image_embeds`` is then replaced by zeros of
    the same shape.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with the new embedding appended to ``image_embeds``.
    """
    device = get_local_torch_device()
    self.image_encoder = self.image_encoder.to(device)

    has_reference = batch.pil_image is not None
    reference = batch.pil_image if has_reference else create_default_image()

    # Preprocess reference image for CLIP encoder
    clip_input = preprocess_reference_image_for_clip(reference, device)
    processed = self.image_processor(images=clip_input, return_tensors="pt")
    encoder_kwargs = processed.to(device)

    with set_forward_context(current_timestep=0, attn_metadata=None):
        hidden_states = self.image_encoder(**encoder_kwargs).last_hidden_state
    batch.image_embeds.append(hidden_states)

    if not has_reference:
        # No real reference image was given: blank out all embeddings.
        batch.image_embeds = [
            torch.zeros_like(embed) for embed in batch.image_embeds
        ]

    return batch

fastvideo.pipelines.stages.image_encoding.VideoVAEEncodingStage ¶

VideoVAEEncodingStage(vae: ParallelTiledVAE)

Bases: ImageVAEEncodingStage

Stage for encoding video pixel representations into latent space.

This stage handles the encoding of video pixel representations for video-to-video generation and control. Inherits from ImageVAEEncodingStage to reuse common functionality.

Source code in fastvideo/pipelines/stages/image_encoding.py

def __init__(self, vae: ParallelTiledVAE) -> None:
    """
    Initialize the video VAE encoding stage.

    Args:
        vae: The VAE used to encode the control video into latents.
    """
    # Delegate to the parent stage, which stores the VAE on ``self.vae``,
    # instead of duplicating its initializer.
    super().__init__(vae)

Functions¶

fastvideo.pipelines.stages.image_encoding.VideoVAEEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode video pixel representations into latent space.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Encode video pixel representations into latent space.

    The control video held in ``batch.video_latent`` is turned into a tensor
    by ``_prepare_control_video_tensor``, encoded by the VAE and sampled via
    ``retrieve_latents``. The latent is shifted (when the VAE defines a
    ``shift_factor``) and multiplied by ``scaling_factor``; the result
    overwrites ``batch.video_latent``.

    Args:
        batch: The current batch information. ``video_latent``, ``height``,
            ``width``, ``num_frames`` and ``generator`` must be set.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with encoded outputs.

    Raises:
        ValueError: If ``fastvideo_args.mode`` is neither INFERENCE nor
            PREPROCESS, or if ``batch.generator`` is None.
    """
    assert batch.video_latent is not None, "Video latent input is required for VideoVAEEncodingStage"

    mode = fastvideo_args.mode
    if mode == ExecutionMode.INFERENCE:
        assert batch.height is not None and isinstance(batch.height, int)
        assert batch.width is not None and isinstance(batch.width, int)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, int)
        height = batch.height
        width = batch.width
        num_frames = batch.num_frames
    elif mode == ExecutionMode.PREPROCESS:
        # Preprocessing supplies per-sample lists; only the first entry of
        # each list is used.
        assert batch.height is not None and isinstance(batch.height, list)
        assert batch.width is not None and isinstance(batch.width, list)
        assert batch.num_frames is not None and isinstance(
            batch.num_frames, list)
        num_frames = batch.num_frames[0]
        height = batch.height[0]
        width = batch.width[0]
    else:
        # Fail early with a clear message instead of hitting an unbound
        # local further down, after the VAE has already been moved.
        raise ValueError(
            f"VideoVAEEncodingStage does not support execution mode {mode}")

    self.vae = self.vae.to(get_local_torch_device())

    # Prepare video tensor from control video
    video_condition = self._prepare_control_video_tensor(
        batch.video_latent, num_frames, height,
        width).to(get_local_torch_device(), dtype=torch.float32)

    # Setup VAE precision
    vae_dtype = PRECISION_TO_TYPE[
        fastvideo_args.pipeline_config.vae_precision]
    vae_autocast_enabled = (
        vae_dtype != torch.float32) and not fastvideo_args.disable_autocast

    # Encode control video
    with torch.autocast(device_type="cuda",
                        dtype=vae_dtype,
                        enabled=vae_autocast_enabled):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        if not vae_autocast_enabled:
            # Without autocast the input is cast to the VAE dtype explicitly.
            video_condition = video_condition.to(vae_dtype)
        encoder_output = self.vae.encode(video_condition)

    generator = batch.generator
    if generator is None:
        raise ValueError("Generator must be provided")
    latent_condition = self.retrieve_latents(encoder_output, generator)

    # Apply shifting if the VAE defines a shift factor
    if (hasattr(self.vae, "shift_factor")
            and self.vae.shift_factor is not None):
        if isinstance(self.vae.shift_factor, torch.Tensor):
            latent_condition -= self.vae.shift_factor.to(
                latent_condition.device, latent_condition.dtype)
        else:
            latent_condition -= self.vae.shift_factor

    if isinstance(self.vae.scaling_factor, torch.Tensor):
        latent_condition = latent_condition * self.vae.scaling_factor.to(
            latent_condition.device, latent_condition.dtype)
    else:
        latent_condition = latent_condition * self.vae.scaling_factor

    # The encoded latent replaces the raw control video on the batch.
    batch.video_latent = latent_condition

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    self.vae.to("cpu")

    return batch

fastvideo.pipelines.stages.image_encoding.VideoVAEEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify video encoding stage inputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the control video, generator and size fields before encoding.

    In PREPROCESS mode ``height``, ``width`` and ``num_frames`` are checked
    with ``V.list_not_empty``; in every other mode with ``V.positive_int``.
    """
    checks = VerificationResult()
    checks.add_check("video_latent", batch.video_latent, V.not_none)
    checks.add_check("generator", batch.generator,
                     V.generator_or_list_generators)

    if fastvideo_args.mode == ExecutionMode.PREPROCESS:
        size_validator = V.list_not_empty
    else:
        size_validator = V.positive_int
    for field in ("height", "width", "num_frames"):
        checks.add_check(field, getattr(batch, field), size_validator)
    return checks

fastvideo.pipelines.stages.image_encoding.VideoVAEEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify video encoding stage outputs.

Source code in fastvideo/pipelines/stages/image_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check that ``video_latent`` is a tensor with five dimensions."""
    report = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    report.add_check("video_latent", batch.video_latent, latent_validators)
    return report

Functions¶

fastvideo.pipelines.stages.input_validation ¶

Input validation stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.input_validation.InputValidationStage ¶

Bases: PipelineStage

Stage for validating and preparing inputs for diffusion pipelines.

This stage validates that all required inputs are present and properly formatted before proceeding with the diffusion process.

Functions¶

fastvideo.pipelines.stages.input_validation.InputValidationStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Validate and prepare inputs.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The validated batch information.

Source code in fastvideo/pipelines/stages/input_validation.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Validate and prepare inputs.

    Generates seeds, validates the prompt, size, step and guidance settings,
    then loads and prepares any image (``batch.image_path``) or control
    video (``batch.video_path``) input. Finally validates the optional
    action-control inputs ``mouse_cond``, ``keyboard_cond`` and
    ``grid_sizes``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The validated batch information.

    Raises:
        ValueError: If a required input is missing or any validated value is
            out of range or has an unexpected shape.
    """

    self._generate_seeds(batch, fastvideo_args)

    # Ensure prompt is properly formatted
    if batch.prompt is None and batch.prompt_embeds is None:
        raise ValueError(
            "Either `prompt` or `prompt_embeds` must be provided")

    # Ensure negative prompt is properly formatted if using classifier-free guidance
    if (batch.do_classifier_free_guidance and batch.negative_prompt is None
            and batch.negative_prompt_embeds is None):
        raise ValueError(
            "For classifier-free guidance, either `negative_prompt` or "
            "`negative_prompt_embeds` must be provided")

    # Validate height and width
    if batch.height is None or batch.width is None:
        raise ValueError(
            "Height and width must be provided. Please set `height` and `width`."
        )
    if batch.height % 8 != 0 or batch.width % 8 != 0:
        raise ValueError(
            f"Height and width must be divisible by 8 but are {batch.height} and {batch.width}."
        )

    # Validate number of inference steps
    if batch.num_inference_steps <= 0:
        raise ValueError(
            f"Number of inference steps must be positive, but got {batch.num_inference_steps}"
        )

    # Validate guidance scale if using classifier-free guidance
    if batch.do_classifier_free_guidance and batch.guidance_scale <= 0:
        raise ValueError(
            f"Guidance scale must be positive, but got {batch.guidance_scale}"
        )

    # for i2v, get image from image_path
    # @TODO(Wei) hard-coded for wan2.2 5b ti2v for now. Should put this in image_encoding stage
    if batch.image_path is not None:
        if batch.image_path.endswith(".mp4"):
            # For a video file, the first loaded frame is the conditioning image.
            image = load_video(batch.image_path)[0]
        else:
            image = load_image(batch.image_path)
        batch.pil_image = image

    # further processing for ti2v task
    if (fastvideo_args.pipeline_config.ti2v_task
            or fastvideo_args.pipeline_config.is_causal
        ) and batch.pil_image is not None:
        img = batch.pil_image
        ih, iw = img.height, img.width

        pipeline_class_name = type(fastvideo_args.pipeline_config).__name__
        if 'MatrixGame' in pipeline_class_name or 'MatrixCausal' in pipeline_class_name:
            # Matrix-Game configs resize directly to the requested size.
            oh, ow = batch.height, batch.width
            img = img.resize((ow, oh), Image.LANCZOS)
        else:
            # Standard Wan logic
            patch_size = fastvideo_args.pipeline_config.dit_config.arch_config.patch_size
            vae_stride = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_spatial
            dh, dw = patch_size[1] * vae_stride, patch_size[2] * vae_stride
            max_area = 480 * 832
            ow, oh = best_output_size(iw, ih, dw, dh, max_area)

            # Scale so the image covers the target size, then crop the excess.
            scale = max(ow / iw, oh / ih)
            img = img.resize((round(iw * scale), round(ih * scale)),
                             Image.LANCZOS)

            # center-crop
            x1 = (img.width - ow) // 2
            y1 = (img.height - oh) // 2
            img = img.crop((x1, y1, x1 + ow, y1 + oh))

        assert img.width == ow and img.height == oh
        logger.info("final processed img height: %s, img width: %s",
                    img.height, img.width)

        # to tensor: apply (x - 0.5) / 0.5, then add two singleton axes
        img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(
            self.device).unsqueeze(1)
        img = img.unsqueeze(0)
        # The batch now carries the processed size and the tensor image.
        batch.height = oh
        batch.width = ow
        batch.pil_image = img

    # for v2v, get control video from video path
    if batch.video_path is not None:
        pil_images, original_fps = load_video(batch.video_path,
                                              return_fps=True)
        logger.info("Loaded video with %s frames, original FPS: %s",
                    len(pil_images), original_fps)

        # Get target parameters from batch
        target_fps = batch.fps
        target_num_frames = batch.num_frames
        target_height = batch.height
        target_width = batch.width

        # Downsample by dropping frames when the source FPS is at least
        # twice the target FPS.
        if target_fps is not None and original_fps is not None:
            frame_skip = max(1, int(original_fps // target_fps))
            if frame_skip > 1:
                pil_images = pil_images[::frame_skip]
                effective_fps = original_fps / frame_skip
                logger.info(
                    "Resampled video from %.1f fps to %.1f fps (skip=%s)",
                    original_fps, effective_fps, frame_skip)

        # Limit to target number of frames
        if target_num_frames is not None and len(
                pil_images) > target_num_frames:
            # Record the count before truncating so the log reports the
            # real total rather than the truncated length.
            available_frames = len(pil_images)
            pil_images = pil_images[:target_num_frames]
            logger.info("Limited video to %s frames (from %s total)",
                        target_num_frames, available_frames)

        # Resize each PIL image to target dimensions
        resized_images = [
            resize(pil_img,
                   target_height,
                   target_width,
                   resize_mode="default",
                   resample="lanczos") for pil_img in pil_images
        ]

        # Convert PIL images to numpy array
        video_numpy = pil_to_numpy(resized_images)
        video_numpy = normalize(video_numpy)
        video_tensor = numpy_to_pt(video_numpy)

        # Rearrange to [C, T, H, W] and add batch dimension -> [B, C, T, H, W]
        input_video = video_tensor.permute(1, 0, 2, 3).unsqueeze(0)

        batch.video_latent = input_video

    # Validate action control inputs (Matrix-Game)
    if batch.mouse_cond is not None:
        if batch.mouse_cond.dim() != 3 or batch.mouse_cond.shape[-1] != 2:
            raise ValueError(
                f"mouse_cond must have shape (B, T, 2), but got {batch.mouse_cond.shape}"
            )
        logger.info("Action control: mouse_cond validated - shape %s",
                    batch.mouse_cond.shape)

    if batch.keyboard_cond is not None:
        if batch.keyboard_cond.dim() != 3:
            raise ValueError(
                f"keyboard_cond must have 3 dimensions (B, T, K), but got {batch.keyboard_cond.dim()}"
            )
        keyboard_dim = batch.keyboard_cond.shape[-1]
        if keyboard_dim not in {2, 4, 6, 7}:
            raise ValueError(
                f"keyboard_cond last dimension must be 2, 4, 6, or 7, but got {keyboard_dim}"
            )
        logger.info(
            "Action control: keyboard_cond validated - shape %s (dim=%d)",
            batch.keyboard_cond.shape, keyboard_dim)

    if batch.grid_sizes is not None:
        if not isinstance(batch.grid_sizes, list | tuple | torch.Tensor):
            raise ValueError("grid_sizes must be a list, tuple, or tensor")
        # Tensors are measured by element count, sequences by length.
        if isinstance(batch.grid_sizes, torch.Tensor):
            grid_entries = batch.grid_sizes.numel()
        else:
            grid_entries = len(batch.grid_sizes)
        if grid_entries != 3:
            raise ValueError("grid_sizes must have 3 elements [F, H, W]")
        logger.info("Action control: grid_sizes validated - %s",
                    batch.grid_sizes)

    return batch

fastvideo.pipelines.stages.input_validation.InputValidationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify input validation stage inputs.

Source code in fastvideo/pipelines/stages/input_validation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the raw request fields before input validation runs.

    The guidance scale is only required to be a positive float when
    classifier-free guidance is enabled on the batch.
    """
    checks = VerificationResult()
    for field, value, validator in (
        ("seed", batch.seed, [V.not_none, V.positive_int]),
        ("num_videos_per_prompt", batch.num_videos_per_prompt,
         V.positive_int),
        # Passes when either a textual prompt or prompt embeddings exist.
        ("prompt_or_embeds", None, lambda _: V.string_or_list_strings(
            batch.prompt) or V.list_not_empty(batch.prompt_embeds)),
        ("height", batch.height, V.positive_int),
        ("width", batch.width, V.positive_int),
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("guidance_scale", batch.guidance_scale, lambda x: not batch.
         do_classifier_free_guidance or V.positive_float(x)),
    ):
        checks.add_check(field, value, validator)
    return checks

fastvideo.pipelines.stages.input_validation.InputValidationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify input validation stage outputs.

Source code in fastvideo/pipelines/stages/input_validation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check that seeds and a generator are present after validation."""
    report = VerificationResult()
    for field, value, validator in (
        ("seeds", batch.seeds, V.list_not_empty),
        ("generator", batch.generator, V.generator_or_list_generators),
    ):
        report.add_check(field, value, validator)
    return report

Functions¶

fastvideo.pipelines.stages.latent_preparation ¶

Latent preparation stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.latent_preparation.CosmosLatentPreparationStage ¶

CosmosLatentPreparationStage(scheduler, transformer, vae=None)

Bases: PipelineStage

Cosmos-specific latent preparation stage that properly handles the tensor shapes and conditioning masks required by the Cosmos transformer.

This stage replicates the logic from diffusers' Cosmos2VideoToWorldPipeline.prepare_latents()

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self, scheduler, transformer, vae=None) -> None:
    """Store the scheduler, transformer and optional VAE on the stage."""
    super().__init__()
    self.scheduler, self.transformer, self.vae = scheduler, transformer, vae

Functions¶

fastvideo.pipelines.stages.latent_preparation.CosmosLatentPreparationStage.adjust_video_length ¶

adjust_video_length(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> int

Adjust video length based on VAE version.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`int`	The number of latent frames computed from `batch.num_frames`.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def adjust_video_length(self, batch: ForwardBatch,
                        fastvideo_args: FastVideoArgs) -> int:
    """
    Compute the number of latent frames for the requested video length.

    Args:
        batch: The current batch information; ``num_frames`` is read.
        fastvideo_args: The inference arguments; the VAE config decides
            which formula applies.

    Returns:
        The latent frame count derived from ``batch.num_frames``.
    """
    vae_config = fastvideo_args.pipeline_config.vae_config
    num_frames = batch.num_frames

    if not vae_config.use_temporal_scaling_frames:
        # stepvideo only
        return int(num_frames // 17 * 3)

    ratio = vae_config.arch_config.temporal_compression_ratio
    return int((num_frames - 1) // ratio + 1)

fastvideo.pipelines.stages.latent_preparation.CosmosLatentPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify Cosmos latent preparation stage inputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the prompt, generator, size and latent fields the stage reads."""
    checks = VerificationResult()
    for field, value, validator in (
        # Passes when either a textual prompt or prompt embeddings exist.
        ("prompt_or_embeds", None, lambda _: V.string_or_list_strings(
            batch.prompt) or V.list_not_empty(batch.prompt_embeds)),
        ("prompt_embeds", batch.prompt_embeds, V.list_of_tensors),
        ("num_videos_per_prompt", batch.num_videos_per_prompt,
         V.positive_int),
        ("generator", batch.generator, V.generator_or_list_generators),
        ("num_frames", batch.num_frames, V.positive_int),
        ("height", batch.height, V.positive_int),
        ("width", batch.width, V.positive_int),
        ("latents", batch.latents, V.none_or_tensor),
    ):
        checks.add_check(field, value, validator)
    return checks

fastvideo.pipelines.stages.latent_preparation.CosmosLatentPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage outputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check the prepared latents and the recorded raw latent shape."""
    report = VerificationResult()
    for field, value, validator in (
        ("latents", batch.latents, [V.is_tensor, V.with_dims(5)]),
        ("raw_latent_shape", batch.raw_latent_shape, V.is_tuple),
    ):
        report.add_check(field, value, validator)
    return report

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage ¶

LatentPreparationStage(scheduler, transformer, use_btchw_layout: bool = False)

Bases: PipelineStage

Stage for preparing initial latent variables for the diffusion process.

This stage handles the preparation of the initial latent variables that will be denoised during the diffusion process.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def __init__(self,
             scheduler,
             transformer,
             use_btchw_layout: bool = False) -> None:
    """Store the scheduler, transformer and layout flag on the stage."""
    super().__init__()
    self.scheduler, self.transformer = scheduler, transformer
    self.use_btchw_layout = use_btchw_layout

Functions¶

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.adjust_video_length ¶

adjust_video_length(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> int

Adjust video length based on VAE version.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`int`	The number of latent frames computed from `batch.num_frames`.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def adjust_video_length(self, batch: ForwardBatch,
                        fastvideo_args: FastVideoArgs) -> int:
    """
    Translate the requested frame count into a latent frame count.

    Args:
        batch: The current batch information; ``num_frames`` is read.
        fastvideo_args: The inference arguments; the VAE config decides
            which formula applies.

    Returns:
        The latent frame count derived from ``batch.num_frames``.
    """
    frame_count = batch.num_frames
    vae_cfg = fastvideo_args.pipeline_config.vae_config

    if vae_cfg.use_temporal_scaling_frames:
        compression = vae_cfg.arch_config.temporal_compression_ratio
        latent_frames = (frame_count - 1) // compression + 1
    else:
        # stepvideo only
        latent_frames = frame_count // 17 * 3
    return int(latent_frames)

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Prepare initial latent variables for the diffusion process.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with prepared latent variables.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Build (or adopt) the initial latents and store them on the batch.

    The effective batch size comes from the prompt or its embeddings, or --
    when no text embeddings exist -- from the keyboard, mouse or image
    conditioning, and is then multiplied by ``num_videos_per_prompt``.
    Fresh noise is drawn when ``batch.latents`` is unset; otherwise the
    provided latents are moved to the local device.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The same batch with ``latents`` and ``raw_latent_shape`` set. When no
        prompt embeddings were present, a zero-length placeholder embedding
        is installed and classifier-free guidance is switched off.

    Raises:
        ValueError: If height or width is missing, or if a list of
            generators does not match the effective batch size.
    """
    # Latent frame count derived from the VAE settings, when the hook exists.
    latent_num_frames = None
    if hasattr(self, 'adjust_video_length'):
        latent_num_frames = self.adjust_video_length(batch, fastvideo_args)

    has_text_embeds = bool(batch.prompt_embeds)

    # Effective batch size: text inputs win; otherwise fall back to the
    # action/image conditioning (no text encoder present).
    if has_text_embeds:
        if isinstance(batch.prompt, list):
            batch_size = len(batch.prompt)
        elif batch.prompt is not None:
            batch_size = 1
        else:
            batch_size = batch.prompt_embeds[0].shape[0]
    elif batch.keyboard_cond is not None:
        batch_size = batch.keyboard_cond.shape[0]
    elif batch.mouse_cond is not None:
        batch_size = batch.mouse_cond.shape[0]
    elif batch.image_embeds:
        batch_size = batch.image_embeds[0].shape[0]
    else:
        batch_size = 1

    # One latent per requested video.
    batch_size *= batch.num_videos_per_prompt

    device = get_local_torch_device()
    if not has_text_embeds:
        # Create a dummy zero-length text embedding to satisfy downstream checks.
        # Matrix-Game models have text_dim=0 and ignore encoder_hidden_states.
        placeholder = torch.zeros(
            batch_size,
            0,
            self.transformer.hidden_size,
            device=device,
            dtype=next(self.transformer.parameters()).dtype)
        batch.prompt_embeds = [placeholder]
        batch.negative_prompt_embeds = []
        batch.do_classifier_free_guidance = False

    dtype = batch.prompt_embeds[0].dtype
    generator = batch.generator
    latents = batch.latents
    if latent_num_frames is not None:
        num_frames = latent_num_frames
    else:
        num_frames = batch.num_frames
    height = batch.height
    width = batch.width

    # TODO(will): remove this once we add input/output validation for stages
    if height is None or width is None:
        raise ValueError("Height and width must be provided")

    # Latent geometry: spatial dims shrink by the VAE compression ratio.
    spatial_ratio = (fastvideo_args.pipeline_config.vae_config.arch_config.
                     spatial_compression_ratio)
    channels = self.transformer.num_channels_latents
    latent_height = height // spatial_ratio
    latent_width = width // spatial_ratio

    # raw_latent_shape is always recorded channels-first (B, C, T, H, W);
    # the sampled tensor swaps C and T when the stage uses the BTCHW layout.
    bcthw_shape: tuple[int, ...] = (batch_size, channels, num_frames,
                                    latent_height, latent_width)
    if self.use_btchw_layout:
        shape = (batch_size, num_frames, channels, latent_height,
                 latent_width)
    else:
        shape = bcthw_shape

    # A list of generators must supply exactly one generator per sample.
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is None:
        # No latents supplied: sample fresh noise.
        latents = randn_tensor(
            shape,
            generator=generator,
            device=device,
            dtype=dtype,
        )
        scale_by_sigma = True
    else:
        # Pre-initialized latents:
        # - For LongCat refine (refine_from or stage1_video present), we should not re-scale by init_noise_sigma.
        # - For other models, keep the original behavior.
        latents = latents.to(device)
        scale_by_sigma = (batch.refine_from is None
                          and batch.stage1_video is None)

    if scale_by_sigma and hasattr(self.scheduler, "init_noise_sigma"):
        latents = latents * self.scheduler.init_noise_sigma

    # Publish the results on the batch.
    batch.latents = latents
    batch.raw_latent_shape = bcthw_shape

    return batch

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage inputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Collect the input checks for the latent preparation stage.

    Args:
        batch: The current batch information whose fields are checked.
        fastvideo_args: The inference arguments (not used by the checks).

    Returns:
        A ``VerificationResult`` holding one entry per registered check.
    """

    def _prompt_or_embeds_ok(_unused):
        # Passes for a valid prompt, for absent embeddings (no text
        # encoder), or for a non-empty embedding list.
        return (V.string_or_list_strings(batch.prompt)
                or not batch.prompt_embeds
                or V.list_not_empty(batch.prompt_embeds))

    result = VerificationResult()
    result.add_check("prompt_or_embeds", None, _prompt_or_embeds_ok)

    # Embeddings are only type-checked when they are actually present.
    if batch.prompt_embeds:
        result.add_check("prompt_embeds", batch.prompt_embeds,
                         V.list_of_tensors)

    remaining_checks = (
        ("num_videos_per_prompt", batch.num_videos_per_prompt,
         V.positive_int),
        ("generator", batch.generator, V.generator_or_list_generators),
        ("num_frames", batch.num_frames, V.positive_int),
        ("height", batch.height, V.positive_int),
        ("width", batch.width, V.positive_int),
        ("latents", batch.latents, V.none_or_tensor),
    )
    for field_name, value, validator in remaining_checks:
        result.add_check(field_name, value, validator)
    return result

fastvideo.pipelines.stages.latent_preparation.LatentPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify latent preparation stage outputs.

Source code in fastvideo/pipelines/stages/latent_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Collect the output checks for the latent preparation stage.

    Args:
        batch: The batch produced by ``forward``.
        fastvideo_args: The inference arguments (not used by the checks).

    Returns:
        A ``VerificationResult`` requiring ``latents`` to be a 5-dimensional
        tensor and ``raw_latent_shape`` to be a tuple.
    """
    report = VerificationResult()
    latent_validators = [V.is_tensor, V.with_dims(5)]
    report.add_check("latents", batch.latents, latent_validators)
    report.add_check("raw_latent_shape", batch.raw_latent_shape, V.is_tuple)
    return report

Functions¶

fastvideo.pipelines.stages.longcat_denoising ¶

LongCat-specific denoising stage implementing CFG-zero optimized guidance.

Classes¶

fastvideo.pipelines.stages.longcat_denoising.LongCatDenoisingStage ¶

LongCatDenoisingStage(transformer, scheduler, pipeline=None, transformer_2=None, vae=None)

Bases: DenoisingStage

LongCat denoising stage with CFG-zero optimized guidance scale.

Implements: 1. Optimized CFG scale from CFG-zero paper 2. Negation of noise prediction before scheduler step (flow matching convention) 3. Batched CFG computation (unlike standard FastVideo separate passes)

Source code in fastvideo/pipelines/stages/denoising.py

def __init__(self,
             transformer,
             scheduler,
             pipeline=None,
             transformer_2=None,
             vae=None) -> None:
    """Store the stage's models and select an attention backend.

    Args:
        transformer: Main denoising transformer; its ``hidden_size`` and
            ``num_attention_heads`` determine the attention head size.
        scheduler: Scheduler stored for use by the denoising loop.
        pipeline: Optional owning pipeline, kept only as a weak reference.
        transformer_2: Optional second transformer.
        vae: Optional VAE.
    """
    super().__init__()
    self.transformer = transformer
    self.transformer_2 = transformer_2
    self.scheduler = scheduler
    self.vae = vae
    # Weak reference so the stage does not keep its pipeline alive.
    self.pipeline = None if not pipeline else weakref.ref(pipeline)

    candidate_backends = (
        AttentionBackendEnum.SLIDING_TILE_ATTN,
        AttentionBackendEnum.VIDEO_SPARSE_ATTN,
        AttentionBackendEnum.VMOBA_ATTN,
        AttentionBackendEnum.FLASH_ATTN,
        AttentionBackendEnum.TORCH_SDPA,
        AttentionBackendEnum.SAGE_ATTN_THREE,
    )  # hack
    per_head_dim = (self.transformer.hidden_size //
                    self.transformer.num_attention_heads)
    self.attn_backend = get_attn_backend(
        head_size=per_head_dim,
        dtype=torch.float16,  # TODO(will): hack
        supported_attention_backends=candidate_backends,
    )

Functions¶

fastvideo.pipelines.stages.longcat_denoising.LongCatDenoisingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Run LongCat denoising loop with optimized CFG.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with denoised latents.

Source code in fastvideo/pipelines/stages/longcat_denoising.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Denoise ``batch.latents`` over ``batch.timesteps`` with CFG-zero guidance.

    With classifier-free guidance enabled, the unconditional and conditional
    inputs are concatenated along the batch dimension and run through the
    transformer in one pass; the two halves are then recombined using the
    per-sample optimized scale. The prediction is negated before every
    scheduler step.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with denoised latents.
    """
    # Lazily load the transformer the first time it is needed and register
    # it with the owning pipeline, if that pipeline is still alive.
    if not fastvideo_args.model_loaded["transformer"]:
        from fastvideo.models.model_loader import TransformerLoader
        self.transformer = TransformerLoader().load(
            fastvideo_args.model_paths["transformer"], fastvideo_args)
        owner = self.pipeline() if self.pipeline else None
        if owner:
            owner.add_module("transformer", self.transformer)
        fastvideo_args.model_loaded["transformer"] = True

    # The parameter dtype is read through `.module` when the transformer
    # exposes one (i.e. it is wrapped).
    if hasattr(self.transformer, 'module'):
        param_owner = self.transformer.module
    else:
        param_owner = self.transformer
    target_dtype = next(param_owner.parameters()).dtype
    autocast_enabled = (target_dtype != torch.float32
                        ) and not fastvideo_args.disable_autocast

    # Pull the per-batch inputs.
    latents = batch.latents
    timesteps = batch.timesteps
    prompt_embeds = batch.prompt_embeds[0]  # LongCat uses single encoder
    if batch.prompt_attention_mask:
        prompt_attention_mask = batch.prompt_attention_mask[0]
    else:
        prompt_attention_mask = None
    guidance_scale = batch.guidance_scale
    use_cfg = batch.do_classifier_free_guidance

    # Text conditioning fed to the transformer; with CFG the negative
    # prompt comes first so chunk(2) yields (uncond, cond).
    if use_cfg:
        negative_prompt_embeds = batch.negative_prompt_embeds[0]
        if batch.negative_attention_mask:
            negative_mask = batch.negative_attention_mask[0]
        else:
            negative_mask = None
        encoder_states = torch.cat([negative_prompt_embeds, prompt_embeds],
                                   dim=0)
        encoder_mask = None if prompt_attention_mask is None else torch.cat(
            [negative_mask, prompt_attention_mask], dim=0)
    else:
        encoder_states = prompt_embeds
        encoder_mask = prompt_attention_mask

    with tqdm(total=len(timesteps),
              desc="LongCat Denoising") as progress_bar:
        for step_index, t in enumerate(timesteps):
            # Duplicate the latents so both CFG branches share one pass.
            model_input = torch.cat([latents] * 2) if use_cfg else latents
            model_input = model_input.to(target_dtype)

            # One timestep entry per sample in the (possibly doubled) batch.
            timestep = t.expand(model_input.shape[0]).to(target_dtype)

            batch.is_cfg_negative = False
            with set_forward_context(
                    current_timestep=step_index,
                    attn_metadata=None,
                    forward_batch=batch,
            ), torch.autocast(device_type='cuda',
                              dtype=target_dtype,
                              enabled=autocast_enabled):
                noise_pred = self.transformer(
                    hidden_states=model_input,
                    encoder_hidden_states=encoder_states,
                    timestep=timestep,
                    encoder_attention_mask=encoder_mask,
                )

            if use_cfg:
                uncond_pred, cond_pred = noise_pred.chunk(2)
                num_samples = cond_pred.shape[0]

                # CFG-zero: per-sample scale computed on flattened predictions,
                # reshaped to broadcast over the 5-D prediction.
                st_star = self.optimized_scale(
                    cond_pred.reshape(num_samples, -1),
                    uncond_pred.reshape(num_samples, -1),
                ).view(num_samples, 1, 1, 1, 1)

                scaled_uncond = uncond_pred * st_star
                noise_pred = scaled_uncond + guidance_scale * (
                    cond_pred - scaled_uncond)

            # CRITICAL: Negate noise prediction for flow matching scheduler
            noise_pred = -noise_pred

            # Scheduler step: x_t -> x_t-1
            latents = self.scheduler.step(noise_pred,
                                          t,
                                          latents,
                                          return_dict=False)[0]

            progress_bar.update()

    # Hand the denoised latents back through the batch.
    batch.latents = latents
    return batch

fastvideo.pipelines.stages.longcat_denoising.LongCatDenoisingStage.optimized_scale ¶

optimized_scale(positive_flat, negative_flat) -> Tensor

Calculate optimized scale from CFG-zero paper.

st_star = (v_cond^T * v_uncond) / ||v_uncond||^2

Parameters:

Name	Type	Description	Default
`positive_flat`		Conditional prediction, flattened [B, -1]	required
`negative_flat`		Unconditional prediction, flattened [B, -1]	required

Returns:

Name	Type	Description
`st_star`	`Tensor`	Optimized scale [B, 1]

Source code in fastvideo/pipelines/stages/longcat_denoising.py

def optimized_scale(self, positive_flat, negative_flat) -> torch.Tensor:
    """
    Calculate optimized scale from CFG-zero paper.

    st_star = (v_cond^T * v_uncond) / (||v_uncond||^2 + 1e-8)

    For float16/bfloat16 inputs the reductions run in float32 and the result
    is cast back to the input dtype. In float16 the ``1e-8`` epsilon rounds
    to zero (so an all-zero unconditional prediction would give 0/0 = NaN)
    and the sum of squares overflows above 65504. Other dtypes are computed
    directly in their own precision.

    Args:
        positive_flat: Conditional prediction, flattened [B, -1]
        negative_flat: Unconditional prediction, flattened [B, -1]

    Returns:
        st_star: Optimized scale [B, 1], in the promoted dtype of the inputs.
    """
    out_dtype = torch.result_type(positive_flat, negative_flat)
    low_precision = out_dtype in (torch.float16, torch.bfloat16)
    if low_precision:
        # Accumulate in float32 so the epsilon survives and sums cannot
        # overflow the half-precision range.
        positive_flat = positive_flat.float()
        negative_flat = negative_flat.float()

    # Per-sample dot product between conditional and unconditional predictions.
    dot_product = torch.sum(positive_flat * negative_flat,
                            dim=1,
                            keepdim=True)
    # Squared norm of the unconditional prediction; epsilon guards against 0.
    squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
    # st_star = v_cond^T * v_uncond / ||v_uncond||^2
    st_star = dot_product / squared_norm
    return st_star.to(out_dtype) if low_precision else st_star

Functions¶

fastvideo.pipelines.stages.longcat_refine_init ¶

LongCat refinement initialization stage.

This stage prepares the latent variables for LongCat's 480p->720p refinement by: 1. Loading the stage1 (480p) video 2. Upsampling it to 720p resolution 3. Encoding it with VAE 4. Mixing with noise according to t_thresh

Classes¶

fastvideo.pipelines.stages.longcat_refine_init.LongCatRefineInitStage ¶

LongCatRefineInitStage(vae)

Bases: PipelineStage

Stage for initializing LongCat refinement from a stage1 (480p) video.

This replicates the logic from LongCatVideoPipeline.generate_refine(): - Load stage1_video frames - Upsample spatially and temporally - VAE encode and normalize - Mix with noise according to t_thresh

Source code in fastvideo/pipelines/stages/longcat_refine_init.py

def __init__(self, vae) -> None:
    """Store the VAE used to encode the upsampled stage1 video.

    Args:
        vae: VAE model; ``forward`` calls its ``encode`` method and reads
            its ``parameters()`` and ``config``.
    """
    super().__init__()
    self.vae = vae

Functions¶

fastvideo.pipelines.stages.longcat_refine_init.LongCatRefineInitStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Initialize latents for refinement.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with initialized latents for refinement.

Source code in fastvideo/pipelines/stages/longcat_refine_init.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Initialize latents for refinement.

    Batches with neither ``refine_from`` nor ``stage1_video`` set are
    returned unchanged. Otherwise the stage1 frames are obtained (from
    memory or from disk), resized to a bucket-selected resolution,
    temporally padded, VAE-encoded, normalized and mixed with noise
    according to ``batch.t_thresh``.

    Besides ``latents`` and ``raw_latent_shape``, this overwrites
    ``stage1_video``, ``num_frames``, ``height`` and ``width`` on the batch
    and records ``num_cond_frames_added``, ``num_noise_frames_added``,
    ``new_frame_size_before_padding`` and (when positive)
    ``num_cond_latents``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with initialized latents for refinement.

    Raises:
        ValueError: If ``stage1_video`` is provided but empty.
        FileNotFoundError: If the ``refine_from`` path does not exist.
    """
    refine_from = batch.refine_from
    in_memory_stage1 = batch.stage1_video

    # Only run for refinement tasks: either a path (refine_from) or in-memory video is provided
    if refine_from is None and in_memory_stage1 is None:
        # Not a refinement task, skip
        return batch

    # ------------------------------------------------------------------
    # 1. Obtain stage1 frames (either from disk or from in-memory input)
    # ------------------------------------------------------------------
    # In-memory frames take precedence over the refine_from path.
    if in_memory_stage1 is not None:
        # User provided stage1 frames directly (e.g., from distilled stage output)
        if len(in_memory_stage1) == 0:
            raise ValueError(
                "stage1_video is empty; expected a non-empty list of frames"
            )

        # Only the first frame's type is inspected to decide on conversion.
        if isinstance(in_memory_stage1[0], Image.Image):
            pil_images = in_memory_stage1
        else:
            # Assume numpy arrays or torch tensors with shape [H, W, C]
            pil_images = [
                Image.fromarray(np.array(frame))
                for frame in in_memory_stage1
            ]

        logger.info(
            "Initializing LongCat refinement from in-memory stage1_video (%s frames)",
            len(pil_images))
    else:
        # Path-based refine: load video from disk (original design)
        logger.info("Initializing LongCat refinement from file: %s",
                    refine_from)
        stage1_video_path = Path(refine_from)
        if not stage1_video_path.exists():
            raise FileNotFoundError(
                f"Stage1 video not found: {refine_from}")

        # Load video frames as PIL Images
        pil_images, original_fps = load_video(str(stage1_video_path),
                                              return_fps=True)
        logger.info("Loaded stage1 video: %s frames @ %s fps",
                    len(pil_images), original_fps)

    # Store in batch for reference (use PIL images, same as official demo)
    batch.stage1_video = pil_images

    # Get parameters from batch
    num_frames = len(pil_images)
    spatial_refine_only = batch.spatial_refine_only
    t_thresh = batch.t_thresh
    # Batches without a num_cond_frames attribute are treated as having none.
    num_cond_frames = batch.num_cond_frames if hasattr(
        batch, 'num_cond_frames') else 0

    # Calculate new frame count (temporal upsampling if not spatial_refine_only)
    new_num_frames = num_frames if spatial_refine_only else 2 * num_frames
    logger.info(
        "Refine mode: %s",
        'spatial only' if spatial_refine_only else 'spatial + temporal')

    # Update batch.num_frames to reflect the upsampled count
    batch.num_frames = new_num_frames

    # Use bucket system to select resolution (exactly like LongCat)
    # Calculate scale_factor_spatial considering SP split
    sp_size = fastvideo_args.sp_size if fastvideo_args.sp_size > 0 else 1
    vae_scale_factor_spatial = 8  # VAE spatial downsampling
    patch_size_spatial = 2  # LongCat patch size
    bsa_latent_granularity = 4
    scale_factor_spatial = vae_scale_factor_spatial * patch_size_spatial * bsa_latent_granularity  # 64

    # Calculate optimal split like LongCat (cp_split_hw logic)
    # For sp_size=1: [1,1], max=1
    # For sp_size=2: [1,2], max=2
    # For sp_size=4: [2,2], max=2
    # For sp_size=8: [2,4], max=4
    if sp_size > 1:
        # Get optimal 2D split factors (mimic context_parallel_util.get_optimal_split)
        # Enumerate factor pairs [i, sp_size // i] with i <= sqrt(sp_size) and
        # keep the pair whose two factors are closest to each other.
        factors = []
        for i in range(1, int(sp_size**0.5) + 1):
            if sp_size % i == 0:
                factors.append([i, sp_size // i])
        cp_split_hw = min(factors, key=lambda x: abs(x[0] - x[1]))
        scale_factor_spatial *= max(cp_split_hw)
        logger.info("SP split: sp_size=%s, cp_split_hw=%s, max_split=%s",
                    sp_size, cp_split_hw, max(cp_split_hw))
    else:
        cp_split_hw = [1, 1]

    # Get bucket config and find closest bucket for the input aspect ratio
    bucket_config = get_bucket_config('720p', scale_factor_spatial)

    # Get input aspect ratio from stage1 video
    input_height, input_width = pil_images[0].height, pil_images[0].width
    input_ratio = input_height / input_width

    # Find closest bucket
    # Bucket keys are compared numerically via float(); the first entry of
    # the matched bucket supplies (height, width).
    closest_ratio = min(bucket_config.keys(),
                        key=lambda x: abs(float(x) - input_ratio))
    height, width = bucket_config[closest_ratio][0]

    logger.info("Input aspect ratio: %.2f (%sx%s)", input_ratio,
                input_width, input_height)
    logger.info("Matched bucket ratio: %s -> resolution: %sx%s",
                closest_ratio, width, height)
    logger.info("Target: %sx%s @ %s frames (sp_size=%s, scale_factor=%s)",
                width, height, new_num_frames, sp_size,
                scale_factor_spatial)

    # Override batch height/width with bucket-selected resolution
    batch.height = height
    batch.width = width

    # Convert PIL images to tensor [T, C, H, W]
    stage1_video_tensor = torch.stack([
        torch.from_numpy(np.array(img)).permute(2, 0, 1)  # HWC -> CHW
        for img in pil_images
    ]).float()  # [T, C, H, W]

    # Preprocessing runs on the device and dtype of the first prompt embedding.
    device = batch.prompt_embeds[0].device
    dtype = batch.prompt_embeds[0].dtype
    stage1_video_tensor = stage1_video_tensor.to(device=device, dtype=dtype)

    # Replicate LongCat's exact preprocessing (lines 1227-1235 in pipeline_longcat_video.py)
    # First: spatial interpolation to target (height, width) on [T, C, H, W]
    video_down = F.interpolate(stage1_video_tensor,
                               size=(height, width),
                               mode='bilinear',
                               align_corners=True)

    # Rearrange to [C, T, H, W] and add batch dimension -> [1, C, T, H, W]
    video_down = video_down.permute(1, 0, 2,
                                    3).unsqueeze(0)  # [1, C, T, H, W]
    video_down = video_down / 255.0  # Normalize to [0, 1]

    # Then: temporal+spatial interpolation to (new_num_frames, height, width)
    video_up = F.interpolate(video_down,
                             size=(new_num_frames, height, width),
                             mode='trilinear',
                             align_corners=True)

    # Rescale to [-1, 1] for VAE
    video_up = video_up * 2.0 - 1.0

    logger.info("Upsampled video shape: %s", video_up.shape)

    # Padding logic (exactly like LongCat lines 1237-1255)
    # Only pad temporal dimension to ensure BSA compatibility
    vae_scale_factor_temporal = 4
    # Uses the un-padded num_cond_frames; the padded value is computed below.
    num_noise_frames = video_up.shape[2] - num_cond_frames

    num_cond_latents = 0
    num_cond_frames_added = 0
    if num_cond_frames > 0:
        # Conditioning latents: one for the first frame plus one per group of
        # vae_scale_factor_temporal frames, rounded up to the BSA granularity.
        num_cond_latents = 1 + math.ceil(
            (num_cond_frames - 1) / vae_scale_factor_temporal)
        num_cond_latents = math.ceil(
            num_cond_latents /
            bsa_latent_granularity) * bsa_latent_granularity
        num_cond_frames_added = 1 + (
            num_cond_latents -
            1) * vae_scale_factor_temporal - num_cond_frames
        num_cond_frames = num_cond_frames + num_cond_frames_added

    # Noise latents are likewise rounded up to the BSA granularity.
    num_noise_latents = math.ceil(num_noise_frames /
                                  vae_scale_factor_temporal)
    num_noise_latents = math.ceil(
        num_noise_latents / bsa_latent_granularity) * bsa_latent_granularity
    num_noise_frames_added = num_noise_latents * vae_scale_factor_temporal - num_noise_frames

    if num_cond_frames_added > 0 or num_noise_frames_added > 0:
        logger.info(
            "Padding temporal dimension for BSA: cond_frames+=%s, noise_frames+=%s",
            num_cond_frames_added, num_noise_frames_added)
        # Pad by repeating the first frame at the front and the last frame
        # at the back.
        pad_front = video_up[:, :, 0:1].repeat(1, 1, num_cond_frames_added,
                                               1, 1)
        pad_back = video_up[:, :, -1:].repeat(1, 1, num_noise_frames_added,
                                              1, 1)
        video_up = torch.cat([pad_front, video_up, pad_back], dim=2)
        logger.info("Padded video shape: %s", video_up.shape)

    # Update batch with actual frame count after padding
    batch.num_frames = video_up.shape[2]

    # Store padding info for later cropping (CRITICAL for correct output!)
    batch.num_cond_frames_added = num_cond_frames_added
    batch.num_noise_frames_added = num_noise_frames_added
    batch.new_frame_size_before_padding = new_num_frames

    # Store num_cond_latents for denoising stage
    if num_cond_latents > 0:
        batch.num_cond_latents = num_cond_latents
        logger.info("Will use num_cond_latents=%s during denoising",
                    num_cond_latents)

    logger.info("Padding info: cond+=%s, noise+=%s, original=%s",
                num_cond_frames_added, num_noise_frames_added,
                new_num_frames)

    # VAE encode
    logger.info("Encoding stage1 video with VAE...")
    # Match the VAE's own parameter dtype/device before encoding.
    vae_dtype = next(self.vae.parameters()).dtype
    vae_device = next(self.vae.parameters()).device
    video_up = video_up.to(dtype=vae_dtype, device=vae_device)

    with torch.no_grad():
        latent_dist = self.vae.encode(video_up)
        # Extract tensor from latent distribution
        if hasattr(latent_dist, 'latent_dist'):
            # Nested distribution wrapper
            latent_up = latent_dist.latent_dist.sample()
        elif hasattr(latent_dist, 'sample'):
            # DiagonalGaussianDistribution or similar
            latent_up = latent_dist.sample()
        elif hasattr(latent_dist, 'latents'):
            # Direct latents tensor
            latent_up = latent_dist.latents
        else:
            # Assume it's already a tensor
            latent_up = latent_dist

    # Normalize latents using VAE config (exactly like LongCat)
    # Skipped entirely when the config lacks latents_mean / latents_std.
    if hasattr(self.vae.config, 'latents_mean') and hasattr(
            self.vae.config, 'latents_std'):
        latents_mean = torch.tensor(self.vae.config.latents_mean).view(
            1, self.vae.config.z_dim, 1, 1, 1).to(latent_up.device,
                                                  latent_up.dtype)
        # LongCat uses: 1.0 / latents_std (equivalent to dividing by latents_std)
        # Note: from here on `latents_std` holds the reciprocal (1 / std).
        latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(
            1, self.vae.config.z_dim, 1, 1, 1).to(latent_up.device,
                                                  latent_up.dtype)
        # LongCat: (latents - mean) * (1/std)
        latent_up = (latent_up - latents_mean) * latents_std

    logger.info("Encoded latent shape: %s", latent_up.shape)

    # Mix with noise according to t_thresh
    # latent_up = (1 - t_thresh) * latent_up + t_thresh * noise
    # NOTE(review): the noise is drawn with torch.randn_like and no generator
    # argument, so batch.generator is not consulted here -- confirm intended.
    noise = torch.randn_like(latent_up).contiguous()
    latent_up = (1 - t_thresh) * latent_up + t_thresh * noise

    logger.info("Applied t_thresh=%s noise mixing", t_thresh)

    # Store in batch
    # Latents are cast to the prompt-embedding dtype captured above.
    batch.latents = latent_up.to(dtype)
    batch.raw_latent_shape = latent_up.shape

    logger.info("LongCat refinement initialization complete")

    return batch

Functions¶

fastvideo.pipelines.stages.longcat_refine_timestep ¶

LongCat refinement timestep preparation stage.

This stage prepares special timesteps for LongCat refinement that start from t_thresh.

Classes¶

fastvideo.pipelines.stages.longcat_refine_timestep.LongCatRefineTimestepStage ¶

LongCatRefineTimestepStage(scheduler)

Bases: PipelineStage

Stage for preparing timesteps specific to LongCat refinement.

For refinement, we need to start from t_thresh instead of t=1.0, so we: 1. Generate normal timesteps for num_inference_steps 2. Filter to only keep timesteps < t_thresh * 1000 3. Prepend t_thresh * 1000 as the first timestep

Source code in fastvideo/pipelines/stages/longcat_refine_timestep.py

def __init__(self, scheduler) -> None:
    """Store the scheduler whose timestep schedule this stage rewrites.

    Args:
        scheduler: Scheduler object; ``forward`` calls its
            ``set_timesteps`` and overwrites its ``timesteps`` and
            ``sigmas`` attributes.
    """
    super().__init__()
    self.scheduler = scheduler

Functions¶

fastvideo.pipelines.stages.longcat_refine_timestep.LongCatRefineTimestepStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Prepare refinement-specific timesteps.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with refinement timesteps.

Source code in fastvideo/pipelines/stages/longcat_refine_timestep.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Build the truncated timestep schedule used for LongCat refinement.

    Batches with neither a ``refine_from`` path nor an in-memory
    ``stage1_video`` are returned untouched. Otherwise the scheduler's
    ``timesteps`` and ``sigmas`` are overwritten so that denoising starts
    at ``t_thresh * 1000``, and the same schedule is stored on the batch.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with refinement timesteps.
    """
    # This stage is a no-op unless the batch describes a refinement task.
    is_refine_task = (batch.refine_from is not None
                      or batch.stage1_video is not None)
    if not is_refine_task:
        return batch

    device = get_local_torch_device()
    num_inference_steps = batch.num_inference_steps
    t_thresh = batch.t_thresh

    logger.info("Preparing LongCat refinement timesteps (t_thresh=%s)",
                t_thresh)

    # ------------------------------------------------------------------
    # Step 1: base schedule, matching
    # LongCatVideoPipeline.get_timesteps_sigmas (non-distill):
    # sigmas = linspace(1, 0.001, num_inference_steps), built on CPU
    # because scheduler.set_timesteps expects CPU-convertible sigmas.
    # ------------------------------------------------------------------
    base_sigmas = torch.linspace(1.0,
                                 0.001,
                                 num_inference_steps,
                                 dtype=torch.float32,
                                 device="cpu")
    # The scheduler derives its internal timesteps from these sigmas.
    self.scheduler.set_timesteps(num_inference_steps,
                                 sigmas=base_sigmas,
                                 device=device)
    base_timesteps = self.scheduler.timesteps

    # ------------------------------------------------------------------
    # Step 2: crop at t_thresh exactly like generate_refine:
    # timesteps = [t_thresh*1000] + [t for t in base_timesteps if t < t_thresh*1000]
    # sigmas = timesteps / 1000  (with trailing zero)
    # ------------------------------------------------------------------
    start_timestep = torch.tensor(t_thresh * 1000.0,
                                  dtype=base_timesteps.dtype,
                                  device=device)
    below_threshold = base_timesteps < start_timestep
    timesteps = torch.cat(
        [start_timestep.unsqueeze(0), base_timesteps[below_threshold]])

    # Install the custom schedule and its sigmas on the scheduler.
    self.scheduler.timesteps = timesteps
    trailing_zero = torch.zeros(1, device=device)
    self.scheduler.sigmas = torch.cat([timesteps / 1000.0, trailing_zero])

    logger.info("Refinement timesteps: %s steps starting from t=%s",
                len(timesteps), t_thresh)
    logger.info("First few timesteps: %s", timesteps[:5].tolist())

    # Downstream stages (denoising) read the schedule from the batch.
    batch.timesteps = timesteps

    return batch

Functions¶

fastvideo.pipelines.stages.stepvideo_encoding ¶

Classes¶

fastvideo.pipelines.stages.stepvideo_encoding.StepvideoPromptEncodingStage ¶

StepvideoPromptEncodingStage(stepllm, clip)

Bases: PipelineStage

Stage for encoding prompts using the remote caption API.

This stage applies the magic string transformations and calls the remote caption service asynchronously to get: - primary prompt embeddings, - an attention mask, - and a clip embedding.

Source code in fastvideo/pipelines/stages/stepvideo_encoding.py

def __init__(self, stepllm, clip) -> None:
    """
    Keep references to the two encoders this stage works with.

    Args:
        stepllm: Stored as ``self.stepllm``; presumably the Step LLM text
            encoder (confirm against the pipeline that builds this stage).
        clip: Stored as ``self.clip``; presumably the CLIP text encoder
            (confirm against the pipeline that builds this stage).
    """
    super().__init__()
    self.clip = clip
    self.stepllm = stepllm

Functions¶

fastvideo.pipelines.stages.stepvideo_encoding.StepvideoPromptEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify stepvideo encoding stage inputs.

Source code in fastvideo/pipelines/stages/stepvideo_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check the inputs required by the stepvideo encoding stage.

    The only requirement registered here is that ``batch.prompt`` passes
    ``V.string_not_empty``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult holding the registered check.
    """
    checks = VerificationResult()
    checks.add_check("prompt", batch.prompt, V.string_not_empty)
    return checks

fastvideo.pipelines.stages.stepvideo_encoding.StepvideoPromptEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify stepvideo encoding stage outputs.

Source code in fastvideo/pipelines/stages/stepvideo_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check the tensors produced by the stepvideo encoding stage.

    Every listed batch field must pass ``V.is_tensor`` and have the stated
    number of dimensions: 3 for the prompt embeddings, 2 for the attention
    masks and clip embeddings.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult holding one check per output field.
    """
    # (check name, value to validate, required number of dimensions)
    expected_outputs = (
        ("prompt_embeds", batch.prompt_embeds, 3),
        ("negative_prompt_embeds", batch.negative_prompt_embeds, 3),
        ("prompt_attention_mask", batch.prompt_attention_mask, 2),
        ("negative_attention_mask", batch.negative_attention_mask, 2),
        ("clip_embedding_pos", batch.clip_embedding_pos, 2),
        ("clip_embedding_neg", batch.clip_embedding_neg, 2),
    )

    checks = VerificationResult()
    for field_name, field_value, ndims in expected_outputs:
        checks.add_check(field_name, field_value,
                         [V.is_tensor, V.with_dims(ndims)])
    return checks

Functions¶

fastvideo.pipelines.stages.text_encoding ¶

Prompt encoding stages for diffusion pipelines.

This module contains implementations of prompt encoding stages for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.text_encoding.TextEncodingStage ¶

TextEncodingStage(text_encoders, tokenizers)

Bases: PipelineStage

Stage for encoding text prompts into embeddings for diffusion models.

This stage handles the encoding of text prompts into the embedding space expected by the diffusion model.

Initialize the prompt encoding stage.

Parameters:

Name	Type	Description	Default
`text_encoders`		Text encoder modules, addressed by encoder index; must match `tokenizers` in length.	required
`tokenizers`		Tokenizers paired by index with `text_encoders`.	required

Source code in fastvideo/pipelines/stages/text_encoding.py

def __init__(self, text_encoders, tokenizers) -> None:
    """
    Initialize the prompt encoding stage.

    Args:
        text_encoders: Text encoder modules, addressed by encoder index.
            ``encode_text`` and ``forward`` assert that this has the same
            length as ``tokenizers`` and as
            ``pipeline_config.text_encoder_configs``.
        tokenizers: Tokenizers paired by index with ``text_encoders``.
    """
    super().__init__()
    self.tokenizers = tokenizers
    self.text_encoders = text_encoders

Functions¶

fastvideo.pipelines.stages.text_encoding.TextEncodingStage.encode_text ¶

encode_text(text: str | list[str], fastvideo_args: FastVideoArgs, encoder_index: int | list[int] | None = None, return_attention_mask: bool = False, return_type: str = 'list', device: device | str | None = None, dtype: dtype | None = None, max_length: int | None = None, truncation: bool | None = None, padding: bool | str | None = None)

Encode plain text using selected text encoder(s) and return embeddings.

Parameters:

Name	Type	Description	Default
`text`	`str \| list[str]`	A single string or a list of strings to encode.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments providing pipeline config, including tokenizer and encoder settings, preprocess and postprocess functions.	required
`encoder_index`	`int \| list[int] \| None`	Encoder selector by index. Accepts an int or list of ints.	`None`
`return_attention_mask`	`bool`	If True, also return attention masks for each selected encoder.	`False`
`return_type`	`str`	"list" (default) returns a list aligned with selection; "dict" returns a dict keyed by encoder index as a string; "stack" stacks along a new first dimension (requires matching shapes).	`'list'`
`device`	`device \| str \| None`	Optional device override for inputs; defaults to local torch device.	`None`
`dtype`	`dtype \| None`	Optional dtype to cast returned embeddings to.	`None`
`max_length`	`int \| None`	Optional per-call tokenizer override.	`None`
`truncation`	`bool \| None`	Optional per-call tokenizer override.	`None`
`padding`	`bool \| str \| None`	Optional per-call tokenizer override.	`None`

Returns:

Type	Description
	Depending on return_type and return_attention_mask:
	list: List[Tensor] or (List[Tensor], List[Tensor])
	dict: Dict[str, Tensor] or (Dict[str, Tensor], Dict[str, Tensor])
	stack: Tensor of shape [num_encoders, ...] or a tuple with stacked attention masks

Source code in fastvideo/pipelines/stages/text_encoding.py

@torch.no_grad()
def encode_text(
    self,
    text: str | list[str],
    fastvideo_args: FastVideoArgs,
    encoder_index: int | list[int] | None = None,
    return_attention_mask: bool = False,
    return_type: str = "list",  # one of: "list", "dict", "stack"
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
    max_length: int | None = None,
    truncation: bool | None = None,
    padding: bool | str | None = None,
):
    """
    Encode plain text using selected text encoder(s) and return embeddings.

    Args:
        text: A single string or a list of strings to encode.
        fastvideo_args: The inference arguments providing pipeline config,
            including tokenizer and encoder settings, preprocess and postprocess
            functions.
        encoder_index: Encoder selector by index. Accepts an int or list of ints.
        return_attention_mask: If True, also return attention masks for each
            selected encoder.
        return_type: "list" (default) returns a list aligned with selection;
            "dict" returns a dict keyed by encoder index as a string; "stack" stacks along a
            new first dimension (requires matching shapes).
        device: Optional device override for inputs; defaults to local torch device.
        dtype: Optional dtype to cast returned embeddings to.
        max_length: Optional per-call tokenizer override.
        truncation: Optional per-call tokenizer override.
        padding: Optional per-call tokenizer override.

    Returns:
        Depending on return_type and return_attention_mask:
        - list: List[Tensor] or (List[Tensor], List[Tensor])
        - dict: Dict[str, Tensor] or (Dict[str, Tensor], Dict[str, Tensor])
        - stack: Tensor of shape [num_encoders, ...] or a tuple with stacked
          attention masks
    """

    assert len(self.tokenizers) == len(self.text_encoders)
    assert len(self.text_encoders) == len(
        fastvideo_args.pipeline_config.text_encoder_configs)

    # Resolve selection into indices
    encoder_cfgs = fastvideo_args.pipeline_config.text_encoder_configs
    if encoder_index is None:
        indices: list[int] = [0]
    elif isinstance(encoder_index, int):
        indices = [encoder_index]
    else:
        indices = list(encoder_index)
    # validate range
    num_encoders = len(self.text_encoders)
    for idx in indices:
        if idx < 0 or idx >= num_encoders:
            raise IndexError(
                f"encoder index {idx} out of range [0, {num_encoders-1}]")

    # Validate indices are within range
    num_encoders = len(self.text_encoders)

    # Normalize input to list[str]
    assert isinstance(text, str | list)
    if isinstance(text, str):
        texts: list[str] = [text]
    else:
        texts = text

    embeds_list: list[torch.Tensor] = []
    attn_masks_list: list[torch.Tensor] = []

    preprocess_funcs = fastvideo_args.pipeline_config.preprocess_text_funcs
    postprocess_funcs = fastvideo_args.pipeline_config.postprocess_text_funcs
    encoder_cfgs = fastvideo_args.pipeline_config.text_encoder_configs

    if return_type not in ("list", "dict", "stack"):
        raise ValueError(
            f"Invalid return_type '{return_type}'. Expected one of: 'list', 'dict', 'stack'"
        )

    target_device = device if device is not None else get_local_torch_device(
    )

    for i in indices:
        tokenizer = self.tokenizers[i]
        text_encoder = self.text_encoders[i]
        encoder_config = encoder_cfgs[i]
        preprocess_func = preprocess_funcs[i]
        postprocess_func = postprocess_funcs[i]

        tok_kwargs = dict(encoder_config.tokenizer_kwargs)
        if max_length is not None:
            tok_kwargs["max_length"] = max_length
        elif hasattr(fastvideo_args.pipeline_config,
                     "text_encoder_max_lengths"):
            tok_kwargs[
                "max_length"] = fastvideo_args.pipeline_config.text_encoder_max_lengths[
                    i]

        if truncation is not None:
            tok_kwargs["truncation"] = truncation
        if padding is not None:
            tok_kwargs["padding"] = padding

        processed_texts: list[str] = []
        for prompt_str in texts:
            processed_text = preprocess_func(prompt_str)
            if processed_text is not None:
                processed_texts.append(processed_text)
            else:
                # Assuming batch_size = 1
                prompt_embeds = torch.zeros((1, tok_kwargs["max_length"],
                                             encoder_config.hidden_size),
                                            device=target_device)
                attention_mask = torch.zeros((1, tok_kwargs["max_length"]),
                                             device=target_device,
                                             dtype=torch.int64)
                embeds_list.append(prompt_embeds)
                attn_masks_list.append(attention_mask)
                return self.return_embeds(embeds_list, attn_masks_list,
                                          return_type,
                                          return_attention_mask, indices)

        if encoder_config.is_chat_model:
            text_inputs = tokenizer.apply_chat_template(
                processed_texts, **tok_kwargs).to(target_device)
        else:
            text_inputs = tokenizer(processed_texts,
                                    **tok_kwargs).to(target_device)

        input_ids = text_inputs["input_ids"]
        attention_mask = text_inputs["attention_mask"]

        with set_forward_context(current_timestep=0, attn_metadata=None):
            outputs = text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
            )

        try:
            prompt_embeds = postprocess_func(outputs)
        except Exception:
            prompt_embeds, attention_mask = postprocess_func(
                outputs, attention_mask)

        if dtype is not None:
            prompt_embeds = prompt_embeds.to(dtype=dtype)
        embeds_list.append(prompt_embeds)
        if return_attention_mask:
            attn_masks_list.append(attention_mask)

    return self.return_embeds(embeds_list, attn_masks_list, return_type,
                              return_attention_mask, indices)

fastvideo.pipelines.stages.text_encoding.TextEncodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Encode the prompt into text encoder hidden states.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with encoded prompt embeddings.

Source code in fastvideo/pipelines/stages/text_encoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Run every text encoder on the prompt and store the results on the batch.

    The positive prompt is always encoded. The negative prompt is encoded
    only when ``batch.do_classifier_free_guidance`` is truthy. Embeddings
    are appended to the existing batch lists; attention masks are appended
    only when the corresponding batch list is not None.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The same batch, with encoded prompt embeddings added.
    """
    encoder_count = len(self.text_encoders)
    assert len(self.tokenizers) == encoder_count
    assert encoder_count == len(
        fastvideo_args.pipeline_config.text_encoder_configs)
    assert batch.prompt is not None

    # Select every available encoder, in order.
    every_encoder: list[int] = list(range(encoder_count))

    positive_embeds, positive_masks = self.encode_text(
        batch.prompt,
        fastvideo_args,
        encoder_index=every_encoder,
        return_attention_mask=True,
    )
    batch.prompt_embeds.extend(positive_embeds)
    if batch.prompt_attention_mask is not None:
        batch.prompt_attention_mask.extend(positive_masks)

    # Without classifier-free guidance there is nothing more to encode.
    if not batch.do_classifier_free_guidance:
        return batch

    assert isinstance(batch.negative_prompt, str)
    negative_embeds, negative_masks = self.encode_text(
        batch.negative_prompt,
        fastvideo_args,
        encoder_index=every_encoder,
        return_attention_mask=True,
    )

    assert batch.negative_prompt_embeds is not None
    batch.negative_prompt_embeds.extend(negative_embeds)
    if batch.negative_attention_mask is not None:
        batch.negative_attention_mask.extend(negative_masks)

    return batch

fastvideo.pipelines.stages.text_encoding.TextEncodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify text encoding stage inputs.

Source code in fastvideo/pipelines/stages/text_encoding.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check the inputs required by the text encoding stage.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult holding one check per validated field.
    """
    # NOTE: negative_prompt is not validated here, even when
    # classifier-free guidance is enabled.
    checks = VerificationResult()
    for field_name, field_value, rule in (
        ("prompt", batch.prompt, V.string_or_list_strings),
        ("do_classifier_free_guidance", batch.do_classifier_free_guidance,
         V.bool_value),
        ("prompt_embeds", batch.prompt_embeds, V.is_list),
        ("negative_prompt_embeds", batch.negative_prompt_embeds,
         V.none_or_list),
    ):
        checks.add_check(field_name, field_value, rule)
    return checks

fastvideo.pipelines.stages.text_encoding.TextEncodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify text encoding stage outputs.

Source code in fastvideo/pipelines/stages/text_encoding.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check the embeddings produced by the text encoding stage.

    ``prompt_embeds`` must always be a list of tensors with at least two
    dimensions. ``negative_prompt_embeds`` is held to the same rule only
    when classifier-free guidance is enabled on the batch.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult holding both checks.
    """

    def negative_embeds_ok(candidate):
        # The CFG flag is read when the check runs, not when it is registered.
        return (not batch.do_classifier_free_guidance
                or V.list_of_tensors_with_min_dims(candidate, 2))

    checks = VerificationResult()
    checks.add_check("prompt_embeds", batch.prompt_embeds,
                     V.list_of_tensors_min_dims(2))
    checks.add_check("negative_prompt_embeds", batch.negative_prompt_embeds,
                     negative_embeds_ok)
    return checks

Functions¶

fastvideo.pipelines.stages.timestep_preparation ¶

Timestep preparation stages for diffusion pipelines.

This module contains implementations of timestep preparation stages for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.timestep_preparation.TimestepPreparationStage ¶

TimestepPreparationStage(scheduler)

Bases: PipelineStage

Stage for preparing timesteps for the diffusion process.

This stage handles the preparation of the timestep sequence that will be used during the diffusion process.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def __init__(self, scheduler) -> None:
    """
    Initialize the timestep preparation stage.

    Args:
        scheduler: Scheduler object whose ``set_timesteps`` method and
            ``timesteps`` attribute are used by ``forward``.
    """
    # Run the base-class initializer, as the other pipeline stages do.
    super().__init__()
    self.scheduler = scheduler

Functions¶

fastvideo.pipelines.stages.timestep_preparation.TimestepPreparationStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Prepare timesteps for the diffusion process.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch information.	required
`fastvideo_args`	`FastVideoArgs`	The inference arguments.	required

Returns:

Type	Description
`ForwardBatch`	The batch with prepared timesteps.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Configure the scheduler and publish its timestep schedule on the batch.

    Exactly one source drives the schedule: custom ``batch.timesteps``,
    custom ``batch.sigmas``, or ``batch.num_inference_steps`` when neither
    custom value is set. ``batch.n_tokens`` is forwarded only when it is not
    None and the scheduler's ``set_timesteps`` accepts an ``n_tokens``
    parameter.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments.

    Returns:
        The batch with ``timesteps`` replaced by ``scheduler.timesteps``.

    Raises:
        ValueError: If both custom timesteps and sigmas are given, or if the
            scheduler's ``set_timesteps`` lacks the parameter needed for the
            custom schedule.
    """
    sched = self.scheduler
    target_device = get_local_torch_device()
    step_count = batch.num_inference_steps
    custom_timesteps = batch.timesteps
    custom_sigmas = batch.sigmas
    token_count = batch.n_tokens

    def set_timesteps_accepts(param_name: str) -> bool:
        # Probe the scheduler's signature rather than assuming support.
        return param_name in inspect.signature(
            sched.set_timesteps).parameters

    # Optional keyword arguments forwarded to set_timesteps
    optional_kwargs = {}
    if token_count is not None and set_timesteps_accepts("n_tokens"):
        optional_kwargs["n_tokens"] = token_count

    if custom_timesteps is not None and custom_sigmas is not None:
        raise ValueError(
            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
        )

    if custom_timesteps is not None:
        if not set_timesteps_accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {sched.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        # Tensors are moved to CPU first (for numpy conversion in scheduler)
        scheduler_input = (custom_timesteps.cpu() if isinstance(
            custom_timesteps, torch.Tensor) else custom_timesteps)
        sched.set_timesteps(timesteps=scheduler_input,
                            device=target_device,
                            **optional_kwargs)
    elif custom_sigmas is not None:
        if not set_timesteps_accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {sched.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        sched.set_timesteps(sigmas=custom_sigmas,
                            device=target_device,
                            **optional_kwargs)
    else:
        sched.set_timesteps(step_count,
                            device=target_device,
                            **optional_kwargs)

    # Whichever path ran, the scheduler now owns the final schedule.
    batch.timesteps = sched.timesteps
    return batch

fastvideo.pipelines.stages.timestep_preparation.TimestepPreparationStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify timestep preparation stage inputs.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def verify_input(self, batch: ForwardBatch,
                 fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check the inputs required by the timestep preparation stage.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult holding one check per validated field.
    """
    checks = VerificationResult()
    for field_name, field_value, rule in (
        ("num_inference_steps", batch.num_inference_steps, V.positive_int),
        ("timesteps", batch.timesteps, V.none_or_tensor),
        ("sigmas", batch.sigmas, V.none_or_list),
        ("n_tokens", batch.n_tokens, V.none_or_positive_int),
    ):
        checks.add_check(field_name, field_value, rule)
    return checks

fastvideo.pipelines.stages.timestep_preparation.TimestepPreparationStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify timestep preparation stage outputs.

Source code in fastvideo/pipelines/stages/timestep_preparation.py

def verify_output(self, batch: ForwardBatch,
                  fastvideo_args: FastVideoArgs) -> VerificationResult:
    """
    Check that the stage left a one-dimensional tensor in ``batch.timesteps``.

    Args:
        batch: The current batch information.
        fastvideo_args: The inference arguments (not consulted here).

    Returns:
        A VerificationResult holding the timesteps check.
    """
    timestep_rules = [V.is_tensor, V.with_dims(1)]
    checks = VerificationResult()
    checks.add_check("timesteps", batch.timesteps, timestep_rules)
    return checks

Functions¶

fastvideo.pipelines.stages.utils ¶

Utility functions for pipeline stages.

Functions¶

fastvideo.pipelines.stages.utils.retrieve_timesteps ¶

retrieve_timesteps(scheduler: Any, num_inference_steps: int | None = None, device: str | device | None = None, timesteps: list[int] | None = None, sigmas: list[float] | None = None, **kwargs: Any) -> tuple[Any, int]

Calls the scheduler's set_timesteps method and retrieves timesteps from the scheduler after the call. Handles custom timesteps. Any kwargs will be supplied to scheduler.set_timesteps.

Parameters:

Name	Type	Description	Default
`scheduler`	`SchedulerMixin`	The scheduler to get timesteps from.	required
`num_inference_steps`	`int`	The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` must be `None`.	`None`
`device`	`str` or `torch.device`, optional	The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.	`None`
`timesteps`	`List[int]`, optional	Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, `num_inference_steps` and `sigmas` must be `None`.	`None`
`sigmas`	`List[float]`, optional	Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`.	`None`

Returns:

Type	Description
`tuple[Any, int]`	A tuple where the first element is the timestep schedule and the second element is the number of inference steps.

Source code in fastvideo/pipelines/stages/utils.py

def retrieve_timesteps(
    scheduler: Any,
    num_inference_steps: int | None = None,
    device: str | torch.device | None = None,
    timesteps: list[int] | None = None,
    sigmas: list[float] | None = None,
    **kwargs: Any,
) -> tuple[Any, int]:
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError(
            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
        )
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(
            inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        if timesteps is None:
            raise ValueError("scheduler.timesteps is None after set_timesteps")
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(
            inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        if timesteps is None:
            raise ValueError("scheduler.timesteps is None after set_timesteps")
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        if timesteps is None:
            raise ValueError("scheduler.timesteps is None after set_timesteps")
        num_inference_steps = len(timesteps)
    return timesteps, num_inference_steps

fastvideo.pipelines.stages.validators ¶

Common validators for pipeline stage verification.

This module provides reusable validation functions that can be used across all pipeline stages for input/output verification.

Classes¶

fastvideo.pipelines.stages.validators.StageValidators ¶

Common validators for pipeline stages.

Functions¶

fastvideo.pipelines.stages.validators.StageValidators.bool_value staticmethod ¶

bool_value(value: Any) -> bool

Check if value is a boolean.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def bool_value(value: Any) -> bool:
    """Check if value is a boolean."""
    return isinstance(value, bool)

fastvideo.pipelines.stages.validators.StageValidators.divisible staticmethod ¶

divisible(divisor: int) -> Callable[[Any], bool]

Return a validator that checks if value is divisible by divisor.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def divisible(divisor: int) -> Callable[[Any], bool]:
    """Build a one-argument validator that delegates to ``divisible_by`` with a fixed *divisor*."""
    return lambda value: StageValidators.divisible_by(value, divisor)

fastvideo.pipelines.stages.validators.StageValidators.divisible_by staticmethod ¶

divisible_by(value: Any, divisor: int) -> bool

Check if value is divisible by divisor.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def divisible_by(value: Any, divisor: int) -> bool:
    """Check if value is divisible by divisor."""
    return value is not None and isinstance(value,
                                            int) and value % divisor == 0

fastvideo.pipelines.stages.validators.StageValidators.generator_or_list_generators staticmethod ¶

generator_or_list_generators(value: Any) -> bool

Check if value is a Generator or list of Generators.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def generator_or_list_generators(value: Any) -> bool:
    """Check if value is a Generator or list of Generators."""
    if isinstance(value, torch.Generator):
        return True
    if isinstance(value, list):
        return all(isinstance(item, torch.Generator) for item in value)
    return False

fastvideo.pipelines.stages.validators.StageValidators.is_list staticmethod ¶

is_list(value: Any) -> bool

Check if value is a list (can be empty).

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def is_list(value: Any) -> bool:
    """Check if value is a list (can be empty)."""
    return isinstance(value, list)

fastvideo.pipelines.stages.validators.StageValidators.is_tensor staticmethod ¶

is_tensor(value: Any) -> bool

Check if value is a torch tensor and doesn't contain NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def is_tensor(value: Any) -> bool:
    """Check if value is a torch tensor and doesn't contain NaN values."""
    if not isinstance(value, torch.Tensor):
        return False
    return not torch.isnan(value).any().item()

fastvideo.pipelines.stages.validators.StageValidators.is_tuple staticmethod ¶

is_tuple(value: Any) -> bool

Check if value is a tuple.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def is_tuple(value: Any) -> bool:
    """Check if value is a tuple."""
    return isinstance(value, tuple)

fastvideo.pipelines.stages.validators.StageValidators.list_length staticmethod ¶

list_length(value: Any, length: int) -> bool

Check if list has specific length.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_length(value: Any, length: int) -> bool:
    """Check if list has specific length."""
    return isinstance(value, list) and len(value) == length

fastvideo.pipelines.stages.validators.StageValidators.list_min_length staticmethod ¶

list_min_length(value: Any, min_length: int) -> bool

Check if list has at least min_length items.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_min_length(value: Any, min_length: int) -> bool:
    """Check if list has at least min_length items."""
    return isinstance(value, list) and len(value) >= min_length

fastvideo.pipelines.stages.validators.StageValidators.list_not_empty staticmethod ¶

list_not_empty(value: Any) -> bool

Check if value is a non-empty list.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_not_empty(value: Any) -> bool:
    """Check if value is a non-empty list."""
    return isinstance(value, list) and len(value) > 0

fastvideo.pipelines.stages.validators.StageValidators.list_of_tensors staticmethod ¶

list_of_tensors(value: Any) -> bool

Check if value is a non-empty list where all items are tensors without NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_of_tensors(value: Any) -> bool:
    """Check if value is a non-empty list where all items are tensors without NaN values."""
    if not isinstance(value, list) or len(value) == 0:
        return False
    for item in value:
        if not isinstance(item, torch.Tensor):
            return False
        if torch.isnan(item).any().item():
            return False
    return True

fastvideo.pipelines.stages.validators.StageValidators.list_of_tensors_dims staticmethod ¶

list_of_tensors_dims(dims: int) -> Callable[[Any], bool]

Return a validator that checks if value is a list of tensors with specific dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_of_tensors_dims(dims: int) -> Callable[[Any], bool]:
    """Build a one-argument validator that delegates to ``list_of_tensors_with_dims`` with a fixed *dims*."""
    return lambda value: StageValidators.list_of_tensors_with_dims(
        value, dims)

fastvideo.pipelines.stages.validators.StageValidators.list_of_tensors_min_dims staticmethod ¶

list_of_tensors_min_dims(min_dims: int) -> Callable[[Any], bool]

Return a validator that checks if value is a list of tensors with at least min_dims dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_of_tensors_min_dims(min_dims: int) -> Callable[[Any], bool]:
    """Return a validator that checks if value is a list of tensors with at least min_dims dimensions and no NaN values."""

    def validator(value: Any) -> bool:
        return StageValidators.list_of_tensors_with_min_dims(
            value, min_dims)

    return validator

fastvideo.pipelines.stages.validators.StageValidators.list_of_tensors_with_dims staticmethod ¶

list_of_tensors_with_dims(value: Any, dims: int) -> bool

Check if value is a non-empty list where all items are tensors with specific dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_of_tensors_with_dims(value: Any, dims: int) -> bool:
    """Check if value is a non-empty list where all items are tensors with specific dimensions and no NaN values."""
    if not isinstance(value, list) or len(value) == 0:
        return False
    for item in value:
        if not isinstance(item, torch.Tensor):
            return False
        if item.dim() != dims:
            return False
        if torch.isnan(item).any().item():
            return False
    return True

fastvideo.pipelines.stages.validators.StageValidators.list_of_tensors_with_min_dims staticmethod ¶

list_of_tensors_with_min_dims(value: Any, min_dims: int) -> bool

Check if value is a non-empty list where all items are tensors with at least min_dims dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def list_of_tensors_with_min_dims(value: Any, min_dims: int) -> bool:
    """Check if value is a non-empty list where all items are tensors with at least min_dims dimensions and no NaN values."""
    if not isinstance(value, list) or len(value) == 0:
        return False
    for item in value:
        if not isinstance(item, torch.Tensor):
            return False
        if item.dim() < min_dims:
            return False
        if torch.isnan(item).any().item():
            return False
    return True

fastvideo.pipelines.stages.validators.StageValidators.min_dims staticmethod ¶

min_dims(min_dims: int) -> Callable[[Any], bool]

Return a validator that checks if tensor has at least min_dims dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def min_dims(min_dims: int) -> Callable[[Any], bool]:
    """Return a validator that checks if tensor has at least min_dims dimensions and no NaN values."""

    def validator(value: Any) -> bool:
        return StageValidators.tensor_min_dims(value, min_dims)

    return validator

fastvideo.pipelines.stages.validators.StageValidators.non_negative_float staticmethod ¶

non_negative_float(value: Any) -> bool

Check if value is a non-negative number (int or float).

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def non_negative_float(value: Any) -> bool:
    """Check if value is a non-negative float."""
    return isinstance(value, int | float) and value >= 0

fastvideo.pipelines.stages.validators.StageValidators.none_or_list staticmethod ¶

none_or_list(value: Any) -> bool

Check if value is None or a list.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def none_or_list(value: Any) -> bool:
    """Check if value is None or a list."""
    return value is None or isinstance(value, list)

fastvideo.pipelines.stages.validators.StageValidators.none_or_positive_int staticmethod ¶

none_or_positive_int(value: Any) -> bool

Check if value is None or a positive integer.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def none_or_positive_int(value: Any) -> bool:
    """Check if value is None or a positive integer."""
    return value is None or (isinstance(value, int) and value > 0)

fastvideo.pipelines.stages.validators.StageValidators.none_or_tensor staticmethod ¶

none_or_tensor(value: Any) -> bool

Check if value is None or a tensor without NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def none_or_tensor(value: Any) -> bool:
    """Check if value is None or a tensor without NaN values."""
    if value is None:
        return True
    if not isinstance(value, torch.Tensor):
        return False
    return not torch.isnan(value).any().item()

fastvideo.pipelines.stages.validators.StageValidators.none_or_tensor_with_dims staticmethod ¶

none_or_tensor_with_dims(dims: int) -> Callable[[Any], bool]

Return a validator that checks if value is None or a tensor with specific dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def none_or_tensor_with_dims(dims: int) -> Callable[[Any], bool]:
    """Return a validator that checks if value is None or a tensor with specific dimensions and no NaN values."""

    def validator(value: Any) -> bool:
        if value is None:
            return True
        if not isinstance(value, torch.Tensor):
            return False
        if value.dim() != dims:
            return False
        return not torch.isnan(value).any().item()

    return validator

fastvideo.pipelines.stages.validators.StageValidators.not_none staticmethod ¶

not_none(value: Any) -> bool

Check if value is not None.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def not_none(value: Any) -> bool:
    """Check if value is not None."""
    return value is not None

fastvideo.pipelines.stages.validators.StageValidators.positive_float staticmethod ¶

positive_float(value: Any) -> bool

Check if value is a positive number (int or float).

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def positive_float(value: Any) -> bool:
    """Check if value is a positive float."""
    return isinstance(value, int | float) and value > 0

fastvideo.pipelines.stages.validators.StageValidators.positive_int staticmethod ¶

positive_int(value: Any) -> bool

Check if value is a positive integer.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def positive_int(value: Any) -> bool:
    """Check if value is a positive integer."""
    return isinstance(value, int) and value > 0

fastvideo.pipelines.stages.validators.StageValidators.positive_int_divisible staticmethod ¶

positive_int_divisible(divisor: int) -> Callable[[Any], bool]

Return a validator that checks if value is a positive integer divisible by divisor.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def positive_int_divisible(divisor: int) -> Callable[[Any], bool]:
    """Return a validator that checks if value is a positive integer divisible by divisor."""

    def validator(value: Any) -> bool:
        return (isinstance(value, int) and value > 0
                and StageValidators.divisible_by(value, divisor))

    return validator

fastvideo.pipelines.stages.validators.StageValidators.string_not_empty staticmethod ¶

string_not_empty(value: Any) -> bool

Check if value is a non-empty string.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def string_not_empty(value: Any) -> bool:
    """Check if value is a non-empty string."""
    return isinstance(value, str) and len(value.strip()) > 0

fastvideo.pipelines.stages.validators.StageValidators.string_or_list_strings staticmethod ¶

string_or_list_strings(value: Any) -> bool

Check if value is a string or list of strings.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def string_or_list_strings(value: Any) -> bool:
    """Check if value is a string or list of strings."""
    if isinstance(value, str):
        return True
    if isinstance(value, list):
        return all(isinstance(item, str) for item in value)
    return False

fastvideo.pipelines.stages.validators.StageValidators.tensor_min_dims staticmethod ¶

tensor_min_dims(value: Any, min_dims: int) -> bool

Check if value is a tensor with at least min_dims dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def tensor_min_dims(value: Any, min_dims: int) -> bool:
    """Check if value is a tensor with at least min_dims dimensions and no NaN values."""
    if not isinstance(value, torch.Tensor):
        return False
    if value.dim() < min_dims:
        return False
    return not torch.isnan(value).any().item()

fastvideo.pipelines.stages.validators.StageValidators.tensor_shape_matches staticmethod ¶

tensor_shape_matches(value: Any, expected_shape: tuple) -> bool

Check if tensor shape matches expected shape (None for any size) and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def tensor_shape_matches(value: Any, expected_shape: tuple) -> bool:
    """Check if tensor shape matches expected shape (None for any size) and no NaN values."""
    if not isinstance(value, torch.Tensor):
        return False
    if len(value.shape) != len(expected_shape):
        return False
    for actual, expected in zip(value.shape, expected_shape, strict=True):
        if expected is not None and actual != expected:
            return False
    return not torch.isnan(value).any().item()

fastvideo.pipelines.stages.validators.StageValidators.tensor_with_dims staticmethod ¶

tensor_with_dims(value: Any, dims: int) -> bool

Check if value is a tensor with specific dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def tensor_with_dims(value: Any, dims: int) -> bool:
    """Check if value is a tensor with specific dimensions and no NaN values."""
    if not isinstance(value, torch.Tensor):
        return False
    if value.dim() != dims:
        return False
    return not torch.isnan(value).any().item()

fastvideo.pipelines.stages.validators.StageValidators.with_dims staticmethod ¶

with_dims(dims: int) -> Callable[[Any], bool]

Return a validator that checks if tensor has specific dimensions and no NaN values.

Source code in fastvideo/pipelines/stages/validators.py

@staticmethod
def with_dims(dims: int) -> Callable[[Any], bool]:
    """Return a validator that checks if tensor has specific dimensions and no NaN values."""

    def validator(value: Any) -> bool:
        return StageValidators.tensor_with_dims(value, dims)

    return validator

fastvideo.pipelines.stages.validators.ValidationFailure ¶

ValidationFailure(validator_name: str, actual_value: Any, expected: str | None = None, error_msg: str | None = None)

Details about a specific validation failure.

Source code in fastvideo/pipelines/stages/validators.py

def __init__(self,
             validator_name: str,
             actual_value: Any,
             expected: str | None = None,
             error_msg: str | None = None):
    self.validator_name = validator_name
    self.actual_value = actual_value
    self.expected = expected
    self.error_msg = error_msg

fastvideo.pipelines.stages.validators.VerificationResult ¶

VerificationResult()

Wrapper class for stage verification results.

Source code in fastvideo/pipelines/stages/validators.py

def __init__(self) -> None:
    """Start with no recorded checks and no recorded failures."""
    # Failure details per field; only fields whose check failed get an entry.
    self._failures: dict[str, list[ValidationFailure]] = dict()
    # Pass/fail outcome per field name.
    self._checks: dict[str, bool] = dict()

Functions¶

fastvideo.pipelines.stages.validators.VerificationResult.add_check ¶

add_check(field_name: str, value: Any, validators: Callable[[Any], bool] | list[Callable[[Any], bool]]) -> VerificationResult

Add a validation check for a field.

Parameters:

Name	Type	Description	Default
`field_name`	`str`	Name of the field being checked	required
`value`	`Any`	The actual value to validate	required
`validators`	`Callable[[Any], bool] \| list[Callable[[Any], bool]]`	Single validation function or list of validation functions. Each function will be called with the value as its first argument.	required

Returns:

Type	Description
`VerificationResult`	Self for method chaining

Examples:

Single validator¶

result.add_check("tensor", my_tensor, V.is_tensor)

Multiple validators (all must pass)¶

result.add_check("latents", batch.latents, [V.is_tensor, V.with_dims(5)])

Using partial functions for parameters¶

result.add_check("height", batch.height, [V.not_none, V.divisible(8)])

Source code in fastvideo/pipelines/stages/validators.py

def add_check(
    self, field_name: str, value: Any,
    validators: Callable[[Any], bool] | list[Callable[[Any], bool]]
) -> 'VerificationResult':
    """
    Add a validation check for a field.

    Args:
        field_name: Name of the field being checked
        value: The actual value to validate
        validators: Single validation function or list of validation functions.
                   Each function will be called with the value as its first argument.

    Returns:
        Self for method chaining

    Examples:
        # Single validator
        result.add_check("tensor", my_tensor, V.is_tensor)

        # Multiple validators (all must pass)
        result.add_check("latents", batch.latents, [V.is_tensor, V.with_dims(5)])

        # Using partial functions for parameters
        result.add_check("height", batch.height, [V.not_none, V.divisible(8)])
    """
    if not isinstance(validators, list):
        validators = [validators]

    failures = []
    all_passed = True

    # Apply all validators and collect detailed failure info
    for validator in validators:
        try:
            passed = validator(value)
            if not passed:
                all_passed = False
                failure = self._create_validation_failure(validator, value)
                failures.append(failure)
        except Exception as e:
            # If any validator raises an exception, consider the check failed
            all_passed = False
            validator_name = getattr(validator, '__name__', str(validator))
            failure = ValidationFailure(
                validator_name=validator_name,
                actual_value=value,
                error_msg=f"Exception during validation: {str(e)}")
            failures.append(failure)

    self._checks[field_name] = all_passed
    if not all_passed:
        self._failures[field_name] = failures

    return self

fastvideo.pipelines.stages.validators.VerificationResult.get_detailed_failures ¶

get_detailed_failures() -> dict[str, list[ValidationFailure]]

Get detailed failure information for each failed field.

Source code in fastvideo/pipelines/stages/validators.py

def get_detailed_failures(self) -> dict[str, list[ValidationFailure]]:
    """Get detailed failure information for each failed field."""
    return self._failures.copy()

fastvideo.pipelines.stages.validators.VerificationResult.get_failed_fields ¶

get_failed_fields() -> list[str]

Get list of fields that failed validation.

Source code in fastvideo/pipelines/stages/validators.py

def get_failed_fields(self) -> list[str]:
    """Collect the names of all fields whose check did not pass.

    Returns:
        Field names with a falsy recorded outcome, in the order the
        fields are stored in ``_checks``.
    """
    failed: list[str] = []
    for name, outcome in self._checks.items():
        if not outcome:
            failed.append(name)
    return failed

fastvideo.pipelines.stages.validators.VerificationResult.get_failure_summary ¶

get_failure_summary() -> str

Get a comprehensive summary of all validation failures.

Source code in fastvideo/pipelines/stages/validators.py

def get_failure_summary(self) -> str:
    """Render every recorded failure as one multi-line string.

    Returns:
        ``"All validations passed"`` when ``is_valid()`` is true.
        Otherwise a ``"Validation failures:"`` header followed by one
        section per failed field, each listing its failures numbered
        from 1.
    """
    if self.is_valid():
        return "All validations passed"

    sections = []
    for field_name, failures in self._failures.items():
        # Each failure is formatted through its own string conversion.
        numbered = "".join(f"\n    {i}. {failure}"
                           for i, failure in enumerate(failures, 1))
        sections.append(f"\n  Field '{field_name}':" + numbered)

    return "Validation failures:" + "".join(sections)

fastvideo.pipelines.stages.validators.VerificationResult.is_valid ¶

is_valid() -> bool

Check if all validations passed.

Source code in fastvideo/pipelines/stages/validators.py

def is_valid(self) -> bool:
    """Report whether every recorded check passed.

    Returns:
        False as soon as any recorded outcome is falsy; True otherwise,
        including when no checks have been recorded.
    """
    for outcome in self._checks.values():
        if not outcome:
            return False
    return True

fastvideo.pipelines.stages.validators.VerificationResult.to_dict ¶

to_dict() -> dict

Convert to dictionary for backward compatibility.

Source code in fastvideo/pipelines/stages/validators.py

def to_dict(self) -> dict:
    """Export the per-field pass/fail outcomes as a plain dictionary.

    Kept for backward compatibility.

    Returns:
        A new dict mapping each field name to its recorded outcome;
        changing it does not affect this result object.
    """
    snapshot = dict(self._checks)
    return snapshot

fastvideo.pipelines.training ¶

Training pipelines for fastvideo.

This package contains pipelines for training diffusion models.

pipelines ¶

Classes¶

fastvideo.pipelines.ComposedPipelineBase ¶

Attributes¶

fastvideo.pipelines.ComposedPipelineBase.required_config_modules property ¶

fastvideo.pipelines.ComposedPipelineBase.stages property ¶

Functions¶

fastvideo.pipelines.ComposedPipelineBase.create_pipeline_stages abstractmethod ¶

fastvideo.pipelines.ComposedPipelineBase.create_training_stages ¶

fastvideo.pipelines.ComposedPipelineBase.forward ¶

fastvideo.pipelines.ComposedPipelineBase.from_pretrained classmethod ¶

fastvideo.pipelines.ComposedPipelineBase.initialize_pipeline ¶

fastvideo.pipelines.ComposedPipelineBase.load_modules ¶

fastvideo.pipelines.ForwardBatch dataclass ¶

Functions¶

fastvideo.pipelines.ForwardBatch.__post_init__ ¶

fastvideo.pipelines.LoRAPipeline ¶

Functions¶

fastvideo.pipelines.LoRAPipeline.convert_to_lora_layers ¶

fastvideo.pipelines.LoRAPipeline.set_lora_adapter ¶

fastvideo.pipelines.PipelineWithLoRA ¶

Functions¶

fastvideo.pipelines.build_pipeline ¶

Modules¶

fastvideo.pipelines.basic ¶

Modules¶

fastvideo.pipelines.basic.cosmos ¶

Modules¶

fastvideo.pipelines.basic.hunyuan ¶

Modules¶

fastvideo.pipelines.basic.hunyuan15 ¶

Modules¶

fastvideo.pipelines.basic.longcat ¶

Classes¶

Modules¶

fastvideo.pipelines.basic.matrixgame ¶

Modules¶

fastvideo.pipelines.basic.stepvideo ¶

Modules¶

fastvideo.pipelines.basic.wan ¶

Modules¶

fastvideo.pipelines.composed_pipeline_base ¶

Classes¶

fastvideo.pipelines.composed_pipeline_base.ComposedPipelineBase ¶

Attributes¶

Functions¶

Functions¶

fastvideo.pipelines.lora_pipeline ¶

Classes¶

fastvideo.pipelines.lora_pipeline.LoRAPipeline ¶

Functions¶

Functions¶

fastvideo.pipelines.pipeline_batch_info ¶

Classes¶

fastvideo.pipelines.pipeline_batch_info.ForwardBatch dataclass ¶

Functions¶

fastvideo.pipelines.pipeline_batch_info.PipelineLoggingInfo ¶

Functions¶

fastvideo.pipelines.pipeline_registry ¶

Classes¶

fastvideo.pipelines.pipeline_registry.PipelineType ¶

Functions¶

Functions¶

fastvideo.pipelines.pipeline_registry.get_pipeline_registry ¶

fastvideo.pipelines.pipeline_registry.import_pipeline_classes cached ¶

fastvideo.pipelines.preprocess ¶

Modules¶

fastvideo.pipelines.preprocess.preprocess_pipeline_base ¶

Classes¶

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_i2v ¶

Classes¶

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_ode_trajectory ¶

Classes¶

Functions¶

fastvideo.pipelines.preprocess.preprocess_pipeline_t2v ¶

Classes¶

fastvideo.pipelines.preprocess.preprocess_pipeline_text ¶

Classes¶

fastvideo.pipelines.ComposedPipelineBase.required_config_modules `property` ¶

fastvideo.pipelines.ComposedPipelineBase.stages `property` ¶

fastvideo.pipelines.ComposedPipelineBase.create_pipeline_stages `abstractmethod` ¶

fastvideo.pipelines.ComposedPipelineBase.from_pretrained `classmethod` ¶

fastvideo.pipelines.ForwardBatch `dataclass` ¶

fastvideo.pipelines.pipeline_batch_info.ForwardBatch `dataclass` ¶

fastvideo.pipelines.pipeline_registry.import_pipeline_classes `cached` ¶