ssim ¶

Modules¶

fastvideo.tests.ssim.test_causal_similarity ¶

Classes¶

Functions¶

fastvideo.tests.ssim.test_causal_similarity.test_causal_similarity ¶

test_causal_similarity(prompt, ATTENTION_BACKEND, model_id)

Test that runs inference with different parameters and compares the output to reference videos using SSIM.

Source code in fastvideo/tests/ssim/test_causal_similarity.py

@pytest.mark.parametrize("prompt", TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN"])
@pytest.mark.parametrize("model_id", list(MODEL_TO_PARAMS.keys()))
def test_causal_similarity(prompt, ATTENTION_BACKEND, model_id):
    """
    Run inference for one (prompt, backend, model) combination and compare the
    generated video to its reference video using SSIM.

    Args:
        prompt: Text prompt. Its first 100 characters (stripped) form the
            expected name of the generated file and are used to look up the
            reference video.
        ATTENTION_BACKEND: Value exported as FASTVIDEO_ATTENTION_BACKEND while
            the generator is alive.
        model_id: Key into MODEL_TO_PARAMS selecting the parameter set.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If the generated video file is missing or the mean
            SSIM is below 0.98.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    base_output_dir = os.path.join(script_dir, 'generated_videos', model_id)
    output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND)
    prompt_key = prompt[:100].strip()
    generated_video_path = os.path.join(output_dir, f"{prompt_key}.mp4")

    os.makedirs(output_dir, exist_ok=True)

    BASE_PARAMS = MODEL_TO_PARAMS[model_id]
    num_inference_steps = BASE_PARAMS["num_inference_steps"]

    init_kwargs = {
        "num_gpus": BASE_PARAMS["num_gpus"],
        "sp_size": BASE_PARAMS["sp_size"],
        "tp_size": BASE_PARAMS["tp_size"],
        "dit_cpu_offload": True,
    }
    if BASE_PARAMS.get("vae_sp"):
        init_kwargs["vae_sp"] = True
        init_kwargs["vae_tiling"] = True
    # NOTE(review): forwarding BASE_PARAMS["text-encoder-precision"] as
    # "text_encoder_precisions" was disabled in this test; confirm whether
    # that is still intended.

    generation_kwargs = {
        "num_inference_steps": num_inference_steps,
        "output_path": output_dir,
        "height": BASE_PARAMS["height"],
        "width": BASE_PARAMS["width"],
        "num_frames": BASE_PARAMS["num_frames"],
        "seed": BASE_PARAMS["seed"],
    }
    if "neg_prompt" in BASE_PARAMS:
        generation_kwargs["neg_prompt"] = BASE_PARAMS["neg_prompt"]

    # Scope the backend override to the generation step so it does not leak
    # into tests that run later in the same process.
    env_key = "FASTVIDEO_ATTENTION_BACKEND"
    previous_backend = os.environ.get(env_key)
    os.environ[env_key] = ATTENTION_BACKEND
    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=BASE_PARAMS["model_path"], **init_kwargs)
        generator.generate_video(prompt, **generation_kwargs)
    finally:
        try:
            # Shut the executor down even when generation raised.
            if generator is not None and isinstance(generator.executor,
                                                    MultiprocExecutor):
                generator.executor.shutdown()
        finally:
            if previous_backend is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = previous_backend

    # Check the video file itself: output_dir always exists because it was
    # created above, so asserting on the directory could never fail.
    assert os.path.exists(generated_video_path), (
        f"Output video was not generated at {generated_video_path}")

    reference_folder = os.path.join(script_dir, device_reference_folder,
                                    model_id, ATTENTION_BACKEND)

    if not os.path.exists(reference_folder):
        logger.error("Reference folder missing")
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}")

    # Find the matching reference video based on the prompt
    reference_video_name = None

    for filename in os.listdir(reference_folder):
        if filename.endswith('.mp4') and prompt_key in filename:
            reference_video_name = filename
            break

    if not reference_video_name:
        logger.error(f"Reference video not found for prompt: {prompt} with backend: {ATTENTION_BACKEND}")
        raise FileNotFoundError("Reference video missing")

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(
        f"Computing SSIM between {reference_video_path} and {generated_video_path}"
    )
    ssim_values = compute_video_ssim_torchvision(reference_video_path,
                                                 generated_video_path,
                                                 use_ms_ssim=True)

    # The first returned element is used as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")
    logger.info(f"Writing SSIM results to directory: {output_dir}")

    success = write_ssim_results(output_dir, ssim_values, reference_video_path,
                                 generated_video_path, num_inference_steps,
                                 prompt)

    # Failing to persist the results is logged but does not fail the test.
    if not success:
        logger.error("Failed to write SSIM results to file")

    min_acceptable_ssim = 0.98
    assert mean_ssim >= min_acceptable_ssim, f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} for {model_id} with backend {ATTENTION_BACKEND}"

fastvideo.tests.ssim.test_inference_similarity ¶

Classes¶

Functions¶

fastvideo.tests.ssim.test_inference_similarity.test_i2v_inference_similarity ¶

test_i2v_inference_similarity(prompt, ATTENTION_BACKEND, model_id)

Test that runs inference with different parameters and compares the output to reference videos using SSIM.

Source code in fastvideo/tests/ssim/test_inference_similarity.py

@pytest.mark.parametrize("prompt", I2V_TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN"])
@pytest.mark.parametrize("model_id", list(I2V_MODEL_TO_PARAMS.keys()))
def test_i2v_inference_similarity(prompt, ATTENTION_BACKEND, model_id):
    """
    Run image-to-video inference for one (prompt, backend, model) combination
    and compare the generated video to its reference video using SSIM.

    Args:
        prompt: Text prompt. Its position in I2V_TEST_PROMPTS selects the
            conditioning image from I2V_IMAGE_PATHS, and its first 100
            characters (stripped) form the expected output file name.
        ATTENTION_BACKEND: Backend name passed to _attention_backend for the
            duration of generation.
        model_id: Key into I2V_MODEL_TO_PARAMS selecting the parameter set.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If prompts and images are not paired one-to-one, the
            generated video file is missing, or the mean SSIM is below 0.97.
    """
    assert len(I2V_TEST_PROMPTS) == len(I2V_IMAGE_PATHS), "Expect number of prompts equal to number of images"

    script_dir = os.path.dirname(os.path.abspath(__file__))

    base_output_dir = os.path.join(script_dir, 'generated_videos', model_id)
    output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND)
    prompt_key = prompt[:100].strip()
    generated_video_path = os.path.join(output_dir, f"{prompt_key}.mp4")

    BASE_PARAMS = I2V_MODEL_TO_PARAMS[model_id]
    num_inference_steps = BASE_PARAMS["num_inference_steps"]
    image_path = I2V_IMAGE_PATHS[I2V_TEST_PROMPTS.index(prompt)]

    with _attention_backend(ATTENTION_BACKEND):
        os.makedirs(output_dir, exist_ok=True)

        init_kwargs = {
            "num_gpus": BASE_PARAMS["num_gpus"],
            "flow_shift": BASE_PARAMS["flow_shift"],
            "sp_size": BASE_PARAMS["sp_size"],
            "tp_size": BASE_PARAMS["tp_size"],
        }
        if BASE_PARAMS.get("vae_sp"):
            init_kwargs["vae_sp"] = True
            init_kwargs["vae_tiling"] = True
        if "text-encoder-precision" in BASE_PARAMS:
            init_kwargs["text_encoder_precisions"] = BASE_PARAMS[
                "text-encoder-precision"]

        generation_kwargs = {
            "num_inference_steps": num_inference_steps,
            "output_path": output_dir,
            "image_path": image_path,
            "height": BASE_PARAMS["height"],
            "width": BASE_PARAMS["width"],
            "num_frames": BASE_PARAMS["num_frames"],
            "guidance_scale": BASE_PARAMS["guidance_scale"],
            "embedded_cfg_scale": BASE_PARAMS["embedded_cfg_scale"],
            "seed": BASE_PARAMS["seed"],
            "fps": BASE_PARAMS["fps"],
        }
        if "neg_prompt" in BASE_PARAMS:
            generation_kwargs["neg_prompt"] = BASE_PARAMS["neg_prompt"]

        generator: VideoGenerator | None = None
        try:
            generator = VideoGenerator.from_pretrained(
                model_path=BASE_PARAMS["model_path"], **init_kwargs)
            generator.generate_video(prompt, **generation_kwargs)
        finally:
            # Runs even when generation raised; generator may still be None.
            _shutdown_executor(generator)

    # Check the video file itself: output_dir always exists because it was
    # created above, so asserting on the directory could never fail.
    assert os.path.exists(generated_video_path), (
        f"Output video was not generated at {generated_video_path}")

    reference_folder = os.path.join(script_dir, device_reference_folder,
                                    model_id, ATTENTION_BACKEND)

    if not os.path.exists(reference_folder):
        logger.error("Reference folder missing")
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}")

    # Find the matching reference video based on the prompt
    reference_video_name = None

    for filename in os.listdir(reference_folder):
        if filename.endswith('.mp4') and prompt_key in filename:
            reference_video_name = filename
            break

    if not reference_video_name:
        logger.error(f"Reference video not found for prompt: {prompt} with backend: {ATTENTION_BACKEND}")
        raise FileNotFoundError("Reference video missing")

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(
        f"Computing SSIM between {reference_video_path} and {generated_video_path}"
    )
    ssim_values = compute_video_ssim_torchvision(reference_video_path,
                                                 generated_video_path,
                                                 use_ms_ssim=True)

    # The first returned element is used as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")
    logger.info(f"Writing SSIM results to directory: {output_dir}")

    success = write_ssim_results(output_dir, ssim_values, reference_video_path,
                                 generated_video_path, num_inference_steps,
                                 prompt)

    # Failing to persist the results is logged but does not fail the test.
    if not success:
        logger.error("Failed to write SSIM results to file")

    min_acceptable_ssim = 0.97
    assert mean_ssim >= min_acceptable_ssim, f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} for {model_id} with backend {ATTENTION_BACKEND}"

fastvideo.tests.ssim.test_inference_similarity.test_inference_similarity ¶

test_inference_similarity(prompt, ATTENTION_BACKEND, model_id)

Test that runs inference with different parameters and compares the output to reference videos using SSIM.

Source code in fastvideo/tests/ssim/test_inference_similarity.py

@pytest.mark.parametrize("prompt", TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN", "TORCH_SDPA"])
@pytest.mark.parametrize("model_id", list(MODEL_TO_PARAMS.keys()))
def test_inference_similarity(prompt, ATTENTION_BACKEND, model_id):
    """
    Run inference for one (prompt, backend, model) combination and compare the
    generated video to its reference video using SSIM.

    Args:
        prompt: Text prompt. Its first 100 characters (stripped) form the
            expected name of the generated file and are used to look up the
            reference video.
        ATTENTION_BACKEND: Backend name passed to _attention_backend for the
            duration of generation.
        model_id: Key into MODEL_TO_PARAMS selecting the parameter set.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If the generated video file is missing or the mean
            SSIM is below 0.93.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    base_output_dir = os.path.join(script_dir, 'generated_videos', model_id)
    output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND)
    prompt_key = prompt[:100].strip()
    generated_video_path = os.path.join(output_dir, f"{prompt_key}.mp4")

    BASE_PARAMS = MODEL_TO_PARAMS[model_id]
    num_inference_steps = BASE_PARAMS["num_inference_steps"]

    with _attention_backend(ATTENTION_BACKEND):
        os.makedirs(output_dir, exist_ok=True)

        init_kwargs = {
            "num_gpus": BASE_PARAMS["num_gpus"],
            "sp_size": BASE_PARAMS["sp_size"],
            "tp_size": BASE_PARAMS["tp_size"],
            "use_fsdp_inference": True,
            "dit_cpu_offload": False,
            "dit_layerwise_offload": False,
        }
        if "flow_shift" in BASE_PARAMS:
            init_kwargs["flow_shift"] = BASE_PARAMS["flow_shift"]
        if BASE_PARAMS.get("vae_sp"):
            init_kwargs["vae_sp"] = True
            init_kwargs["vae_tiling"] = True
        if "text-encoder-precision" in BASE_PARAMS:
            init_kwargs["text_encoder_precisions"] = BASE_PARAMS[
                "text-encoder-precision"]
        # LTX2-specific VAE tiling parameters; each falls back to the default
        # given here when the parameter set does not provide it.
        if BASE_PARAMS.get("ltx2_vae_tiling"):
            init_kwargs["ltx2_vae_tiling"] = True
            init_kwargs["ltx2_vae_spatial_tile_size_in_pixels"] = BASE_PARAMS.get(
                "ltx2_vae_spatial_tile_size_in_pixels", 512)
            init_kwargs["ltx2_vae_spatial_tile_overlap_in_pixels"] = BASE_PARAMS.get(
                "ltx2_vae_spatial_tile_overlap_in_pixels", 64)
            init_kwargs["ltx2_vae_temporal_tile_size_in_frames"] = BASE_PARAMS.get(
                "ltx2_vae_temporal_tile_size_in_frames", 64)
            init_kwargs[
                "ltx2_vae_temporal_tile_overlap_in_frames"] = BASE_PARAMS.get(
                    "ltx2_vae_temporal_tile_overlap_in_frames", 24)

        generation_kwargs = {
            "num_inference_steps": num_inference_steps,
            "output_path": output_dir,
            "height": BASE_PARAMS["height"],
            "width": BASE_PARAMS["width"],
            "num_frames": BASE_PARAMS["num_frames"],
            "guidance_scale": BASE_PARAMS["guidance_scale"],
            "embedded_cfg_scale": BASE_PARAMS["embedded_cfg_scale"],
            "seed": BASE_PARAMS["seed"],
            "fps": BASE_PARAMS["fps"],
        }
        if "neg_prompt" in BASE_PARAMS:
            generation_kwargs["neg_prompt"] = BASE_PARAMS["neg_prompt"]

        generator: VideoGenerator | None = None
        try:
            generator = VideoGenerator.from_pretrained(
                model_path=BASE_PARAMS["model_path"], **init_kwargs)
            generator.generate_video(prompt, **generation_kwargs)
        finally:
            # Runs even when generation raised; generator may still be None.
            _shutdown_executor(generator)

    # Check the video file itself: output_dir always exists because it was
    # created above, so asserting on the directory could never fail.
    assert os.path.exists(generated_video_path), (
        f"Output video was not generated at {generated_video_path}")

    reference_folder = os.path.join(script_dir, device_reference_folder,
                                    model_id, ATTENTION_BACKEND)

    if not os.path.exists(reference_folder):
        logger.error("Reference folder missing")
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}")

    # Find the matching reference video based on the prompt
    reference_video_name = None

    for filename in os.listdir(reference_folder):
        if filename.endswith('.mp4') and prompt_key in filename:
            reference_video_name = filename
            break

    if not reference_video_name:
        logger.error(f"Reference video not found for prompt: {prompt} with backend: {ATTENTION_BACKEND}")
        raise FileNotFoundError("Reference video missing")

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(
        f"Computing SSIM between {reference_video_path} and {generated_video_path}"
    )
    ssim_values = compute_video_ssim_torchvision(reference_video_path,
                                                 generated_video_path,
                                                 use_ms_ssim=True)

    # The first returned element is used as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")
    logger.info(f"Writing SSIM results to directory: {output_dir}")

    success = write_ssim_results(output_dir, ssim_values, reference_video_path,
                                 generated_video_path, num_inference_steps,
                                 prompt)

    # Failing to persist the results is logged but does not fail the test.
    if not success:
        logger.error("Failed to write SSIM results to file")

    min_acceptable_ssim = 0.93
    assert mean_ssim >= min_acceptable_ssim, f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} for {model_id} with backend {ATTENTION_BACKEND}"

fastvideo.tests.ssim.test_longcat_similarity ¶

SSIM-based similarity tests for LongCat video generation.

Tests three LongCat modes:

- T2V (Text-to-Video): 480p video from text prompt
- I2V (Image-to-Video): 480p video from image + text prompt
- VC (Video Continuation): 480p video continuation from input video + text prompt

Sampling parameters are derived from:

- examples/inference/basic/basic_longcat_t2v.py
- examples/inference/basic/basic_longcat_i2v.py
- examples/inference/basic/basic_longcat_vc.py

Note: num_inference_steps is reduced for CI speed (4 steps vs 50 in examples).

Classes¶

Functions¶

fastvideo.tests.ssim.test_longcat_similarity.test_longcat_i2v_similarity ¶

test_longcat_i2v_similarity(prompt: str, ATTENTION_BACKEND: str)

Test LongCat I2V inference and compare output to reference videos using SSIM.

Parameters derived from examples/inference/basic/basic_longcat_i2v.py

Source code in fastvideo/tests/ssim/test_longcat_similarity.py

@pytest.mark.parametrize("prompt", I2V_TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN"])
def test_longcat_i2v_similarity(prompt: str, ATTENTION_BACKEND: str):
    """
    Test LongCat I2V inference and compare output to reference videos using SSIM.

    Parameters derived from examples/inference/basic/basic_longcat_i2v.py

    Args:
        prompt: Text prompt. Its position in I2V_TEST_PROMPTS selects the
            conditioning image, and its first 100 characters (stripped) form
            the expected output file name.
        ATTENTION_BACKEND: Value exported as FASTVIDEO_ATTENTION_BACKEND while
            the generator is alive.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If the generated video file is missing or the mean
            SSIM is below 0.90.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    model_id = "LongCat-Video-I2V"

    output_dir = os.path.join(script_dir, "generated_videos", model_id, ATTENTION_BACKEND)
    prompt_key = prompt[:100].strip()
    generated_video_path = os.path.join(output_dir, f"{prompt_key}.mp4")
    os.makedirs(output_dir, exist_ok=True)

    # Get image path for this prompt
    prompt_idx = I2V_TEST_PROMPTS.index(prompt)
    image_path = _resolve_asset_path(I2V_IMAGE_PATHS[prompt_idx])

    init_kwargs = {
        "num_gpus": LONGCAT_I2V_PARAMS["num_gpus"],
        "use_fsdp_inference": True,
        "dit_cpu_offload": True,
        "vae_cpu_offload": True,
        "text_encoder_cpu_offload": True,
        "enable_bsa": False,
    }

    generation_kwargs = {
        "output_path": output_dir,
        "image_path": image_path,
        "height": LONGCAT_I2V_PARAMS["height"],
        "width": LONGCAT_I2V_PARAMS["width"],
        "num_frames": LONGCAT_I2V_PARAMS["num_frames"],
        "num_inference_steps": LONGCAT_I2V_PARAMS["num_inference_steps"],
        "guidance_scale": LONGCAT_I2V_PARAMS["guidance_scale"],
        "fps": LONGCAT_I2V_PARAMS["fps"],
        "seed": LONGCAT_I2V_PARAMS["seed"],
        "negative_prompt": LONGCAT_I2V_PARAMS["negative_prompt"],
    }

    # Scope the backend override to the generation step so it does not leak
    # into tests that run later in the same process.
    env_key = "FASTVIDEO_ATTENTION_BACKEND"
    previous_backend = os.environ.get(env_key)
    os.environ[env_key] = ATTENTION_BACKEND
    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=LONGCAT_I2V_PARAMS["model_path"], **init_kwargs
        )
        generator.generate_video(prompt, **generation_kwargs)
    finally:
        try:
            # Shut the generator down even when generation raised.
            if generator is not None:
                generator.shutdown()
        finally:
            if previous_backend is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = previous_backend

    assert os.path.exists(generated_video_path), (
        f"Output video was not generated at {generated_video_path}"
    )

    # Find reference video
    reference_folder = os.path.join(
        script_dir, device_reference_folder, model_id, ATTENTION_BACKEND
    )
    if not os.path.exists(reference_folder):
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}"
        )

    reference_video_name = None
    for filename in os.listdir(reference_folder):
        if filename.endswith(".mp4") and prompt_key in filename:
            reference_video_name = filename
            break

    if not reference_video_name:
        raise FileNotFoundError(
            f"Reference video not found for prompt: {prompt[:50]}... with backend: {ATTENTION_BACKEND}"
        )

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(f"Computing SSIM between {reference_video_path} and {generated_video_path}")
    ssim_values = compute_video_ssim_torchvision(
        reference_video_path, generated_video_path, use_ms_ssim=True
    )

    # The first returned element is used as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")

    write_ssim_results(
        output_dir, ssim_values, reference_video_path, generated_video_path,
        LONGCAT_I2V_PARAMS["num_inference_steps"], prompt
    )

    min_acceptable_ssim = 0.90
    assert mean_ssim >= min_acceptable_ssim, (
        f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} "
        f"for {model_id} with backend {ATTENTION_BACKEND}"
    )

fastvideo.tests.ssim.test_longcat_similarity.test_longcat_t2v_similarity ¶

test_longcat_t2v_similarity(prompt: str, ATTENTION_BACKEND: str)

Test LongCat T2V inference and compare output to reference videos using SSIM.

Parameters derived from examples/inference/basic/basic_longcat_t2v.py

Source code in fastvideo/tests/ssim/test_longcat_similarity.py

@pytest.mark.parametrize("prompt", T2V_TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN"])
def test_longcat_t2v_similarity(prompt: str, ATTENTION_BACKEND: str):
    """
    Test LongCat T2V inference and compare output to reference videos using SSIM.

    Parameters derived from examples/inference/basic/basic_longcat_t2v.py

    Args:
        prompt: Text prompt. Its first 100 characters (stripped) form the
            expected output file name and are used to look up the reference
            video.
        ATTENTION_BACKEND: Value exported as FASTVIDEO_ATTENTION_BACKEND while
            the generator is alive.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If the generated video file is missing or the mean
            SSIM is below 0.90.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    model_id = "LongCat-Video-T2V"

    output_dir = os.path.join(script_dir, "generated_videos", model_id, ATTENTION_BACKEND)
    prompt_key = prompt[:100].strip()
    generated_video_path = os.path.join(output_dir, f"{prompt_key}.mp4")
    os.makedirs(output_dir, exist_ok=True)

    init_kwargs = {
        "num_gpus": LONGCAT_T2V_PARAMS["num_gpus"],
        "use_fsdp_inference": True,
        "dit_cpu_offload": True,
        "vae_cpu_offload": True,
        "text_encoder_cpu_offload": True,
        "enable_bsa": False,
    }

    generation_kwargs = {
        "output_path": output_dir,
        "height": LONGCAT_T2V_PARAMS["height"],
        "width": LONGCAT_T2V_PARAMS["width"],
        "num_frames": LONGCAT_T2V_PARAMS["num_frames"],
        "num_inference_steps": LONGCAT_T2V_PARAMS["num_inference_steps"],
        "guidance_scale": LONGCAT_T2V_PARAMS["guidance_scale"],
        "fps": LONGCAT_T2V_PARAMS["fps"],
        "seed": LONGCAT_T2V_PARAMS["seed"],
        "negative_prompt": LONGCAT_T2V_PARAMS["negative_prompt"],
    }

    # Scope the backend override to the generation step so it does not leak
    # into tests that run later in the same process.
    env_key = "FASTVIDEO_ATTENTION_BACKEND"
    previous_backend = os.environ.get(env_key)
    os.environ[env_key] = ATTENTION_BACKEND
    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=LONGCAT_T2V_PARAMS["model_path"], **init_kwargs
        )
        generator.generate_video(prompt, **generation_kwargs)
    finally:
        try:
            # Shut the generator down even when generation raised.
            if generator is not None:
                generator.shutdown()
        finally:
            if previous_backend is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = previous_backend

    assert os.path.exists(generated_video_path), (
        f"Output video was not generated at {generated_video_path}"
    )

    # Find reference video
    reference_folder = os.path.join(
        script_dir, device_reference_folder, model_id, ATTENTION_BACKEND
    )
    if not os.path.exists(reference_folder):
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}"
        )

    reference_video_name = None
    for filename in os.listdir(reference_folder):
        if filename.endswith(".mp4") and prompt_key in filename:
            reference_video_name = filename
            break

    if not reference_video_name:
        raise FileNotFoundError(
            f"Reference video not found for prompt: {prompt[:50]}... with backend: {ATTENTION_BACKEND}"
        )

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(f"Computing SSIM between {reference_video_path} and {generated_video_path}")
    ssim_values = compute_video_ssim_torchvision(
        reference_video_path, generated_video_path, use_ms_ssim=True
    )

    # The first returned element is used as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")

    write_ssim_results(
        output_dir, ssim_values, reference_video_path, generated_video_path,
        LONGCAT_T2V_PARAMS["num_inference_steps"], prompt
    )

    min_acceptable_ssim = 0.90
    assert mean_ssim >= min_acceptable_ssim, (
        f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} "
        f"for {model_id} with backend {ATTENTION_BACKEND}"
    )

fastvideo.tests.ssim.test_longcat_similarity.test_longcat_vc_similarity ¶

test_longcat_vc_similarity(prompt: str, ATTENTION_BACKEND: str)

Test LongCat VC (Video Continuation) inference and compare output to reference videos using SSIM.

Parameters derived from examples/inference/basic/basic_longcat_vc.py

Source code in fastvideo/tests/ssim/test_longcat_similarity.py

@pytest.mark.parametrize("prompt", VC_TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN"])
def test_longcat_vc_similarity(prompt: str, ATTENTION_BACKEND: str):
    """
    Test LongCat VC (Video Continuation) inference and compare output to reference videos using SSIM.

    Parameters derived from examples/inference/basic/basic_longcat_vc.py

    The test is skipped when the input video for the prompt is not on disk.

    Args:
        prompt: Text prompt. Its position in VC_TEST_PROMPTS selects the input
            video, and its first 100 characters (stripped) form the expected
            output file name.
        ATTENTION_BACKEND: Value exported as FASTVIDEO_ATTENTION_BACKEND while
            the generator is alive.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If the generated video file is missing or the mean
            SSIM is below 0.90.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    model_id = "LongCat-Video-VC"

    output_dir = os.path.join(script_dir, "generated_videos", model_id, ATTENTION_BACKEND)
    prompt_key = prompt[:100].strip()
    generated_video_path = os.path.join(output_dir, f"{prompt_key}.mp4")
    os.makedirs(output_dir, exist_ok=True)

    # Get video path for this prompt
    prompt_idx = VC_TEST_PROMPTS.index(prompt)
    video_path = _resolve_asset_path(VC_VIDEO_PATHS[prompt_idx])

    if not os.path.exists(video_path):
        pytest.skip(f"Input video not found at {video_path}")

    init_kwargs = {
        "num_gpus": LONGCAT_VC_PARAMS["num_gpus"],
        "use_fsdp_inference": False,
        "dit_cpu_offload": False,
        "vae_cpu_offload": True,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": False,
        "enable_bsa": False,
    }

    generation_kwargs = {
        "output_path": output_dir,
        "video_path": video_path,
        "num_cond_frames": LONGCAT_VC_PARAMS["num_cond_frames"],
        "height": LONGCAT_VC_PARAMS["height"],
        "width": LONGCAT_VC_PARAMS["width"],
        "num_frames": LONGCAT_VC_PARAMS["num_frames"],
        "num_inference_steps": LONGCAT_VC_PARAMS["num_inference_steps"],
        "guidance_scale": LONGCAT_VC_PARAMS["guidance_scale"],
        "fps": LONGCAT_VC_PARAMS["fps"],
        "seed": LONGCAT_VC_PARAMS["seed"],
        "negative_prompt": LONGCAT_VC_PARAMS["negative_prompt"],
    }

    # Scope the backend override to the generation step so it does not leak
    # into tests that run later in the same process. It is set only after the
    # skip check, so a skipped test leaves the environment untouched.
    env_key = "FASTVIDEO_ATTENTION_BACKEND"
    previous_backend = os.environ.get(env_key)
    os.environ[env_key] = ATTENTION_BACKEND
    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=LONGCAT_VC_PARAMS["model_path"], **init_kwargs
        )
        generator.generate_video(prompt, **generation_kwargs)
    finally:
        try:
            # Shut the generator down even when generation raised.
            if generator is not None:
                generator.shutdown()
        finally:
            if previous_backend is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = previous_backend

    assert os.path.exists(generated_video_path), (
        f"Output video was not generated at {generated_video_path}"
    )

    # Find reference video
    reference_folder = os.path.join(
        script_dir, device_reference_folder, model_id, ATTENTION_BACKEND
    )
    if not os.path.exists(reference_folder):
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}"
        )

    reference_video_name = None
    for filename in os.listdir(reference_folder):
        if filename.endswith(".mp4") and prompt_key in filename:
            reference_video_name = filename
            break

    if not reference_video_name:
        raise FileNotFoundError(
            f"Reference video not found for prompt: {prompt[:50]}... with backend: {ATTENTION_BACKEND}"
        )

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(f"Computing SSIM between {reference_video_path} and {generated_video_path}")
    ssim_values = compute_video_ssim_torchvision(
        reference_video_path, generated_video_path, use_ms_ssim=True
    )

    # The first returned element is used as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")

    write_ssim_results(
        output_dir, ssim_values, reference_video_path, generated_video_path,
        LONGCAT_VC_PARAMS["num_inference_steps"], prompt
    )

    min_acceptable_ssim = 0.90
    assert mean_ssim >= min_acceptable_ssim, (
        f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} "
        f"for {model_id} with backend {ATTENTION_BACKEND}"
    )

fastvideo.tests.ssim.test_matrixgame_similarity ¶

Classes¶

Functions¶

fastvideo.tests.ssim.test_matrixgame_similarity.test_matrixgame_similarity ¶

test_matrixgame_similarity(prompt, ATTENTION_BACKEND, model_id)

Test that runs inference with different parameters and compares the output to reference videos using SSIM.

Source code in fastvideo/tests/ssim/test_matrixgame_similarity.py

@pytest.mark.parametrize("prompt", TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN"])
@pytest.mark.parametrize("model_id", list(MODEL_TO_PARAMS.keys()))
def test_matrixgame_similarity(prompt, ATTENTION_BACKEND, model_id):
    """
    Run MatrixGame inference with generated action conditions and compare the
    output video to a reference video using SSIM.

    Args:
        prompt: Text prompt passed to the generator and recorded with the SSIM
            results. It is not used to name or look up video files here.
        ATTENTION_BACKEND: Value exported as FASTVIDEO_ATTENTION_BACKEND while
            the generator is alive.
        model_id: Key into MODEL_TO_PARAMS selecting the parameter set.

    Raises:
        FileNotFoundError: If the reference folder does not exist or holds no
            .mp4 file.
        AssertionError: If the generated video file is missing or the mean
            SSIM is below 0.98.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    base_output_dir = os.path.join(script_dir, "generated_videos", model_id)
    output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND)
    # The generated file is expected under this fixed name.
    output_video_name = "video.mp4"
    generated_video_path = os.path.join(output_dir, output_video_name)

    os.makedirs(output_dir, exist_ok=True)

    BASE_PARAMS = MODEL_TO_PARAMS[model_id]
    num_inference_steps = BASE_PARAMS["num_inference_steps"]

    # Create action conditions for MatrixGame
    actions = create_action_presets(
        BASE_PARAMS["num_frames"], keyboard_dim=BASE_PARAMS["keyboard_dim"],
        seed=BASE_PARAMS["seed"]
    )
    # NOTE(review): presumably 4x temporal compression and a fixed 44x80
    # latent grid; confirm against the model configuration.
    latent_frames = (BASE_PARAMS["num_frames"] - 1) // 4 + 1
    grid_sizes = torch.tensor([latent_frames, 44, 80])

    init_kwargs = {
        "num_gpus": BASE_PARAMS["num_gpus"],
        "use_fsdp_inference": True,
        "dit_layerwise_offload": False,
        "dit_cpu_offload": False,
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": True,
    }

    generation_kwargs = {
        "num_inference_steps": num_inference_steps,
        "output_path": output_dir,
        "image_path": TEST_IMAGE_PATHS[0],
        "height": BASE_PARAMS["height"],
        "width": BASE_PARAMS["width"],
        "num_frames": BASE_PARAMS["num_frames"],
        "seed": BASE_PARAMS["seed"],
        "mouse_cond": actions["mouse"].unsqueeze(0),
        "keyboard_cond": actions["keyboard"].unsqueeze(0),
        "grid_sizes": grid_sizes,
        "save_video": True,
    }

    # Scope the backend override to the generation step so it does not leak
    # into tests that run later in the same process.
    env_key = "FASTVIDEO_ATTENTION_BACKEND"
    previous_backend = os.environ.get(env_key)
    os.environ[env_key] = ATTENTION_BACKEND
    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=BASE_PARAMS["model_path"], **init_kwargs)
        generator.generate_video(prompt, **generation_kwargs)
    finally:
        try:
            # Shut the executor down even when generation raised.
            if generator is not None and isinstance(generator.executor,
                                                    MultiprocExecutor):
                generator.executor.shutdown()
        finally:
            if previous_backend is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = previous_backend

    # Check the video file itself: output_dir always exists because it was
    # created above, so asserting on the directory could never fail.
    assert os.path.exists(generated_video_path), (
        f"Output video was not generated at {generated_video_path}"
    )

    reference_folder = os.path.join(
        script_dir, device_reference_folder, model_id, ATTENTION_BACKEND
    )

    if not os.path.exists(reference_folder):
        logger.error("Reference folder missing")
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}"
        )

    # Take the first .mp4 listed in the reference folder; the prompt is not
    # part of the match.
    reference_video_name = None

    for filename in os.listdir(reference_folder):
        if filename.endswith(".mp4"):
            reference_video_name = filename
            break

    if not reference_video_name:
        logger.error(
            f"Reference video not found for model: {model_id} with backend: {ATTENTION_BACKEND}"
        )
        raise FileNotFoundError("Reference video missing")

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(
        f"Computing SSIM between {reference_video_path} and {generated_video_path}"
    )
    ssim_values = compute_video_ssim_torchvision(
        reference_video_path, generated_video_path, use_ms_ssim=True
    )

    # The first returned element is used as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")
    logger.info(f"Writing SSIM results to directory: {output_dir}")

    success = write_ssim_results(
        output_dir,
        ssim_values,
        reference_video_path,
        generated_video_path,
        num_inference_steps,
        prompt,
    )

    # Failing to persist the results is logged but does not fail the test.
    if not success:
        logger.error("Failed to write SSIM results to file")

    min_acceptable_ssim = 0.98
    assert mean_ssim >= min_acceptable_ssim, (
        f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} for {model_id} with backend {ATTENTION_BACKEND}"
    )

fastvideo.tests.ssim.test_turbodiffusion_similarity ¶

SSIM-based similarity test for TurboDiffusion inference.

TurboDiffusion uses the SLA (Sparse-Linear Attention) backend and RCM scheduler for 1-4 step video generation.

Classes¶

Functions¶

fastvideo.tests.ssim.test_turbodiffusion_similarity.test_turbodiffusion_i2v_inference_similarity ¶

test_turbodiffusion_i2v_inference_similarity(prompt, model_id)

Test that runs TurboDiffusion I2V inference with dual-model switching, then compares the output to reference videos using SSIM.

Source code in fastvideo/tests/ssim/test_turbodiffusion_similarity.py

@pytest.mark.parametrize("prompt", TURBODIFFUSION_I2V_TEST_PROMPTS)
@pytest.mark.parametrize("model_id", list(TURBODIFFUSION_I2V_MODEL_TO_PARAMS.keys()))
def test_turbodiffusion_i2v_inference_similarity(prompt, model_id):
    """
    Test that runs TurboDiffusion I2V inference with dual-model switching,
    then compares the output to reference videos using SSIM.

    The test sets ``FASTVIDEO_ATTENTION_BACKEND`` to ``SLA_ATTN`` for the
    current process (the variable is not restored afterwards), generates a
    video into ``generated_videos/<model_id>/SLA_ATTN`` next to this file,
    and compares it against the first ``.mp4`` in the reference folder whose
    name contains the first 100 characters (stripped) of the prompt.

    Args:
        prompt: Text prompt; its position in
            ``TURBODIFFUSION_I2V_TEST_PROMPTS`` selects the input image from
            ``TURBODIFFUSION_I2V_IMAGE_PATHS``.
        model_id: Key into ``TURBODIFFUSION_I2V_MODEL_TO_PARAMS``.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If the prompt/image lists differ in length, the
            generated video file is missing, or the first SSIM value is
            below 0.95.
    """
    # TurboDiffusion requires SLA attention backend
    ATTENTION_BACKEND = "SLA_ATTN"
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = ATTENTION_BACKEND

    assert len(TURBODIFFUSION_I2V_TEST_PROMPTS) == len(TURBODIFFUSION_I2V_IMAGE_PATHS), \
        "Expect number of prompts equal to number of images"

    script_dir = os.path.dirname(os.path.abspath(__file__))

    base_output_dir = os.path.join(script_dir, 'generated_videos', model_id)
    output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND)
    # The truncated prompt is used both as the generated file's stem and as
    # the substring that identifies the matching reference video.
    video_stem = prompt[:100].strip()
    output_video_name = f"{video_stem}.mp4"
    generated_video_path = os.path.join(output_dir, output_video_name)

    os.makedirs(output_dir, exist_ok=True)

    BASE_PARAMS = TURBODIFFUSION_I2V_MODEL_TO_PARAMS[model_id]
    num_inference_steps = BASE_PARAMS["num_inference_steps"]
    # Prompts and images are paired by position in their respective lists.
    image_path = TURBODIFFUSION_I2V_IMAGE_PATHS[TURBODIFFUSION_I2V_TEST_PROMPTS.index(prompt)]

    init_kwargs = {
        "num_gpus": BASE_PARAMS["num_gpus"],
        "sp_size": BASE_PARAMS["sp_size"],
        "tp_size": BASE_PARAMS["tp_size"],
        "override_pipeline_cls_name": "TurboDiffusionI2VPipeline",
        # Keep both transformers in VRAM - avoids CPU RAM bottleneck
        "dit_cpu_offload": False,
        "use_fsdp_inference": True,
        "dit_layerwise_offload": False,
    }

    generation_kwargs = {
        "num_inference_steps": num_inference_steps,
        "output_path": output_dir,
        "image_path": image_path,
        "height": BASE_PARAMS["height"],
        "width": BASE_PARAMS["width"],
        "num_frames": BASE_PARAMS["num_frames"],
        "guidance_scale": BASE_PARAMS["guidance_scale"],
        "seed": BASE_PARAMS["seed"],
        "fps": BASE_PARAMS["fps"],
    }

    generator = VideoGenerator.from_pretrained(
        model_path=BASE_PARAMS["model_path"],
        **init_kwargs
    )
    try:
        generator.generate_video(prompt, **generation_kwargs)
    finally:
        # Shut the executor down even when generation fails so a failing
        # test does not leave the multiprocess executor running.
        if isinstance(generator.executor, MultiprocExecutor):
            generator.executor.shutdown()

    # Check the video file itself: output_dir is created above by makedirs,
    # so testing the directory could never detect a missing video.
    assert os.path.exists(generated_video_path), \
        f"Output video was not generated at {generated_video_path}"

    reference_folder = os.path.join(
        script_dir, device_reference_folder, model_id, ATTENTION_BACKEND
    )

    if not os.path.exists(reference_folder):
        logger.error("Reference folder missing")
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}"
        )

    # Find the matching reference video based on the prompt
    reference_video_name = next(
        (
            filename
            for filename in os.listdir(reference_folder)
            if filename.endswith('.mp4') and video_stem in filename
        ),
        None,
    )

    if reference_video_name is None:
        logger.error(
            f"Reference video not found for prompt: {prompt} with backend: {ATTENTION_BACKEND}"
        )
        raise FileNotFoundError("Reference video missing")

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(
        f"Computing SSIM between {reference_video_path} and {generated_video_path}"
    )
    ssim_values = compute_video_ssim_torchvision(
        reference_video_path, generated_video_path, use_ms_ssim=True
    )

    # The first element of the returned values is treated as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")
    logger.info(f"Writing SSIM results to directory: {output_dir}")

    success = write_ssim_results(
        output_dir, ssim_values, reference_video_path,
        generated_video_path, num_inference_steps, prompt
    )

    # A failed results write is only logged; the SSIM threshold below
    # decides whether the test passes.
    if not success:
        logger.error("Failed to write SSIM results to file")

    # TurboDiffusion I2V uses fewer steps, may have slightly lower SSIM
    min_acceptable_ssim = 0.95
    assert mean_ssim >= min_acceptable_ssim, (
        f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} "
        f"for {model_id} with backend {ATTENTION_BACKEND}"
    )

fastvideo.tests.ssim.test_turbodiffusion_similarity.test_turbodiffusion_inference_similarity ¶

test_turbodiffusion_inference_similarity(prompt, model_id)

Test that runs TurboDiffusion inference with SLA attention and RCM scheduler, then compares the output to reference videos using SSIM.

Source code in fastvideo/tests/ssim/test_turbodiffusion_similarity.py

@pytest.mark.parametrize("prompt", TURBODIFFUSION_TEST_PROMPTS)
@pytest.mark.parametrize("model_id", list(TURBODIFFUSION_MODEL_TO_PARAMS.keys()))
def test_turbodiffusion_inference_similarity(prompt, model_id):
    """
    Test that runs TurboDiffusion inference with SLA attention and RCM scheduler,
    then compares the output to reference videos using SSIM.

    The test sets ``FASTVIDEO_ATTENTION_BACKEND`` to ``SLA_ATTN`` for the
    current process (the variable is not restored afterwards), generates a
    video into ``generated_videos/<model_id>/SLA_ATTN`` next to this file,
    and compares it against the first ``.mp4`` in the reference folder whose
    name contains the first 100 characters (stripped) of the prompt.

    Args:
        prompt: Text prompt taken from ``TURBODIFFUSION_TEST_PROMPTS``.
        model_id: Key into ``TURBODIFFUSION_MODEL_TO_PARAMS``.

    Raises:
        FileNotFoundError: If the reference folder or a matching reference
            video does not exist.
        AssertionError: If the generated video file is missing or the first
            SSIM value is below 0.95.
    """
    # TurboDiffusion requires SLA attention backend
    ATTENTION_BACKEND = "SLA_ATTN"
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = ATTENTION_BACKEND

    script_dir = os.path.dirname(os.path.abspath(__file__))

    base_output_dir = os.path.join(script_dir, 'generated_videos', model_id)
    output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND)
    # The truncated prompt is used both as the generated file's stem and as
    # the substring that identifies the matching reference video.
    video_stem = prompt[:100].strip()
    output_video_name = f"{video_stem}.mp4"
    generated_video_path = os.path.join(output_dir, output_video_name)

    os.makedirs(output_dir, exist_ok=True)

    BASE_PARAMS = TURBODIFFUSION_MODEL_TO_PARAMS[model_id]
    num_inference_steps = BASE_PARAMS["num_inference_steps"]

    init_kwargs = {
        "num_gpus": BASE_PARAMS["num_gpus"],
        "sp_size": BASE_PARAMS["sp_size"],
        "tp_size": BASE_PARAMS["tp_size"],
        "override_pipeline_cls_name": "TurboDiffusionPipeline",
    }

    generation_kwargs = {
        "num_inference_steps": num_inference_steps,
        "output_path": output_dir,
        "height": BASE_PARAMS["height"],
        "width": BASE_PARAMS["width"],
        "num_frames": BASE_PARAMS["num_frames"],
        "guidance_scale": BASE_PARAMS["guidance_scale"],
        "seed": BASE_PARAMS["seed"],
        "fps": BASE_PARAMS["fps"],
    }

    generator = VideoGenerator.from_pretrained(model_path=BASE_PARAMS["model_path"], **init_kwargs)
    try:
        generator.generate_video(prompt, **generation_kwargs)
    finally:
        # Shut the executor down even when generation fails so a failing
        # test does not leave the multiprocess executor running.
        if isinstance(generator.executor, MultiprocExecutor):
            generator.executor.shutdown()

    # Check the video file itself: output_dir is created above by makedirs,
    # so testing the directory could never detect a missing video.
    assert os.path.exists(generated_video_path), \
        f"Output video was not generated at {generated_video_path}"

    reference_folder = os.path.join(
        script_dir, device_reference_folder, model_id, ATTENTION_BACKEND
    )

    if not os.path.exists(reference_folder):
        logger.error("Reference folder missing")
        raise FileNotFoundError(
            f"Reference video folder does not exist: {reference_folder}"
        )

    # Find the matching reference video based on the prompt
    reference_video_name = next(
        (
            filename
            for filename in os.listdir(reference_folder)
            if filename.endswith('.mp4') and video_stem in filename
        ),
        None,
    )

    if reference_video_name is None:
        logger.error(
            f"Reference video not found for prompt: {prompt} with backend: {ATTENTION_BACKEND}"
        )
        raise FileNotFoundError("Reference video missing")

    reference_video_path = os.path.join(reference_folder, reference_video_name)

    logger.info(
        f"Computing SSIM between {reference_video_path} and {generated_video_path}"
    )
    ssim_values = compute_video_ssim_torchvision(
        reference_video_path, generated_video_path, use_ms_ssim=True
    )

    # The first element of the returned values is treated as the mean SSIM.
    mean_ssim = ssim_values[0]
    logger.info(f"SSIM mean value: {mean_ssim}")
    logger.info(f"Writing SSIM results to directory: {output_dir}")

    success = write_ssim_results(
        output_dir, ssim_values, reference_video_path,
        generated_video_path, num_inference_steps, prompt
    )

    # A failed results write is only logged; the SSIM threshold below
    # decides whether the test passes.
    if not success:
        logger.error("Failed to write SSIM results to file")

    # TurboDiffusion uses fewer steps, may have slightly lower SSIM
    min_acceptable_ssim = 0.95
    assert mean_ssim >= min_acceptable_ssim, (
        f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} "
        f"for {model_id} with backend {ATTENTION_BACKEND}"
    )