quantization ¶

Classes¶

Functions¶

fastvideo.layers.quantization.register_quantization_config ¶

register_quantization_config(quantization: str)

Register a customized vllm quantization config.

When a quantization method is not supported by vllm, you can register a customized quantization config to support it.

Parameters:

Name	Type	Description	Default
`quantization`	`str`	The quantization method name.	required

Examples:

>>> from fastvideo.layers.quantization import register_quantization_config
>>> from fastvideo.layers.quantization import get_quantization_config
>>> from fastvideo.layers.quantization.base_config import QuantizationConfig
>>>
>>> @register_quantization_config("my_quant")
... class MyQuantConfig(QuantizationConfig):
...     pass
>>>
>>> get_quantization_config("my_quant")
<class 'MyQuantConfig'>

Source code in fastvideo/layers/quantization/__init__.py

def register_quantization_config(quantization: str):
    """Register a customized quantization config class under a method name.

    When a quantization method is not supported out of the box, decorate a
    ``QuantizationConfig`` subclass with this function to make it available
    under ``quantization``. The class is stored in the customized-config
    registry and the name is appended to ``QUANTIZATION_METHODS``.

    Args:
        quantization (str): The quantization method name.

    Returns:
        A class decorator that registers the class it receives and returns
        that same class unchanged.

    Raises:
        ValueError: Raised when the returned decorator is applied, if
            ``quantization`` is already present in ``QUANTIZATION_METHODS``
            or if the decorated object is not a subclass of
            ``QuantizationConfig``.

    Examples:
        >>> from fastvideo.layers.quantization import register_quantization_config
        >>> from fastvideo.layers.quantization import get_quantization_config
        >>> from fastvideo.layers.quantization.base_config import QuantizationConfig
        >>>
        >>> @register_quantization_config("my_quant")
        ... class MyQuantConfig(QuantizationConfig):
        ...     pass
        >>>
        >>> get_quantization_config("my_quant")
        <class 'MyQuantConfig'>
    """  # noqa: E501

    def _wrapper(quant_config_cls):
        # Validate everything before touching the registries so a rejected
        # registration leaves no partial state behind.
        if quantization in QUANTIZATION_METHODS:
            raise ValueError(
                f"The quantization method `{quantization}` already exists.")
        # issubclass() raises TypeError when its first argument is not a
        # class (e.g. the decorator was put on a function or an instance);
        # check isinstance(..., type) first so callers get the documented
        # ValueError instead.
        if not (isinstance(quant_config_cls, type)
                and issubclass(quant_config_cls, QuantizationConfig)):
            raise ValueError("The quantization config must be a subclass of "
                             "`QuantizationConfig`.")
        _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls
        QUANTIZATION_METHODS.append(quantization)
        return quant_config_cls

    return _wrapper

Modules¶

fastvideo.layers.quantization.absmax_fp8 ¶

Classes¶

fastvideo.layers.quantization.absmax_fp8.AbsMaxFP8Config ¶

AbsMaxFP8Config()

Bases: QuantizationConfig

Config class for absmax float8_e4m3fn quantization. Currently only supports per-tensor quantization.

Source code in fastvideo/layers/quantization/base_config.py

def __init__(self):
    """Initialize the config with an empty packed-modules mapping."""
    super().__init__()
    # Starts out empty; models fill this mapping in as they initialize.
    self.packed_modules_mapping: dict[str, list[str]] = {}

fastvideo.layers.quantization.absmax_fp8.AbsMaxFP8LinearMethod ¶

Bases: LinearMethodBase

Linear method with AbsMax FP8 quantization.

Functions¶

fastvideo.layers.quantization.base_config ¶

Classes¶

fastvideo.layers.quantization.base_config.QuantizationConfig ¶

QuantizationConfig()

Bases: ABC

Base class for quantization configs.

Source code in fastvideo/layers/quantization/base_config.py

def __init__(self):
    """Initialize the config with an empty packed-modules mapping."""
    super().__init__()
    # Starts out empty; models fill this mapping in as they initialize.
    self.packed_modules_mapping: dict[str, list[str]] = {}

Functions¶

fastvideo.layers.quantization.base_config.QuantizationConfig.from_config abstractmethod classmethod ¶

from_config(config: dict[str, Any]) -> QuantizationConfig

Create a config class from the model's quantization config.

Source code in fastvideo/layers/quantization/base_config.py

@classmethod
@abstractmethod
def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig":
    """Create a config class from the model's quantization config.

    Abstract classmethod: this base version only raises, so concrete
    subclasses must override it.

    Args:
        config: The model's quantization config, as a dictionary.

    Returns:
        A ``QuantizationConfig`` (per the return annotation).

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizationConfig.get_config_filenames abstractmethod staticmethod ¶

get_config_filenames() -> list[str]

List of filenames to search for in the model directory.

Source code in fastvideo/layers/quantization/base_config.py

@staticmethod
@abstractmethod
def get_config_filenames() -> list[str]:
    """List of filenames to search for in the model directory.

    Abstract staticmethod: this base version only raises, so concrete
    subclasses must override it.

    Returns:
        A list of filename strings (per the return annotation).

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizationConfig.get_from_keys staticmethod ¶

get_from_keys(config: dict[str, Any], keys: list[str]) -> Any

Get a value from the model's quantization config.

Source code in fastvideo/layers/quantization/base_config.py

@staticmethod
def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
    """Get a value from the model's quantization config."""
    for key in keys:
        if key in config:
            return config[key]
    raise ValueError(f"Cannot find any of {keys} in the model's "
                     "quantization config.")

fastvideo.layers.quantization.base_config.QuantizationConfig.get_from_keys_or staticmethod ¶

get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any

Get an optional value from the model's quantization config.

Source code in fastvideo/layers/quantization/base_config.py

@staticmethod
def get_from_keys_or(config: dict[str, Any], keys: list[str],
                     default: Any) -> Any:
    """Get an optional value from the model's quantization config.

    Args:
        config: The model's quantization config.
        keys: Candidate key names, passed through to ``get_from_keys``.
        default: Value to return when ``get_from_keys`` raises
            ``ValueError``.

    Returns:
        The result of ``get_from_keys(config, keys)``, or ``default`` if
        that call raised ``ValueError``.
    """
    try:
        result = QuantizationConfig.get_from_keys(config, keys)
    except ValueError:
        # The strict lookup signals "no key found" with ValueError.
        result = default
    return result

fastvideo.layers.quantization.base_config.QuantizationConfig.get_min_capability abstractmethod classmethod ¶

get_min_capability() -> int

Minimum GPU capability to support the quantization method.

E.g., 70 for Volta, 75 for Turing, 80 for Ampere. This requirement is due to the custom CUDA kernels used by the quantization method.

Source code in fastvideo/layers/quantization/base_config.py

@classmethod
@abstractmethod
def get_min_capability(cls) -> int:
    """Minimum GPU capability to support the quantization method.

    E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
    This requirement is due to the custom CUDA kernels used by the
    quantization method.

    Abstract classmethod: this base version only raises, so concrete
    subclasses must override it.

    Returns:
        The minimum capability as an integer (per the return annotation).

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizationConfig.get_name abstractmethod ¶

get_name() -> QuantizationMethods

Name of the quantization method.

Source code in fastvideo/layers/quantization/base_config.py

@abstractmethod
def get_name(self) -> QuantizationMethods:
    """Name of the quantization method.

    Abstract method: this base version only raises, so concrete subclasses
    must override it.

    Returns:
        The method name, typed as ``QuantizationMethods``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizationConfig.get_quant_method abstractmethod ¶

get_quant_method(layer: Module, prefix: str) -> QuantizeMethodBase | None

Get the quantize method to use for the quantized layer.

Parameters:

Name	Type	Description	Default
`layer`	`Module`	The layer for the quant method.	required
`prefix`	`str`	The full name of the layer in the state dict	required

Returns: The quantize method. None if the given layer doesn't support quant method.

Source code in fastvideo/layers/quantization/base_config.py

@abstractmethod
def get_quant_method(self, layer: torch.nn.Module,
                     prefix: str) -> QuantizeMethodBase | None:
    """Get the quantize method to use for the quantized layer.

    Abstract method: this base version only raises, so concrete subclasses
    must override it.

    Args:
        layer: The layer for the quant method.
        prefix: The full name of the layer in the state dict.

    Returns:
        The quantize method. None if the given layer doesn't support quant
        method.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizationConfig.get_supported_act_dtypes abstractmethod ¶

get_supported_act_dtypes() -> list[dtype]

List of supported activation dtypes.

Source code in fastvideo/layers/quantization/base_config.py

@abstractmethod
def get_supported_act_dtypes(self) -> list[torch.dtype]:
    """List of supported activation dtypes.

    Abstract method: this base version only raises, so concrete subclasses
    must override it.

    Returns:
        A list of ``torch.dtype`` values (per the return annotation).

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizationConfig.override_quantization_method classmethod ¶

override_quantization_method(hf_quant_cfg, user_quant) -> QuantizationMethods | None

Detects if this quantization method can support a given checkpoint format by overriding the user-specified quantization method. This method should only be overridden by subclasses in exceptional circumstances.

Source code in fastvideo/layers/quantization/base_config.py

@classmethod
def override_quantization_method(cls, hf_quant_cfg,
                                 user_quant) -> QuantizationMethods | None:
    """Detect whether this method should override the user's choice.

    Detects if this quantization method can support a given checkpoint
    format by overriding the user-specified quantization method. This
    method should only be overridden by subclasses in exceptional
    circumstances.

    Args:
        hf_quant_cfg: Unused here. Presumably the checkpoint's quantization
            config; confirm against callers.
        user_quant: Unused here. Presumably the quantization method
            requested by the user; confirm against callers.

    Returns:
        Always ``None`` in this base implementation.
    """
    return None

fastvideo.layers.quantization.base_config.QuantizeMethodBase ¶

Bases: ABC

Base class for different quantized methods.

Functions¶

fastvideo.layers.quantization.base_config.QuantizeMethodBase.apply abstractmethod ¶

apply(layer: Module, *args, **kwargs) -> Tensor

Apply the weights in layer to the input tensor.

Expects create_weights to have been called before on the layer.

Source code in fastvideo/layers/quantization/base_config.py

@abstractmethod
def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
    """Apply the weights in layer to the input tensor.

    Expects create_weights to have been called before on the layer.

    Abstract method: this base version only raises, so concrete subclasses
    must override it.

    Args:
        layer: The layer whose weights are applied.
        *args: Extra positional arguments; their meaning is defined by the
            overriding subclass.
        **kwargs: Extra keyword arguments; their meaning is defined by the
            overriding subclass.

    Returns:
        A ``torch.Tensor`` (per the return annotation).

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizeMethodBase.create_weights abstractmethod ¶

create_weights(layer: Module, *weight_args, **extra_weight_attrs)

Create weights for a layer.

The weights will be set as attributes of the layer.

Source code in fastvideo/layers/quantization/base_config.py

@abstractmethod
def create_weights(self, layer: torch.nn.Module, *weight_args,
                   **extra_weight_attrs):
    """Create weights for a layer.

    The weights will be set as attributes of the layer.

    Abstract method: this base version only raises, so concrete subclasses
    must override it.

    Args:
        layer: The layer that receives the weights.
        *weight_args: Extra positional arguments; their meaning is defined
            by the overriding subclass.
        **extra_weight_attrs: Extra keyword arguments; their meaning is
            defined by the overriding subclass.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizeMethodBase.embedding ¶

embedding(layer: Module, *args, **kwargs) -> Tensor

Gather embeddings in the layer based on indices in the input tensor.

Expects create_weights to have been called before on the layer.

Source code in fastvideo/layers/quantization/base_config.py

def embedding(self, layer: torch.nn.Module, *args,
              **kwargs) -> torch.Tensor:
    """Gather embeddings in the layer based on indices in the input tensor.

    Expects create_weights to have been called before on the layer.

    Unlike ``apply`` and ``create_weights`` this method is not marked
    abstract, so subclasses may leave it out; the base version then raises
    when called.

    Args:
        layer: The layer holding the embedding weights.
        *args: Extra positional arguments; their meaning is defined by the
            overriding subclass.
        **kwargs: Extra keyword arguments; their meaning is defined by the
            overriding subclass.

    Returns:
        A ``torch.Tensor`` (per the return annotation).

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

fastvideo.layers.quantization.base_config.QuantizeMethodBase.process_weights_after_loading ¶

process_weights_after_loading(layer: Module) -> None

Process the weight after loading.

This can be used, for example, to transpose weights for computation.

Source code in fastvideo/layers/quantization/base_config.py

def process_weights_after_loading(self, layer: nn.Module) -> None:
    """Process the weight after loading.

    This can be used, for example, to transpose weights for computation.
    The base implementation is a no-op; subclasses override it when they
    need post-load processing.

    Args:
        layer: The layer whose weights were loaded.
    """
    return

Functions¶

fastvideo.layers.quantization.base_config.method_has_implemented_embedding ¶

method_has_implemented_embedding(method_class: type[QuantizeMethodBase]) -> bool

Not all quant methods have embedding implemented, so we need to check that it exists for our given method. We check this by making sure the function has been changed from the base implementation.

Source code in fastvideo/layers/quantization/base_config.py

def method_has_implemented_embedding(
        method_class: type[QuantizeMethodBase]) -> bool:
    """Report whether ``method_class`` supplies its own ``embedding``.

    Not every quant method implements embedding. The check compares the
    statically resolved ``embedding`` attribute of ``method_class`` with the
    one on ``QuantizeMethodBase``: a different object means the method has
    been changed from the base implementation.

    Args:
        method_class: The quantize-method class to inspect.

    Returns:
        True if ``method_class`` has an ``embedding`` attribute that is not
        the base class's; False otherwise.
    """
    # getattr_static looks the attribute up without triggering descriptors,
    # so the two results can be compared by identity.
    inherited = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
    candidate = inspect.getattr_static(method_class, "embedding", None)

    if candidate is None:
        return False
    return candidate is not inherited