linear

Classes

fastvideo.layers.linear.ColumnParallelLinear

ColumnParallelLinear(input_size: int, output_size: int, bias: bool = True, gather_output: bool = False, skip_bias_add: bool = False, params_dtype: dtype | None = None, quant_config: QuantizationConfig | None = None, output_sizes: list[int] | None = None, prefix: str = '')

Bases: LinearBase

Linear layer with column parallelism.

The linear layer is defined as Y = XA + b. A is parallelized along its second dimension as A = [A_1, ..., A_p].

Parameters:

input_size (int, required): first dimension of matrix A.
output_size (int, required): second dimension of matrix A.
bias (bool, default: True): If true, add bias.
gather_output (bool, default: False): If true, call all-gather on the output and make Y available to all GPUs; otherwise, each GPU only has its own output Y_i = XA_i.
skip_bias_add (bool, default: False): Enables performance optimizations where the bias can be fused with other element-wise operations: adding the bias is skipped and the bias is returned instead.
params_dtype (dtype | None, default: None): Data type for the parameters.
quant_config (QuantizationConfig | None, default: None): Quantization configuration.
output_sizes (list[int] | None, default: None): List of output sizes packed into one output; e.g., for QKV the list has length 3.
prefix (str, default: ''): The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj).
Source code in fastvideo/layers/linear.py
def __init__(self,
             input_size: int,
             output_size: int,
             bias: bool = True,
             gather_output: bool = False,
             skip_bias_add: bool = False,
             params_dtype: torch.dtype | None = None,
             quant_config: QuantizationConfig | None = None,
             output_sizes: list[int] | None = None,
             prefix: str = ""):
    # Divide the weight matrix along the last dimension.
    self.tp_size = get_tp_world_size()
    self.input_size_per_partition = input_size
    self.output_size_per_partition = divide(output_size, self.tp_size)
    self.output_partition_sizes = [self.output_size_per_partition]
    # If QKV or MergedColumn, use output size of each partition.
    if hasattr(self, "output_sizes"):
        self.output_partition_sizes = [
            divide(output_size, self.tp_size)
            for output_size in self.output_sizes
        ]

    super().__init__(input_size, output_size, skip_bias_add, params_dtype,
                     quant_config, prefix)

    self.gather_output = gather_output

    if output_sizes is None:
        output_sizes = [output_size]

    assert self.quant_method is not None
    self.quant_method.create_weights(
        layer=self,
        input_size_per_partition=self.input_size_per_partition,
        output_partition_sizes=self.output_partition_sizes,
        input_size=self.input_size,
        output_size=self.output_size,
        params_dtype=self.params_dtype,
        weight_loader=(
            self.weight_loader_v2 if self.quant_method.__class__.__name__
            in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
    if bias:
        self.bias = Parameter(
            torch.empty(
                self.output_size_per_partition,
                dtype=params_dtype,
            ))
        set_weight_attrs(self.bias, {
            "output_dim": 0,
            "weight_loader": self.weight_loader,
        })
    else:
        self.register_parameter("bias", None)

fastvideo.layers.linear.LinearBase

LinearBase(input_size: int, output_size: int, skip_bias_add: bool = False, params_dtype: dtype | None = None, quant_config: QuantizationConfig | None = None, prefix: str = '')

Bases: Module

Base linear layer.

Parameters:

input_size (int, required): input dimension of the linear layer.
output_size (int, required): output dimension of the linear layer.
skip_bias_add (bool, default: False): If true, skip adding bias and instead return it.
params_dtype (dtype | None, default: None): Data type for the parameters.
quant_config (QuantizationConfig | None, default: None): Quantization configuration.
Source code in fastvideo/layers/linear.py
def __init__(
    self,
    input_size: int,
    output_size: int,
    skip_bias_add: bool = False,
    params_dtype: torch.dtype | None = None,
    quant_config: QuantizationConfig | None = None,
    prefix: str = "",
):
    super().__init__()

    # Keep input parameters
    self.input_size = input_size
    self.output_size = output_size
    self.skip_bias_add = skip_bias_add
    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype
    self.quant_config = quant_config
    self.prefix = prefix
    if quant_config is None:
        self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod(
        )
    else:
        self.quant_method = quant_config.get_quant_method(self,
                                                          prefix=prefix)

fastvideo.layers.linear.LinearMethodBase

Bases: QuantizeMethodBase

Base class for different (maybe quantized) linear methods.

Functions

fastvideo.layers.linear.LinearMethodBase.apply abstractmethod
apply(layer: Module, x: Tensor, bias: Tensor | None = None) -> Tensor

Apply the weights in layer to the input tensor. Expects create_weights to have been called before on the layer.

Source code in fastvideo/layers/linear.py
@abstractmethod
def apply(self,
          layer: torch.nn.Module,
          x: torch.Tensor,
          bias: torch.Tensor | None = None) -> torch.Tensor:
    """Apply the weights in layer to the input tensor.
    Expects create_weights to have been called before on the layer."""
    raise NotImplementedError
fastvideo.layers.linear.LinearMethodBase.create_weights abstractmethod
create_weights(layer: Module, input_size_per_partition: int, output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: dtype, **extra_weight_attrs) -> None

Create weights for a linear layer. The weights will be set as attributes of the layer.

Parameters:

layer (Module, required): The layer that is using the LinearMethodBase factory.
input_size_per_partition (int, required): Size of the weight's input dim on rank X.
output_partition_sizes (list[int], required): Sizes of the output dim of each logical weight on rank X. E.g., output_partition_sizes for QKVLinear is a list containing the widths of Wq, Wk, and Wv on rank X.
input_size (int, required): Size of the input dim of the weight across all ranks.
output_size (int, required): Size of the output dim of the weight across all ranks.
params_dtype (dtype, required): Datatype of the parameters.
Source code in fastvideo/layers/linear.py
@abstractmethod
def create_weights(self, layer: torch.nn.Module,
                   input_size_per_partition: int,
                   output_partition_sizes: list[int], input_size: int,
                   output_size: int, params_dtype: torch.dtype,
                   **extra_weight_attrs) -> None:
    """Create weights for a linear layer. 
       The weights will be set as attributes of the layer.

    Args:
        layer: The layer that is using the LinearMethodBase factory.
        input_size_per_partition: Size of the weight input dim on rank X.
        output_partition_sizes: Sizes of the output dim of each logical 
            weight on rank X. E.g., output_partition_sizes for QKVLinear
            is a list contains the width of Wq, Wk, Wv on rank X.
        input_size: Size of the input dim of the weight across all ranks.
        output_size: Size of the output dim of the weight across all ranks.
        params_dtype: Datatype of the parameters.
    """
    raise NotImplementedError
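
As a rough illustration of this contract, below is a minimal sketch of a concrete linear method modeled on the unquantized case: create_weights materializes a single (output, input) weight covering all packed logical matrices on this rank, and apply runs the matmul. The weight layout and attribute handling of fastvideo's actual UnquantizedLinearMethod are not shown on this page, so treat these details as assumptions; the class name is hypothetical.

import torch
import torch.nn.functional as F
from torch.nn import Parameter

from fastvideo.layers.linear import LinearMethodBase


class SketchLinearMethod(LinearMethodBase):
    """Hypothetical, unquantized linear method used only for illustration."""

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: list[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs) -> None:
        # One weight of shape (sum of logical output sizes, input size) on this rank.
        weight = Parameter(torch.empty(sum(output_partition_sizes),
                                       input_size_per_partition,
                                       dtype=params_dtype),
                           requires_grad=False)
        layer.register_parameter("weight", weight)

    def apply(self, layer: torch.nn.Module, x: torch.Tensor,
              bias: torch.Tensor | None = None) -> torch.Tensor:
        return F.linear(x, layer.weight, bias)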

fastvideo.layers.linear.MergedColumnParallelLinear

MergedColumnParallelLinear(input_size: int, output_sizes: list[int], bias: bool = True, gather_output: bool = False, skip_bias_add: bool = False, params_dtype: dtype | None = None, quant_config: QuantizationConfig | None = None, prefix: str = '')

Bases: ColumnParallelLinear

Packed linear layers with column parallelism.

Similar to ColumnParallelLinear, but the weight matrix is concatenated along the output dimension. When the weight matrix is loaded, the different partitions are sharded separately.

Parameters:

input_size (int, required): input dimension of the linear layer.
output_sizes (list[int], required): list of output dimensions of the linear layer.
bias (bool, default: True): If true, add bias.
gather_output (bool, default: False): If true, call all-gather on the output and make it available to all GPUs; otherwise, each GPU only has its own output.
skip_bias_add (bool, default: False): Enables performance optimizations where the bias can be fused with other element-wise operations: adding the bias is skipped and the bias is returned instead.
params_dtype (dtype | None, default: None): Data type for the parameters.
quant_config (QuantizationConfig | None, default: None): Quantization configuration.
prefix (str, default: ''): The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj).
Source code in fastvideo/layers/linear.py
def __init__(self,
             input_size: int,
             output_sizes: list[int],
             bias: bool = True,
             gather_output: bool = False,
             skip_bias_add: bool = False,
             params_dtype: torch.dtype | None = None,
             quant_config: QuantizationConfig | None = None,
             prefix: str = ""):
    self.output_sizes = output_sizes
    tp_size = get_tp_world_size()
    assert all(output_size % tp_size == 0 for output_size in output_sizes)
    super().__init__(input_size=input_size,
                     output_size=sum(output_sizes),
                     bias=bias,
                     gather_output=gather_output,
                     skip_bias_add=skip_bias_add,
                     params_dtype=params_dtype,
                     quant_config=quant_config,
                     prefix=prefix)
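
The packing idea can be sanity-checked without any distributed machinery (illustration only): concatenating two weight matrices along the output dimension lets a single matmul produce both projections, which are then split back apart using the same output_sizes.

import torch

torch.manual_seed(0)
X = torch.randn(2, 16)
W_gate = torch.randn(16, 64)
W_up = torch.randn(16, 64)

# Merge the two projections along the output dimension (output_sizes = [64, 64]).
W_merged = torch.cat([W_gate, W_up], dim=1)

# One matmul, then split the result back into the two logical outputs.
gate, up = (X @ W_merged).split([64, 64], dim=1)
assert torch.allclose(gate, X @ W_gate, atol=1e-5)
assert torch.allclose(up, X @ W_up, atol=1e-5)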

fastvideo.layers.linear.QKVParallelLinear

QKVParallelLinear(hidden_size: int, head_size: int, total_num_heads: int, total_num_kv_heads: int | None = None, bias: bool = True, skip_bias_add: bool = False, params_dtype: dtype | None = None, quant_config: QuantizationConfig | None = None, prefix: str = '')

Bases: ColumnParallelLinear

Linear layers for the attention's QKV transformation.

Linear layers for the linear transformation of the query, key, and value vectors in the attention layer. The weight matrix is concatenated along the output dimension. The layer is parallelized along the head dimension. When the number of key/value heads is smaller than the number of query heads (e.g., multi-query or grouped-query attention), the key/value heads may be replicated while the query heads are partitioned.

Parameters:

hidden_size (int, required): input hidden state size of the transformer.
head_size (int, required): size of each attention head.
total_num_heads (int, required): total number of attention query heads.
total_num_kv_heads (int | None, default: None): total number of attention key/value heads. If None, assume total_num_kv_heads = total_num_heads.
bias (bool, default: True): If true, add bias.
skip_bias_add (bool, default: False): Enables performance optimizations where the bias can be fused with other element-wise operations: adding the bias is skipped and the bias is returned instead.
params_dtype (dtype | None, default: None): Data type for the parameters.
quant_config (QuantizationConfig | None, default: None): Quantization configuration.
prefix (str, default: ''): The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj).
Source code in fastvideo/layers/linear.py
def __init__(self,
             hidden_size: int,
             head_size: int,
             total_num_heads: int,
             total_num_kv_heads: int | None = None,
             bias: bool = True,
             skip_bias_add: bool = False,
             params_dtype: torch.dtype | None = None,
             quant_config: QuantizationConfig | None = None,
             prefix: str = ""):
    self.hidden_size = hidden_size
    self.head_size = head_size
    self.total_num_heads = total_num_heads
    if total_num_kv_heads is None:
        total_num_kv_heads = total_num_heads
    self.total_num_kv_heads = total_num_kv_heads
    # Divide the weight matrix along the last dimension.
    tp_size = get_tp_world_size()
    self.num_heads = divide(self.total_num_heads, tp_size)
    if tp_size >= self.total_num_kv_heads:
        self.num_kv_heads = 1
        self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads)
    else:
        self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
        self.num_kv_head_replicas = 1
    input_size = self.hidden_size
    output_size = (self.num_heads +
                   2 * self.num_kv_heads) * tp_size * self.head_size
    self.output_sizes = [
        self.num_heads * self.head_size * tp_size,  # q_proj
        self.num_kv_heads * self.head_size * tp_size,  # k_proj
        self.num_kv_heads * self.head_size * tp_size,  # v_proj 
    ]

    super().__init__(input_size=input_size,
                     output_size=output_size,
                     bias=bias,
                     gather_output=False,
                     skip_bias_add=skip_bias_add,
                     params_dtype=params_dtype,
                     quant_config=quant_config,
                     prefix=prefix)
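
The head-partitioning arithmetic in __init__ above can be summarized with a small standalone helper (hypothetical, mirroring the source): query heads are always divided across tensor-parallel ranks, while key/value heads are replicated once there are more ranks than KV heads.

def qkv_partition(total_num_heads: int, total_num_kv_heads: int,
                  tp_size: int) -> tuple[int, int, int]:
    # Query heads are always divided across ranks.
    num_heads = total_num_heads // tp_size
    if tp_size >= total_num_kv_heads:
        # More ranks than KV heads: each rank keeps 1 KV head, and the
        # KV heads are replicated across groups of ranks.
        num_kv_heads = 1
        num_kv_head_replicas = tp_size // total_num_kv_heads
    else:
        num_kv_heads = total_num_kv_heads // tp_size
        num_kv_head_replicas = 1
    return num_heads, num_kv_heads, num_kv_head_replicas

# 32 query heads, 8 KV heads (grouped-query attention):
print(qkv_partition(32, 8, tp_size=4))   # (8, 2, 1): 8 q heads, 2 kv heads per rank
print(qkv_partition(32, 8, tp_size=16))  # (2, 1, 2): each kv head replicated twice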

fastvideo.layers.linear.ReplicatedLinear

ReplicatedLinear(input_size: int, output_size: int, bias: bool = True, skip_bias_add: bool = False, params_dtype: dtype | None = None, quant_config: QuantizationConfig | None = None, prefix: str = '')

Bases: LinearBase

Replicated linear layer.

Parameters:

input_size (int, required): input dimension of the linear layer.
output_size (int, required): output dimension of the linear layer.
bias (bool, default: True): If true, add bias.
skip_bias_add (bool, default: False): If true, skip adding bias and instead return it.
params_dtype (dtype | None, default: None): Data type for the parameters.
quant_config (QuantizationConfig | None, default: None): Quantization configuration.
prefix (str, default: ''): The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj).
Source code in fastvideo/layers/linear.py
def __init__(self,
             input_size: int,
             output_size: int,
             bias: bool = True,
             skip_bias_add: bool = False,
             params_dtype: torch.dtype | None = None,
             quant_config: QuantizationConfig | None = None,
             prefix: str = ""):
    super().__init__(input_size,
                     output_size,
                     skip_bias_add,
                     params_dtype,
                     quant_config,
                     prefix=prefix)

    # All the linear layer supports quant method.
    assert self.quant_method is not None
    self.quant_method.create_weights(self,
                                     self.input_size, [self.output_size],
                                     self.input_size,
                                     self.output_size,
                                     self.params_dtype,
                                     weight_loader=self.weight_loader)

    if bias:
        self.bias = Parameter(
            torch.empty(
                self.output_size,
                dtype=self.params_dtype,
            ))
        set_weight_attrs(self.bias, {
            "output_dim": 0,
            "weight_loader": self.weight_loader,
        })
    else:
        self.register_parameter("bias", None)

fastvideo.layers.linear.RowParallelLinear

RowParallelLinear(input_size: int, output_size: int, bias: bool = True, input_is_parallel: bool = True, skip_bias_add: bool = False, params_dtype: dtype | None = None, reduce_results: bool = True, quant_config: QuantizationConfig | None = None, prefix: str = '')

Bases: LinearBase

Linear layer with row parallelism.

The linear layer is defined as Y = XA + b. A is parallelized along its first dimension (A is stacked row-wise as A_1, ..., A_p) and X along its second dimension as X = [X_1, ..., X_p], so each GPU holds one A_i and computes a partial product X_i A_i.

Parameters:

input_size (int, required): first dimension of matrix A.
output_size (int, required): second dimension of matrix A.
bias (bool, default: True): If true, add bias. Note that bias is not parallelized.
input_is_parallel (bool, default: True): If true, we assume that the input is already split across the GPUs and we do not split again.
skip_bias_add (bool, default: False): Enables performance optimizations where the bias can be fused with other element-wise operations: adding the bias is skipped and the bias is returned instead.
params_dtype (dtype | None, default: None): Data type for the parameters.
quant_config (QuantizationConfig | None, default: None): Quantization configuration.

Source code in fastvideo/layers/linear.py
def __init__(self,
             input_size: int,
             output_size: int,
             bias: bool = True,
             input_is_parallel: bool = True,
             skip_bias_add: bool = False,
             params_dtype: torch.dtype | None = None,
             reduce_results: bool = True,
             quant_config: QuantizationConfig | None = None,
             prefix: str = ""):
    # Divide the weight matrix along the first dimension.
    self.tp_rank = get_tp_rank()
    self.tp_size = get_tp_world_size()
    self.input_size_per_partition = divide(input_size, self.tp_size)
    self.output_size_per_partition = output_size
    self.output_partition_sizes = [output_size]

    super().__init__(input_size, output_size, skip_bias_add, params_dtype,
                     quant_config, prefix)

    self.input_is_parallel = input_is_parallel
    self.reduce_results = reduce_results

    assert self.quant_method is not None
    self.quant_method.create_weights(
        layer=self,
        input_size_per_partition=self.input_size_per_partition,
        output_partition_sizes=self.output_partition_sizes,
        input_size=self.input_size,
        output_size=self.output_size,
        params_dtype=self.params_dtype,
        weight_loader=(
            self.weight_loader_v2 if self.quant_method.__class__.__name__
            in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
    if not reduce_results and (bias and not skip_bias_add):
        raise ValueError("When not reduce the results, adding bias to the "
                         "results can lead to incorrect results")

    if bias:
        self.bias = Parameter(
            torch.empty(self.output_size, dtype=params_dtype))
        set_weight_attrs(self.bias, {
            "output_dim": 0,
            "weight_loader": self.weight_loader,
        })
    else:
        self.register_parameter("bias", None)

fastvideo.layers.linear.UnquantizedLinearMethod

Bases: LinearMethodBase

Linear method without quantization.

Functions

fastvideo.layers.linear.adjust_scalar_to_fused_array

adjust_scalar_to_fused_array(param: Tensor, loaded_weight: Tensor, shard_id: str | int) -> tuple[Tensor, Tensor]

For fused modules (QKV and MLP) we have an array of length N that holds 1 scale for each "logical" matrix. So the param is an array of length N. The loaded_weight corresponds to one of the shards on disk. Here, we slice the param based on the shard_id for loading.

Source code in fastvideo/layers/linear.py
def adjust_scalar_to_fused_array(
        param: torch.Tensor, loaded_weight: torch.Tensor,
        shard_id: str | int) -> tuple[torch.Tensor, torch.Tensor]:
    """For fused modules (QKV and MLP) we have an array of length
    N that holds 1 scale for each "logical" matrix. So the param
    is an array of length N. The loaded_weight corresponds to 
    one of the shards on disk. Here, we slice the param based on 
    the shard_id for loading.
    """
    qkv_idxs = {"q": 0, "k": 1, "v": 2}

    if isinstance(shard_id, str):
        shard_id = qkv_idxs[shard_id]
    elif not isinstance(shard_id, int):
        raise ValueError(f"Unknown Shard Id {shard_id}")

    # AutoFP8 scales do not have a shape
    # compressed-tensors scales do have a shape
    if len(loaded_weight.shape) != 0:
        assert loaded_weight.shape[0] == 1
        loaded_weight = loaded_weight[0]

    return param[shard_id], loaded_weight
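
As a usage sketch with hypothetical values: loading the key scale of a fused QKV module whose param holds one scale per logical matrix slices index 1 and copies the scalar into it.

import torch

from fastvideo.layers.linear import adjust_scalar_to_fused_array

# param holds one scale per logical matrix (q, k, v); values are made up.
param = torch.zeros(3)
loaded_weight = torch.tensor([0.25])  # k-scale as stored on disk, shape [1]

param_slice, value = adjust_scalar_to_fused_array(param, loaded_weight, "k")
param_slice.copy_(value)  # param is now tensor([0.0000, 0.2500, 0.0000])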