代码部分 : quantize_qwen3_coder_30b_a3b_instruct_gptq.py

import os

########## Environment setup ##########
# The CUDA-related variables are set before torch is imported so that they
# are already in place when the CUDA runtime is initialized.

# Expose only physical GPU 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Reduce fragmentation of GPU memory segments in the PyTorch allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Enumerate GPUs by PCI bus ID (physical device order).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from gptqmodel import GPTQModel, QuantizeConfig

# Calibration dataset path (Python portion of the public bigcode/the-stack
# code-generation dataset)
local_parquet_path = "./calibration_dataset/train-00000-of-00206.parquet"
# Path to the Qwen3-Coder-30B-A3B-Instruct model
model_name_or_path = "./models/Qwen3-Coder-30B-A3B-Instruct"
# Output directory for the quantized model
quantized_model_dir = "./models/Qwen3-Coder-30B-A3B-Instruct-GPTQ"

# Quantization config
# Adjusted with reference to the gptqmodel examples and documentation.
# Each argument sits on its own line: when they share a line with a trailing
# comment, everything after the first "#" is silently dropped.
quantize_config = QuantizeConfig(
    bits=4,                # quantize to 4-bit
    group_size=128,        # chosen by the author to mirror "head_dim": 128 in the model's config.json
    damp_percent=0.01,     # dampening
    desc_act=False,        # False is intended to improve speed and compatibility
    static_groups=False,   # do not use static groups
    sym=True,              # symmetric quantization
    true_sequential=True,  # true sequential quantization
    # gptqmodel may accept further parameters; see its documentation
)

# Memory map (enables CPU offload): tells transformers / accelerate how much
# GPU and CPU memory may be used.
max_memory = {
    # Integer keys are GPU indices as seen by THIS process. The script sets
    # CUDA_VISIBLE_DEVICES="1", which leaves exactly one visible GPU, and CUDA
    # renumbers visible devices from 0 -- so physical GPU 1 is index 0 here.
    0: "22GiB",      # GPU memory budget for the quantization run
    "cpu": "65GiB",  # CPU memory budget for the quantization run
}

# Calibration dataset settings
calibration_config = {
    "n_samples": 300,  # number of calibration samples
    "seq_len": 1024,   # maximum sequence length in tokens
    "seed": 42,        # shuffle seed
}

########## Load tokenizer ##########
print("1. Loading tokenizer...")
# trust_remote_code=True is usually needed for Qwen-family models.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
    trust_remote_code=True,
)
# When the tokenizer defines no pad token, reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

########## Load and prepare the calibration dataset ##########
print("2. Loading and preparing calibration dataset from local parquet file...")

# Samples with fewer tokens than this are dropped from the calibration set.
min_tokens = 32


########## tokenize function ##########
def tokenize_function(example):
    """Tokenize a single dataset row for use as a GPTQ calibration sample.

    Args:
        example: Mapping for one row; the code text is read from its
            "content" key.

    Returns:
        A dict {"input_ids": list[int], "attention_mask": list[int]}
        truncated to the module-level ``seq_len``, or ``None`` when the row
        has no usable text, tokenization fails, or the result is malformed
        or shorter than ``min_tokens`` tokens.
    """
    text = example.get("content", "")
    # Reject rows whose content is not a non-blank string.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        # return_tensors=None keeps the result as plain Python lists.
        encodings = tokenizer(
            text,
            truncation=True,      # cut off anything beyond max_length
            padding=False,        # no padding
            max_length=seq_len,   # maximum sequence length
            return_tensors=None,
        )
        input_ids = encodings["input_ids"]
        attention_mask = encodings["attention_mask"]
    except Exception as e:
        # Deliberately best-effort: one bad sample must not abort the whole
        # run, but report it instead of dropping it silently.
        print(f"Warning: Skipping sample due to tokenization error: {e}")
        return None

    # Both fields must be lists of equal length.
    if not (isinstance(input_ids, list) and isinstance(attention_mask, list)):
        return None
    if len(input_ids) != len(attention_mask) or len(input_ids) < min_tokens:
        return None

    # truncation=True already enforces the limit; the slice is a cheap guard.
    return {
        "input_ids": input_ids[:seq_len],
        "attention_mask": attention_mask[:seq_len],
    }


try:
    n_samples = calibration_config["n_samples"]
    seq_len = calibration_config["seq_len"]
    seed = calibration_config["seed"]

    print(f"   Loading dataset from {local_parquet_path}...")
    # Load the local parquet file.
    raw_datasets = load_dataset("parquet", data_files=local_parquet_path, split="train")
    print(f"   Total samples in file: {len(raw_datasets)}")

    # Shuffle deterministically and keep at most n_samples rows.
    print(f"   Shuffling and selecting {n_samples} samples...")
    raw_datasets = raw_datasets.shuffle(seed=seed).select(
        range(min(n_samples, len(raw_datasets)))
    )

    ########## tokenize dataset ##########
    # Tokenize in plain Python rather than through Dataset.map(): a
    # non-batched map() is a one-to-one row transform and cannot drop rows,
    # so the None that tokenize_function returns for rejected samples could
    # never act as a "skip this row" marker there.
    print("   Tokenizing dataset...")
    tokenized_datasets = [tokenize_function(example) for example in raw_datasets]

    ########## drop invalid samples ##########
    print("   Filtering tokenized dataset...")
    initial_count = len(tokenized_datasets)
    tokenized_datasets = [item for item in tokenized_datasets if item is not None]
    filtered_count = len(tokenized_datasets)
    print(f"   Samples after filtering: {filtered_count} (removed {initial_count - filtered_count})")

    ########## build the final calibration dataset ##########
    print("   Formatting final calibration dataset...")
    calibration_dataset = []
    for sample in tokenized_datasets:
        try:
            calibration_dataset.append(
                {
                    "input_ids": torch.tensor(sample["input_ids"], dtype=torch.long),
                    "attention_mask": torch.tensor(sample["attention_mask"], dtype=torch.long),
                }
            )
        except (TypeError, ValueError, RuntimeError) as e:
            # Still best-effort, but visible rather than silently ignored.
            print(f"Warning: Skipping sample that could not be converted to tensors: {e}")

    print(f"   Final calibration dataset prepared with {len(calibration_dataset)} samples.")
    if not calibration_dataset:
        raise ValueError("Final calibration dataset is empty!")
except Exception as e:
    print(f"Error during data loading / preparation: {e}")
    raise

########## Load model ##########
print("3. Loading model with memory mapping...")
try:
    # device_map="auto" together with max_memory is meant to let the loader
    # manage memory placement automatically.
    model = GPTQModel.from_pretrained(
        model_name_or_path,
        quantize_config=quantize_config,
        device_map="auto",           # automatic placement; intended to allow offloading to CPU RAM
        max_memory=max_memory,       # per-device memory budget
        torch_dtype=torch.bfloat16,  # load in bfloat16
        trust_remote_code=True,      # usually needed for Qwen-family models
        # Optional knobs left disabled by the author:
        #   low_cpu_mem_usage=True    -- try to lower peak CPU memory
        #   offload_folder="offload"  -- disk folder for offloading, if needed
    )
    print("   Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

########## Run quantization ##########
print("4. Starting quantization process...")
try:
    # Quantize the loaded model using the prepared calibration samples.
    model.quantize(calibration_dataset=calibration_dataset)
    print("   Quantization completed successfully.")
except Exception as e:
    print(f"Error during quantization: {e}")
    raise  # re-raise so the script stops here

########## Save model ##########
print("5. Saving quantized model...")
try:
    # Write the quantized weights and the tokenizer into the same directory.
    model.save_quantized(quantized_model_dir)
    tokenizer.save_pretrained(quantized_model_dir)
    print(f"   Quantized model saved to {quantized_model_dir}.")
except Exception as e:
    print(f"Error saving model: {e}")
    raise

print("All steps completed successfully!")

执行过程会报错：
INFO -----------------------------------------------------------------------------------------------------------------------------------------
INFO | gptq | 0 | mlp.experts.121.down_proj | 0.00020292 | 267 | 0.01000 | 0.206 | 16.782 | /48] 2.1%
INFO -----------------------------------------------------------------------------------------------------------------------------------------
INFO | gptq | 0 | mlp.experts.122.down_proj | 0.00045387 | 295 | 0.01000 | 0.203 | 16.782 | /48] 2.1%
INFO -----------------------------------------------------------------------------------------------------------------------------------------
INFO | gptq | 0 | mlp.experts.123.down_proj | 0.00005101 | 291 | 0.01000 | 0.208 | 16.782 | /48] 2.1%
INFO -----------------------------------------------------------------------------------------------------------------------------------------
INFO | gptq | 0 | mlp.experts.124.down_proj | 0.00336569 | 296 | 0.01000 | 0.269 | 16.782 | /48] 2.1%
INFO -----------------------------------------------------------------------------------------------------------------------------------------
INFO | gptq | 0 | mlp.experts.125.down_proj | 0.00214480 | 295 | 0.01000 | 0.203 | 16.782 | /48] 2.1%
INFO -----------------------------------------------------------------------------------------------------------------------------------------
INFO | gptq | 0 | mlp.experts.126.down_proj | 0.00106318 | 297 | 0.01000 | 0.205 | 16.782 | /48] 2.1%
INFO -----------------------------------------------------------------------------------------------------------------------------------------
INFO | gptq | 0 | mlp.experts.127.down_proj | 0.00021535 | 271 | 0.01000 | 0.207 | 16.782 | /48] 2.1%
INFO -----------------------------------------------------------------------------------------------------------------------------------------
Quantizing layer 1 of 47 [1 of 47] ██-----------------------------------------------------| 0:04:07 / 1:38:48 [2/48] 4.2%Error during quantization: The size of tensor a (32) must match the size of tensor b (128) at non-singleton dimension 3
Traceback (most recent call last):
File “~/Quantization/quantize_qwen3_coder_30b_a3b_instruct_gptq.py”, line 194, in <module>
model.quantize(calibration_dataset=calibration_dataset)

File “~/quantization/lib/python3.13/site-packages/gptqmodel/models/base.py”, line 450, in quantize
return module_looper.loop(
~~~~~~~~~~~~~~~~~~^
calibration_enable_gpu_cache=calibration_enable_gpu_cache,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
…<2 lines>…
backend=backend,
^^^^^^^^^^^^^^^^
)
^
File “~/quantization/lib/python3.13/site-packages/torch/utils/_contextlib.py”, line 120, in decorate_context
return func(*args, **kwargs)
File “~/quantization/lib/python3.13/site-packages/gptqmodel/looper/module_looper.py”, line 315, in loop
module(*layer_input) if is_lm_head_module else module(*layer_input,
~~~~~~^^^^^^^^^^^^^^
**additional_layer_inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File “~/quantization/lib/python3.13/site-packages/transformers/modeling_layers.py”, line 94, in __call__
return super().__call__(*args, **kwargs)
~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File “~/quantization/lib/python3.13/site-packages/torch/nn/modules/module.py”, line 1773, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File “~/quantization/lib/python3.13/site-packages/torch/nn/modules/module.py”, line 1784, in _call_impl
return forward_call(*args, **kwargs)
File “~/quantization/lib/python3.13/site-packages/transformers/models/qwen3_moe/modeling_qwen3_moe.py”, line 342, in forward
hidden_states, _ = self.self_attn(
~~~~~~~~~~~~~~^
hidden_states=hidden_states,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
…<5 lines>…
**kwargs,
^^^^^^^^^
)
^
File “~/quantization/lib/python3.13/site-packages/torch/nn/modules/module.py”, line 1773, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File “~/quantization/lib/python3.13/site-packages/torch/nn/modules/module.py”, line 1784, in _call_impl
return forward_call(*args, **kwargs)
File “~/quantization/lib/python3.13/site-packages/transformers/models/qwen3_moe/modeling_qwen3_moe.py”, line 167, in forward
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

File “~/quantization/lib/python3.13/site-packages/transformers/models/qwen3_moe/modeling_qwen3_moe.py”, line 78, in apply_rotary_pos_emb
q_embed = (q * cos) + (rotate_half(q) * sin)
^~~
RuntimeError: The size of tensor a (32) must match the size of tensor b (128) at non-singleton dimension 3

参考 https://github.com/ModelCloud/GPTQModel/issues/1665 解决错误
将 modeling_qwen3_moe.py 中 Qwen3MoeDecoderLayer 类的 forward 改写如图
在这里插入图片描述