
Merge branch 'ModelTC:main' into main
gushiqiao authored Oct 17, 2024
2 parents bc80937 + 63f3365 commit 7e94a7c
Showing 24 changed files with 120 additions and 46 deletions.
@@ -0,0 +1,49 @@
base:
    seed: &seed 42
model:
    type: model_type
    path: model path
    tokenizer_mode: slow
    torch_dtype: auto
calib:
    name: custom
    download: False
    load_from_txt: True
    path: ./inputs.txt
    n_samples: 128
    bs: -1
    seq_len: 512
    preproc: original_txt
    padding: True
    seed: *seed
eval:
    eval_pos: [pretrain, transformed, fake_quant]
    name: wikitext2
    download: False
    path: eval data path
    seq_len: 2048
    # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
    # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
    bs: 1
    inference_per_block: False
    # Check token consistency between the original and fake-quantized model outputs.
    eval_token_consist: True
quant:
    method: Awq
    weight:
        bit: 4
        symmetric: False
        granularity: per_group
        group_size: 128
    special:
        trans: True
        # The options for "trans_version" include "v1" and "v2".
        # But their results don't differ significantly.
        trans_version: v2
        weight_clip: False
        # For 2-bit quantization, setting "clip_sym: False" will yield better results.
        clip_sym: True
save:
    save_trans: False
    save_fake: False
    save_path: /path/to/save/
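In this new config, calib.load_from_txt with preproc: original_txt points calibration at a plain-text file (./inputs.txt), and quant.special carries the AWQ transform options. A minimal sketch of loading such a file and reading a few fields with PyYAML; the file name awq_w4_custom_calib.yml is only a placeholder:

import yaml

# Load the config above (saved under a placeholder name) and inspect a few fields.
with open('awq_w4_custom_calib.yml') as f:
    cfg = yaml.safe_load(f)

print(cfg['quant']['method'])           # Awq
print(cfg['quant']['weight']['bit'])    # 4
print(cfg['calib']['preproc'])          # original_txt
print(cfg['calib']['padding'])          # True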
20 changes: 16 additions & 4 deletions llmc/__main__.py
@@ -61,23 +61,35 @@ def main(config):
            logger.info(f'{ppl_eval.dataset} ppl : {ppl}')
    if not config.get('calib', False):
        blockwise_opt = ALGO_REGISTRY[config.quant.method](
-           model, quant_config=config.quant, input=None, config=config
+           model,
+           quant_config=config.quant,
+           input=None,
+           padding_mask=None,
+           config=config
        )
        blockwise_opt.run_block_loop()
    else:
        dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib, model.processor)
-       calib_data = dataset.get_calib_dataset()
+       calib_data, padding_mask = dataset.get_calib_dataset()
        model.collect_first_block_input(calib_data, config.calib.type)
        del calib_data
        gc.collect()
        torch.cuda.empty_cache()
        if not config.get('sparse', False):
            blockwise_opt = ALGO_REGISTRY[config.quant.method](
-               model, config.quant, model.get_first_block_input(), config
+               model,
+               config.quant,
+               model.get_first_block_input(),
+               padding_mask,
+               config
            )
        else:
            blockwise_opt = ALGO_REGISTRY[config.sparse.method](
-               model, config.sparse, model.get_first_block_input(), config
+               model,
+               config.sparse,
+               model.get_first_block_input(),
+               padding_mask,
+               config
            )
        blockwise_opt.run_block_loop()
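The padding_mask threaded through these calls is, per the base_dataset.py change below, a list with one 0/1 attention-mask tensor per calibration batch (or None on the data-free path). A minimal illustration of the expected shape, using made-up values for two batches padded to length 8:

import torch

# Hypothetical example: one [batch_size, seq_len] mask per calibration batch;
# 1 marks real tokens, 0 marks positions added by padding.
padding_mask = [
    torch.tensor([[1, 1, 1, 1, 1, 0, 0, 0],
                  [1, 1, 1, 1, 1, 1, 1, 1]]),
    torch.tensor([[1, 1, 1, 0, 0, 0, 0, 0],
                  [1, 1, 1, 1, 1, 1, 0, 0]]),
]

# Downstream algorithms broadcast a batch's mask over the hidden dimension, e.g.:
mask_0 = padding_mask[0].unsqueeze(dim=-1)   # [batch_size, seq_len, 1]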
3 changes: 2 additions & 1 deletion llmc/compression/blockwise_optimization.py
@@ -5,12 +5,13 @@


class BlockwiseOpt(metaclass=ABCMeta):
-    def __init__(self, model, quant_config, input, config):
+    def __init__(self, model, quant_config, input, padding_mask, config):
        self.model = model
        self.blocks = model.get_blocks()
        self.quant_config = quant_config
        self.sparsity_config = quant_config
        self.input = input
+       self.padding_mask = padding_mask
        self.data_free = False if self.input else True
        self.config = config
        self.block_idx = None
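Every algorithm class in the files below repeats the same mechanical change: accept padding_mask and forward it to this constructor. A self-contained sketch of the pattern, using illustrative stand-in class names rather than llmc's real classes:

from abc import ABCMeta


# Stand-ins showing how padding_mask is threaded through and stored for later use.
class BlockwiseOptSketch(metaclass=ABCMeta):
    def __init__(self, model, quant_config, input, padding_mask, config):
        self.model = model
        self.quant_config = quant_config
        self.input = input
        self.padding_mask = padding_mask   # list of per-batch attention masks, or None
        self.config = config


class AlgoSketch(BlockwiseOptSketch):
    def __init__(self, model, quant_config, input, padding_mask, config):
        super().__init__(model, quant_config, input, padding_mask, config)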
8 changes: 6 additions & 2 deletions llmc/compression/quantization/awq.py
@@ -17,8 +17,8 @@

@ALGO_REGISTRY
class Awq(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        special_config = self.quant_config.get('special', {})
        self.trans = special_config.get('trans', True)
        self.trans_version = special_config.get('trans_version', 'v2')
@@ -131,6 +131,10 @@ def search_scale_subset(self, layers_dict, input, inspect_module, subset_kwargs)
                if isinstance(out, tuple):
                    out = out[0]

+               if self.padding_mask:
+                   org_out = org_out * self.padding_mask[i].unsqueeze(dim=-1).to(org_out.device)  # noqa
+                   out = out * self.padding_mask[i].unsqueeze(dim=-1).to(out.device)
+
                loss = (org_out - out).float().pow(2).mean().item()
                loss_mean += x.shape[0] * 1.0 / self.n_samples * loss
                scales_mean += x.shape[0] * 1.0 / self.n_samples * scales
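The added masking zeroes padded token positions in both the original and the fake-quantized block outputs, so padding does not contribute to AWQ's scale-search loss. A self-contained sketch of the same masking idea on made-up tensors (shapes and values are hypothetical, not llmc's actual data):

import torch

# Hypothetical shapes: [batch, seq_len, hidden] outputs, [batch, seq_len] 0/1 mask.
org_out = torch.randn(2, 8, 16)       # block output with original weights
out = torch.randn(2, 8, 16)           # block output with fake-quantized weights
mask = torch.tensor([[1] * 5 + [0] * 3,
                     [1] * 8])        # 0 marks padded positions

# Broadcast the mask over the hidden dimension and zero the padded positions,
# so they contribute nothing to the squared error.
org_out = org_out * mask.unsqueeze(dim=-1)
out = out * mask.unsqueeze(dim=-1)

loss = (org_out - out).float().pow(2).mean().item()
print(loss)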
4 changes: 2 additions & 2 deletions llmc/compression/quantization/base_blockwise_quantization.py
@@ -23,8 +23,8 @@


class BaseBlockwiseQuantization(BlockwiseOpt):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.set_quant_config()

    def w_qdq(self, module, wquantizer):
4 changes: 2 additions & 2 deletions llmc/compression/quantization/dgq.py
@@ -13,8 +13,8 @@

@ALGO_REGISTRY
class DGQ(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.model_dtype = next(self.model.model.parameters()).dtype

    def w_qdq(self, module, wquantizer):
4 changes: 2 additions & 2 deletions llmc/compression/quantization/gptq.py
@@ -17,8 +17,8 @@

@ALGO_REGISTRY
class GPTQ(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.dev = torch.device('cuda')
        self.model_dtype = next(self.model.model.parameters()).dtype
        self.add_quant_config()
4 changes: 2 additions & 2 deletions llmc/compression/quantization/hqq.py
@@ -11,8 +11,8 @@

@ALGO_REGISTRY
class HQQ(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.add_quant_config()

    @torch.no_grad()
4 changes: 2 additions & 2 deletions llmc/compression/quantization/llmint8.py
@@ -9,8 +9,8 @@

@ALGO_REGISTRY
class LlmInt8(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.add_quant_config()

    @torch.no_grad()
4 changes: 2 additions & 2 deletions llmc/compression/quantization/ntweak.py
@@ -19,8 +19,8 @@

@ALGO_REGISTRY
class NormTweaking(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.add_quant_config()

        model_type = self.config['model']['type']
4 changes: 2 additions & 2 deletions llmc/compression/quantization/omniq.py
@@ -24,8 +24,8 @@

@ALGO_REGISTRY
class OmniQuant(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.add_quant_config()

        model_type = self.config['model']['type']
4 changes: 2 additions & 2 deletions llmc/compression/quantization/osplus.py
@@ -17,9 +17,9 @@

@ALGO_REGISTRY
class OsPlus(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
+    def __init__(self, model, quant_config, input, padding_mask, config):
        torch.set_grad_enabled(False)
-        super().__init__(model, quant_config, input, config)
+        super().__init__(model, quant_config, input, padding_mask, config)

        special_config = self.quant_config.get('special', {})
        self.weight_clip = special_config.get('weight_clip', False)
4 changes: 2 additions & 2 deletions llmc/compression/quantization/quarot.py
@@ -16,8 +16,8 @@

@ALGO_REGISTRY
class Quarot(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.dev = torch.device('cuda')
        self.add_quant_config()
        self.preprocess()
4 changes: 2 additions & 2 deletions llmc/compression/quantization/quik.py
@@ -12,8 +12,8 @@

@ALGO_REGISTRY
class QUIK(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        self.add_quant_config()

    def add_quant_config(self):
4 changes: 2 additions & 2 deletions llmc/compression/quantization/rtn.py
@@ -8,8 +8,8 @@

@ALGO_REGISTRY
class RTN(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        if quant_config.get('act', False) and quant_config['act'].get('static', False):
            logger.info('Activation quant is static. Calibration is required.')
            self.act_static = True
4 changes: 2 additions & 2 deletions llmc/compression/quantization/smoothquant.py
@@ -11,8 +11,8 @@

@ALGO_REGISTRY
class SmoothQuant(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        special_config = self.quant_config.get('special', {})
        self.alpha = special_config.get('alpha', 0.5)
4 changes: 2 additions & 2 deletions llmc/compression/quantization/spqr.py
@@ -18,8 +18,8 @@

@ALGO_REGISTRY
class SpQR(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, config):
-        super().__init__(model, quant_config, input, config)
+    def __init__(self, model, quant_config, input, padding_mask, config):
+        super().__init__(model, quant_config, input, padding_mask, config)
        assert (
            self.wquantizer.granularity == 'per_group'
        ), 'SpQR only supports per_group quantization'
@@ -12,8 +12,8 @@


class BaseBlockwiseSparsification(BlockwiseOpt):
-    def __init__(self, model, sparsity_config, input, config):
-        super().__init__(model, sparsity_config, input, config)
+    def __init__(self, model, sparsity_config, input, padding_mask, config):
+        super().__init__(model, sparsity_config, input, padding_mask, config)
        self.set_sparsity_config()

    def block_init(self, block):
4 changes: 2 additions & 2 deletions llmc/compression/sparsification/magnitude.py
@@ -8,8 +8,8 @@

@ALGO_REGISTRY
class Magnitude(BaseBlockwiseSparsification):
-    def __init__(self, model, sparsity_config, input, config):
-        super().__init__(model, sparsity_config, input, config)
+    def __init__(self, model, sparsity_config, input, padding_mask, config):
+        super().__init__(model, sparsity_config, input, padding_mask, config)

    @torch.no_grad()
    def subset_transform(
4 changes: 2 additions & 2 deletions llmc/compression/sparsification/shortgpt.py
@@ -17,8 +17,8 @@

@ALGO_REGISTRY
class ShortGPT(BaseBlockwiseSparsification):
-    def __init__(self, model, sparsity_config, input, config):
-        super().__init__(model, sparsity_config, input, config)
+    def __init__(self, model, sparsity_config, input, padding_mask, config):
+        super().__init__(model, sparsity_config, input, padding_mask, config)

    def block_opt(self, block):
        block = block.cuda()
4 changes: 2 additions & 2 deletions llmc/compression/sparsification/wanda.py
@@ -9,8 +9,8 @@

@ALGO_REGISTRY
class Wanda(BaseBlockwiseSparsification):
-    def __init__(self, model, sparsity_config, input, config):
-        super().__init__(model, sparsity_config, input, config)
+    def __init__(self, model, sparsity_config, input, padding_mask, config):
+        super().__init__(model, sparsity_config, input, padding_mask, config)

    @torch.no_grad()
    def get_row_scale(self, layer, act):
6 changes: 5 additions & 1 deletion llmc/data/dataset/base_dataset.py
@@ -292,7 +292,11 @@ def get_calib_dataset(self):
        elif self.calib_dataset_type == 'img_txt':
            calib_samples = self.img_txt_group_samples_wo_mask(samples)
        logger.info(f'len(calib_samples) : {len(calib_samples)}')
-       return calib_samples
+       if self.padding:
+           padding_mask = [calib_sample['attention_mask'] for calib_sample in calib_samples]  # noqa
+       else:
+           padding_mask = None
+       return calib_samples, padding_mask

    def general_preproc(self, calib_dataset, tokenizer, n_samples, seq_len):
        dataset = calib_dataset.shuffle(seed=self.seed)
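When padding is enabled, each calibration batch is a dict produced by the tokenizer's padding call, so its attention_mask doubles as the padding mask returned here. A self-contained sketch of that idea with a Hugging Face tokenizer; the model name and texts are placeholders:

from transformers import AutoTokenizer

# Placeholder model and texts; any tokenizer with a pad token behaves the same way.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

texts = ['a short calibration sample', 'a somewhat longer calibration sample for padding']
batch = tokenizer(texts, padding=True, return_tensors='pt')

calib_samples = [batch]                                    # one padded batch
padding_mask = [sample['attention_mask'] for sample in calib_samples]
print(padding_mask[0].shape)                               # [2, max_seq_len_in_batch]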
10 changes: 7 additions & 3 deletions llmc/data/dataset/specified_preproc.py
@@ -96,7 +96,7 @@ def pileval_omni(calib_dataset, tokenizer, n_samples, seq_len):
        j = i + seq_len
        inp = trainenc.input_ids[:, i:j]
        samples.append(inp)
-   return samples, None
+   return samples


@PREPROC_REGISTRY
@@ -187,10 +187,14 @@ def random_truncate_txt(calib_dataset, tokenizer, n_samples, seq_len):


@PREPROC_REGISTRY
-def original_txt(calib_dataset, tokenizer, n_samples, seq_len):
+def original_txt(calib_dataset, tokenizer, n_samples, seq_len=None):
    random.shuffle(calib_dataset)
    n_samples = min(n_samples, len(calib_dataset))
    samples = []
    for i in range(n_samples):
        trainenc = tokenizer(calib_dataset[i], return_tensors='pt')
-       samples.append(trainenc.input_ids)
+       inp = trainenc.input_ids
+       if seq_len and len(inp) > seq_len:
+           inp = inp[:seq_len]
+       samples.append(inp)
    return samples
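For context, a self-contained sketch of the original_txt idea: tokenize raw lines one sample at a time and optionally cap each sample's length. Note that input_ids from a Hugging Face tokenizer is a [1, n_tokens] tensor, so the cap in this sketch is applied along the token dimension; the tokenizer name, texts, and the 16-token cap are made up:

import random
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')   # placeholder tokenizer
calib_texts = ['first raw calibration line', 'second raw calibration line']
seq_len = 16                                         # made-up cap

random.shuffle(calib_texts)
samples = []
for text in calib_texts:
    inp = tokenizer(text, return_tensors='pt').input_ids   # shape [1, n_tokens]
    if seq_len and inp.shape[1] > seq_len:
        inp = inp[:, :seq_len]
    samples.append(inp)
print([s.shape for s in samples])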
2 changes: 1 addition & 1 deletion tools/quant_analysis.py
@@ -390,7 +390,7 @@ def fake_quant_weight_dynamic(self, module, args={}):
torch.cuda.empty_cache()

blockwise_opt = ALGO_REGISTRY[config.quant.method](
-   t_model, config.quant, t_model.get_first_block_input(), config
+   t_model, config.quant, t_model.get_first_block_input(), None, config
)
blockwise_opt.run_block_loop()
t_model = blockwise_opt.model
