能方便把你的测试代码共享一下吗
# --- chat-style inference benchmark ---
# Times 10 sequential calls to model.chat() with a fixed prompt.
# NOTE(review): assumes `model` and `tokenizer` are already loaded earlier
# in the file (e.g. a ChatGLM-style model) — confirm before running.
max_length = 2048
top_p = 0.01        # smaller -> more deterministic (ignored when do_sample=False)
temperature = 0.01  # smaller -> more deterministic (ignored when do_sample=False)
import time
t1 = time.time()
for _ in range(10):
    prompt = 'xxxx'  # renamed from `input` to avoid shadowing the builtin
    response, history = model.chat(tokenizer, prompt, history=[],
                                   max_length=max_length, top_p=top_p,
                                   temperature=temperature, do_sample=False)
    print(prompt, end=' ')
    print(response)  # fixed: original called undefined `gprint`
print('耗时:', time.time() - t1)
# --- generate-style inference benchmark ---
# Times 10 sequential calls to model.generate() with a fixed prompt,
# decoding and printing each output.
# NOTE(review): assumes `model` (on CUDA) and `tokenizer` are already
# loaded earlier in the file — confirm before running.
max_length = 2048
top_p = 0.01        # smaller -> more deterministic (ignored when do_sample=False)
temperature = 0.01  # smaller -> more deterministic (ignored when do_sample=False)
# Hoisted out of the loop: these kwargs are loop-invariant, and now reuse
# the variables above instead of repeating the hard-coded 0.01 literals.
gen_kwargs = {"max_length": max_length, "num_beams": 1, "do_sample": False,
              "top_p": top_p, "temperature": temperature,
              "logits_processor": None, "use_cache": True}
t1 = time.time()
for _ in range(10):
    prompt = 'xxx '  # renamed from `input` to avoid shadowing the builtin
    inputs = tokenizer(
        prompt,
        max_length=max_length,
        return_tensors="pt"
    )
    inputs = inputs.to('cuda')
    outputs = model.generate(**inputs, **gen_kwargs)
    print(tokenizer.decode(outputs[0]))
print('耗时:', time.time() - t1)