【小白教程】ARC a750本地运行glm-4-9b,运行效果
书接上回:https://blog.51cto.com/u_17003782/11944492
建议先看看上回的内容
这次下载了glm-4-9b在a750上运行试试
一、先下载模型
打开ModelScope网站:https://www.modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m/files 。ModelScope相当于国内的Hugging Face。
全部下载
下载后放到这里:D:\AI\openvino-LLM\glm-4-9b-chat-1m,也可以根据自己的情况更改路径。
二、参考上回,用命令行运行
略
三、用vscode运行
命令行运行感觉不太顺手,所以改了下原本的chat.py和convert.py,改成通过vscode运行。先看效果,代码放在最后。
四、运行效果
1、int4量化
int4-CPU GIF动图
int4-GPU GIF动图
GPU整体速度比CPU快不少,int4量化的优化效果很明显。不过win10任务管理器的性能页显示GPU占用并不高,而a750显卡驱动显示GPU活动在80%左右,看起来是系统对显卡活动的识别不太准。
2、int8量化
int8-CPU GIF动图
int8-GPU GIF动图
可以看到int8量化用GPU比CPU还要慢,尴尬了……CPU和GPU的活动都没有占满,但是速度就是上不去。
3、fp16
fp16-CPU GIF动图
CPU还能跑
fp16-GPU GIF动图
我去做了个饭,回来还没打开。。
感觉GPU上只是对int4量化进行了优化,其他精度还不如在CPU上跑。
4、ollama
顺便也用ollama测试了一下。
看起来比int4-GPU要慢,但是CPU活动没有占满。
五、代码(先是convert.py,后是chat.py)
from transformers import AutoTokenizer, AutoConfig
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino import OVModelForCausalLM
import os
from pathlib import Path
if __name__ == '__main__':
    # Export a local model directory to OpenVINO IR through optimum-intel,
    # optionally compressing the weights, then save the tokenizer into the
    # same output directory.

    ###### Parameters ######
    # Raw strings: '\A', '\o' and '\g' are not valid escape sequences, so the
    # non-raw form only works by accident and triggers a SyntaxWarning on
    # Python 3.12+. The resulting path values are unchanged.
    model_id = r'D:\AI\openvino-LLM\glm-4-9b-chat-1m'  # source model directory
    precision = 'int4'  # weight precision: "fp16", "int8" or "int4"
    output = Path(r'D:\AI\openvino-LLM\glm-4-9b-chat-1m-ov')  # export directory
    ###### Parameters ######

    # Create the export directory (and any missing parents); do nothing if it
    # already exists.
    output.mkdir(parents=True, exist_ok=True)

    # Keyword arguments shared by every export branch below.
    model_kwargs = {
        "trust_remote_code": True,
        "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
    }

    # Settings forwarded to OVWeightQuantizationConfig; only the int4 branch
    # uses them.
    compression_configs = {
        "sym": False,
        "group_size": 128,
        "ratio": 0.8,
    }

    print("====Exporting IR=====")
    if precision == "int4":
        ov_model = OVModelForCausalLM.from_pretrained(
            model_id,
            export=True,
            compile=False,
            quantization_config=OVWeightQuantizationConfig(
                bits=4, **compression_configs),
            **model_kwargs,
        )
    elif precision == "int8":
        ov_model = OVModelForCausalLM.from_pretrained(
            model_id,
            export=True,
            compile=False,
            load_in_8bit=True,
            **model_kwargs,
        )
    else:
        # Any other value (e.g. "fp16") exports with 8-bit loading disabled.
        ov_model = OVModelForCausalLM.from_pretrained(
            model_id,
            export=True,
            compile=False,
            load_in_8bit=False,
            **model_kwargs,
        )
    ov_model.save_pretrained(output)

    print("====Exporting tokenizer=====")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, trust_remote_code=True)
    tokenizer.save_pretrained(output)
from typing import List, Tuple
from threading import Thread
import torch
from optimum.intel.openvino import OVModelForCausalLM
from transformers import (AutoTokenizer, AutoConfig,
TextIteratorStreamer, StoppingCriteriaList, StoppingCriteria)
# Characters escaped on lines that sit inside a fenced code block. Each maps
# to a backslash-escaped or HTML-entity form; a mapping of a character to
# itself (e.g. "<" -> "<") would escape nothing. None of the replacement
# strings contains another key, so one translate() pass is equivalent to
# applying the replacements one after another.
_CODE_LINE_ESCAPES = str.maketrans({
    "`": "\\`",
    "<": "&lt;",
    ">": "&gt;",
    " ": "&nbsp;",
    "*": "&ast;",
    "_": "&lowbar;",
    "-": "&#45;",
    ".": "&#46;",
    "!": "&#33;",
    "(": "&#40;",
    ")": "&#41;",
    "$": "&#36;",
})


def parse_text(text):
    """Convert chat text with ``` fences into a single-line HTML fragment.

    Empty lines are dropped. A line containing ``` toggles the fence state:
    an opening fence becomes ``<pre><code class="language-X">`` where X is
    the text after the last backtick, and a closing fence becomes
    ``<br></code></pre>``. Every other line except the very first one is
    prefixed with ``<br>``; lines inside an open fence additionally have
    markup-sensitive characters escaped.

    Args:
        text: Raw text, lines separated by ``"\\n"``.

    Returns:
        The converted lines concatenated without separators.
    """
    lines = [line for line in text.split("\n") if line != ""]
    fence_count = 0  # number of ``` lines seen so far; odd means inside a fence
    for i, line in enumerate(lines):
        if "```" in line:
            fence_count += 1
            if fence_count % 2 == 1:
                language = line.split("`")[-1]
                lines[i] = f'<pre><code class="language-{language}">'
            else:
                lines[i] = "<br></code></pre>"
        elif i > 0:
            # The first line is deliberately left untouched (no <br>, no
            # escaping), matching the original control flow.
            if fence_count % 2 == 1:
                line = line.translate(_CODE_LINE_ESCAPES)
            lines[i] = "<br>" + line
    return "".join(lines)
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token is a stop id."""

    def __init__(self, token_ids):
        # Ids that end generation when one appears as the latest token.
        self.token_ids = token_ids

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True if the last token of the first sequence is a stop id.

        Only ``input_ids[0][-1]`` is inspected; ``scores`` and any extra
        keyword arguments are ignored.
        """
        return any(input_ids[0][-1] == stop_id for stop_id in self.token_ids)
if __name__ == "__main__":
    # Interactive console chat: load the model through optimum-intel, then
    # loop reading user input and streaming the reply to stdout as it is
    # generated. Typing "stop" exits; typing "clear" resets the history.

    ###### Parameters ######
    # Raw string: '\A', '\o' and '\g' are not valid escape sequences, so the
    # non-raw form only works by accident and triggers a SyntaxWarning on
    # Python 3.12+. The resulting path value is unchanged.
    # NOTE(review): this is the original checkpoint directory, not the
    # '...-ov' directory the conversion script writes to; confirm which one
    # is meant to be loaded here.
    model_dir = r'D:\AI\openvino-LLM\glm-4-9b-chat-1m'
    device = 'CPU'  # device name passed to from_pretrained, e.g. 'CPU' or 'GPU'
    max_new_tokens = 10000  # upper bound on generated tokens per reply
    ###### Parameters ######

    ov_config = {"PERFORMANCE_HINT": "LATENCY",
                 "NUM_STREAMS": "1", "CACHE_DIR": ""}

    tokenizer = AutoTokenizer.from_pretrained(
        model_dir, trust_remote_code=True)

    print("====Compiling model====")
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )

    # Iterating the streamer yields decoded text chunks; it waits at most
    # 60 seconds for the next chunk.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )
    # NOTE(review): ids 0 and 2 are hard-coded; confirm they match this
    # tokenizer's end-of-sequence ids.
    stop_token_ids = [0, 2]
    stop_criteria = [StopOnTokens(stop_token_ids)]

    def convert_history_to_token(history: List[Tuple[str, str]]):
        """Turn (user, assistant) history pairs into model input ids.

        The final pair is expected to have an empty assistant message; it
        contributes only its user message. Empty messages in earlier pairs
        are skipped. The chat template is applied with a generation prompt
        and the result is returned as a PyTorch tensor.
        """
        messages = []
        for idx, (user_msg, model_msg) in enumerate(history):
            if idx == len(history) - 1 and not model_msg:
                messages.append({"role": "user", "content": user_msg})
                break
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if model_msg:
                messages.append({"role": "assistant", "content": model_msg})

        model_inputs = tokenizer.apply_chat_template(messages,
                                                     add_generation_prompt=True,
                                                     tokenize=True,
                                                     return_tensors="pt")
        return model_inputs

    history = []
    print("====Starting conversation====")
    while True:
        input_text = input("用户: ")
        if input_text.lower() == 'stop':
            break

        if input_text.lower() == 'clear':
            history = []
            print("AI助手: 对话历史已清空")
            continue

        print("glm4-9b:", end=" ")
        # Append the new turn with an empty reply slot, filled in below.
        history = history + [[parse_text(input_text), ""]]
        model_inputs = convert_history_to_token(history)
        generate_kwargs = dict(
            input_ids=model_inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=True,
            top_p=1.0,
            top_k=50,
            repetition_penalty=1.1,
            streamer=streamer,
            stopping_criteria=StoppingCriteriaList(stop_criteria)
        )

        # Generate on a worker thread so the main thread can print chunks
        # as the streamer produces them.
        t1 = Thread(target=ov_model.generate, kwargs=generate_kwargs)
        t1.start()

        partial_text = ""
        for new_text in streamer:
            # flush=True so each chunk appears immediately.
            print(new_text, end="", flush=True)
            partial_text += new_text
        # Wait for the worker to finish before starting the next turn.
        t1.join()
        print("\n")
        history[-1][1] = partial_text
码字不易,都看到这了点个赞吧。