This repository has been archived by the owner on Sep 4, 2023. It is now read-only.

Commit

Moved to llm-rs
LLukas22 committed May 8, 2023
1 parent 31f2d95 commit 7e39ebb
Showing 5 changed files with 19 additions and 15 deletions.
16 changes: 9 additions & 7 deletions src/api/chat_models.py
@@ -2,8 +2,8 @@
 import openai
 from transformers.generation.stopping_criteria import StoppingCriteriaList
 from transformers import AutoModel,AutoTokenizer,AutoModelForCausalLM,GenerationConfig,LlamaTokenizer,LlamaForCausalLM
-from llama_rs_python import Model,SessionConfig,Precision
-from llama_rs_python import GenerationConfig as RSGenerationConfig
+from llm_rs import Llama,SessionConfig,Precision
+from llm_rs import GenerationConfig as RSGenerationConfig
 from huggingface_hub import hf_hub_download
 import torch
 from abc import ABC, abstractmethod
@@ -241,25 +241,26 @@ def generate_streaming(self,messages:List[ChatMessage],generationConfig:Generati
             self.stop_reason="Max Tokens!"

 class Cpu_Adapter(ModelAdapter):
-    def __init__(self,hf_token:str=None,repository:str="Sosaka/Alpaca-native-4bit-ggml",filename:str="ggml-alpaca-7b-q4.bin",max_length:int=2048,threads:int=8,kv_16:bool=True) -> None:
+    def __init__(self,hf_token:str=None,repository:str="Sosaka/Alpaca-native-4bit-ggml",filename:str="ggml-alpaca-7b-q4.bin",max_length:int=2048,threads:int=8,kv_16:bool=True,mmap:bool=True) -> None:
         self.max_length = max_length
         self.threads=threads
         self.hf_token=hf_token
         self.repository=repository
         self.filename = filename
         self.kv_16=kv_16
+        self.mmap=mmap

     def info(self)->ModelInfo:
-        return ModelInfo(name="llama-rs",model=self.repository, accelerator="CPU")
+        return ModelInfo(name="llm-rs",model=self.repository, accelerator="CPU")

     def default_config(self)->GenerationConfig:
         return GenerationConfig(top_p=0.9,top_k=40,temperature=0.8,repetition_penalty=1.1,max_new_tokens=256)

     def load(self):
         self.ggjt_model = hf_hub_download(repo_id=self.repository, filename=self.filename,token=self.hf_token)
         precision = Precision.FP16 if self.kv_16 else Precision.FP32
-        self.session_config = SessionConfig(threads=self.threads,context_length=self.max_length,keys_memory_type=precision,values_memory_type=precision)
-        self.model = Model(str(self.ggjt_model),session_config=self.session_config,verbose=True)
+        self.session_config = SessionConfig(threads=self.threads,context_length=self.max_length,keys_memory_type=precision,values_memory_type=precision,prefer_mmap=self.mmap)
+        self.model = Llama(str(self.ggjt_model),session_config=self.session_config,verbose=True)


     def _hf_to_rs_config(self,generationConfig:GenerationConfig)->RSGenerationConfig:
@@ -306,7 +307,8 @@ def adapter_factory(configuration:Configuration)->ModelAdapter:
             repository=configuration["cpu_model_repo"],
             filename=configuration["cpu_model_filename"],
             max_length=configuration["chat_max_length"],
-            kv_16=configuration["cpu_model_kv_16"]
+            kv_16=configuration["cpu_model_kv_16"],
+            mmap=configuration["cpu_model_mmap"]
         )
     else:
         raise Exception("Unknown model type: " + model_to_use)
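For reference, the CPU path after this change boils down to the following standalone sketch of the llm-rs 0.1.1 calls the adapter now wraps. The SessionConfig and Llama arguments mirror the diff above and the new defaults from main.py; the final generate() call and its result handling are not part of this commit and are assumed here for illustration.

from huggingface_hub import hf_hub_download
from llm_rs import Llama, SessionConfig, Precision
from llm_rs import GenerationConfig as RSGenerationConfig

# Download the ggjt weights, as Cpu_Adapter.load() does (repo/filename are the new defaults).
model_path = hf_hub_download(repo_id="LLukas22/alpaca-native-7B-4bit-ggjt", filename="ggjt-model.bin")

# Same session settings the adapter builds: FP16 KV cache, mmap-backed loading.
session_config = SessionConfig(
    threads=8,
    context_length=2048,
    keys_memory_type=Precision.FP16,
    values_memory_type=Precision.FP16,
    prefer_mmap=True,
)
model = Llama(str(model_path), session_config=session_config, verbose=True)

# Sampling parameters taken from default_config().
config = RSGenerationConfig(top_p=0.9, top_k=40, temperature=0.8,
                            repetition_penalty=1.1, max_new_tokens=256)

# Assumed llm-rs call; the commit only shows the config translation, not the generate step.
result = model.generate("Explain what a ggjt file is.", generation_config=config)
print(result.text)
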
5 changes: 3 additions & 2 deletions src/api/main.py
@@ -99,10 +99,11 @@ def App(
     container.config.chat_apply_optimizations.from_env("ADAPTER_APPLY_OPTIMIZATIONS",as_=parse_bool,default=True)

     #CPU Vars
-    container.config.cpu_model_repo.from_env("CPU_MODEL_REPO",default="Sosaka/Alpaca-native-4bit-ggml")
-    container.config.cpu_model_filename.from_env("CPU_MODEL_FILENAME",default="ggml-alpaca-7b-q4.bin")
+    container.config.cpu_model_repo.from_env("CPU_MODEL_REPO",default="LLukas22/alpaca-native-7B-4bit-ggjt")
+    container.config.cpu_model_filename.from_env("CPU_MODEL_FILENAME",default="ggjt-model.bin")
     container.config.cpu_model_threads.from_env("CPU_MODEL_THREADS",as_=int,default=8)
     container.config.cpu_model_kv_16.from_env("CPU_MODEL_KV_16",as_=parse_bool,default=True)
+    container.config.cpu_model_mmap.from_env("CPU_MODEL_MMAP",as_=parse_bool,default=True)
     container.wire(modules=[__name__])


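Overriding the new defaults therefore only requires setting the CPU_* environment variables before App() builds the container; a minimal sketch, with illustrative values, and assuming parse_bool accepts "true"/"false" strings:

import os

# Picked up by the from_env() calls above when App() runs.
os.environ["CPU_MODEL_REPO"] = "LLukas22/alpaca-native-7B-4bit-ggjt"
os.environ["CPU_MODEL_FILENAME"] = "ggjt-model.bin"
os.environ["CPU_MODEL_THREADS"] = "12"
os.environ["CPU_MODEL_KV_16"] = "true"
os.environ["CPU_MODEL_MMAP"] = "false"   # disable mmap-backed loading
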
8 changes: 4 additions & 4 deletions src/api/model_utils.py
@@ -10,8 +10,8 @@
 from typing import List,Optional
 import torch

-from llama_rs_python import Model,SessionConfig,Precision
-from llama_rs_python import GenerationConfig as RSGenerationConfig
+from llm_rs import Llama,SessionConfig,Precision
+from llm_rs import GenerationConfig as RSGenerationConfig

 class ManualStopCondition(StoppingCriteria):
     """
@@ -125,7 +125,7 @@ def __iter__(self)->str:


 class CPUStreamer():
-    def __init__(self,model:Model,config:RSGenerationConfig,prompt:str,stop_words:List[str]=[]) -> None:
+    def __init__(self,model:Llama,config:RSGenerationConfig,prompt:str,stop_words:List[str]=[]) -> None:
         self.model = model
         self.config = config
         self.prompt = prompt
@@ -147,7 +147,7 @@ def _callback(self,token:str)->Optional[bool]:

         self.generated_tokens.put(token)

-    def start(self):
+    def start(self):
         self.thread.start()

     def __iter__(self):
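The CPUStreamer changes above only swap the type hint from Model to Llama; for context, the streaming pattern the class implements looks roughly like the sketch below. The model.generate(..., callback=...) signature is an assumption based on llm-rs 0.1.x and is not shown in this commit; the queue, worker thread, and Optional[bool] callback mirror the class.

import threading
from queue import Queue
from typing import Iterator, Optional

def stream_tokens(model, config, prompt: str) -> Iterator[str]:
    """Rough re-creation of the CPUStreamer pattern: generation runs on a
    worker thread and each token is pushed through a queue as it arrives."""
    tokens: Queue = Queue()
    done = object()  # sentinel marking the end of generation

    def callback(token: str) -> Optional[bool]:
        tokens.put(token)
        return None  # a truthy return is assumed to tell llm-rs to stop early

    def worker():
        # Assumed llm-rs call; the diff only shows the callback/queue side.
        model.generate(prompt, generation_config=config, callback=callback)
        tokens.put(done)

    threading.Thread(target=worker, daemon=True).start()
    while (token := tokens.get()) is not done:
        yield token
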
3 changes: 2 additions & 1 deletion src/api/requirements-cpu.txt
@@ -3,7 +3,8 @@ huggingface_hub
 llama-rs-python==0.0.2
 openai
 pynvml
-farm-haystack
+farm-haystack[elasticsearch]
+llm-rs==0.1.1

 #api dependencies
 psutil
2 changes: 1 addition & 1 deletion src/api/requirements.txt
@@ -49,7 +49,7 @@ bitsandbytes==0.37.2
 sentence-transformers>=2.2.0
 huggingface_hub
 transformers==4.28.1
-llama-rs-python==0.0.2
+llm-rs==0.1.1

 #api dependencies
 psutil
