Code on the site
I've added code highlighting with pygments and want to see how code blocks work out and look. That spread into further site design work: a proper light/dark mode based on system preferences, layout adjustments, and typography tweaks.
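For the curious, highlighting with pygments boils down to rendering each block to HTML and emitting one stylesheet per theme. This is only a rough sketch of that idea, not necessarily how this site wires it up; the style names and CSS class are arbitrary choices.

from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import PythonLexer

# Render a snippet to HTML with span-wrapped, CSS-classed tokens.
formatter = HtmlFormatter(cssclass="highlight")
html = highlight('print("hello")', PythonLexer(), formatter)

# One stylesheet per theme; the dark one can be scoped under a
# prefers-color-scheme: dark media query in the site's CSS.
light_css = HtmlFormatter(style="default", cssclass="highlight").get_style_defs(".highlight")
dark_css = HtmlFormatter(style="monokai", cssclass="highlight").get_style_defs(".highlight")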
To see how code looks, and to show a bit of what I'm doing, here is a basic typer app that uses IBM's Granite. The LLM needs to be downloaded and converted into MLX format before this script can use it. It has basic memory, implemented by simply appending messages to the current prompt.
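The download-and-convert step can be done with mlx_lm itself. A minimal sketch, assuming a recent mlx_lm and that the Hugging Face repo id below is the checkpoint you want (substitute whichever Granite build you actually use):

from mlx_lm import convert

convert(
    "ibm-granite/granite-3.3-2b-instruct",  # assumed HF repo id
    mlx_path="./granite-3.3-2b-mlx",        # matches the script's default --model path
    quantize=True,                          # optional 4-bit quantization
)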
Granite can also use tools; I'm experimenting with a more advanced version of this script that does (there's a small sketch of the idea at the end of this post).
Here is the code. One thing to note when using MLX models is that temperature, top-k/top-p, repetition penalty, and so on are handled a bit differently than in other runners: you build a sampler and a list of logits processors and hand those to generate() rather than passing the values directly. See _generate_response below for an example, and this official MLX GitHub example.
from pathlib import Path
import typer
from mlx_lm import generate, load, sample_utils
class GraniteChat:
    """Minimal chat wrapper around an MLX-converted Granite model."""
def __init__(self, model_path: str):
self.model_path = model_path
self.model = None
self.tokenizer = None
self.messages = []
self.system_prompt = self._build_system_prompt()
def _build_system_prompt(self) -> str:
        return "You are Granite, a helpful AI assistant. Respond naturally and conversationally to user messages."
def load_model(self):
print("Loading model...")
try:
self.model, self.tokenizer = load(self.model_path)
print("Model loaded successfully")
except Exception as e:
print(f"Failed to load model: {e}")
            raise typer.Exit(code=1)
    def _build_prompt(self) -> str:
        # Prepend the system prompt and render everything with the model's chat template.
chat_messages = [{"role": "system", "content": self.system_prompt}] + self.messages
return self.tokenizer.apply_chat_template(
chat_messages,
tokenize=False,
add_generation_prompt=True
)
def _generate_response(self, max_tokens: int = 1000) -> str:
prompt = self._build_prompt()
try:
# Create sampler and logits processors
sampler = sample_utils.make_sampler(temp=0.1, top_p=0.9)
logits_processors = sample_utils.make_logits_processors(
repetition_penalty=1.1
)
response = generate(
model=self.model,
tokenizer=self.tokenizer,
prompt=prompt,
sampler=sampler,
logits_processors=logits_processors,
max_tokens=max_tokens
)
# Remove the prompt from the response
if prompt in response:
response = response[len(prompt):].strip()
return response
        except Exception as e:
            print(f"\nGeneration error: {e}")
            return "Sorry, I encountered an error generating a response."
def chat_loop(self):
print("Granite 3.3 2B CLI")
print("Type 'exit' or 'quit' to end")
try:
while True:
user_input = input("\nYou: ").strip()
if user_input.lower() in ['exit', 'quit']:
break
if not user_input:
continue
self.messages.append({"role": "user", "content": user_input})
response = self._generate_response()
self.messages.append({"role": "assistant", "content": response})
print(f"\nGranite: {response}")
except KeyboardInterrupt:
print("\nGoodbye!")
app = typer.Typer()
@app.command()
def main(
model_path: str = typer.Option(
"./granite-3.3-2b-mlx",
"--model", "-m",
help="Path to the MLX-converted Granite model"
)
):
"""Start interactive chat with Granite AI"""
if not Path(model_path).exists():
print(f"Model path not found: {model_path}")
        raise typer.Exit(code=1)
granite_chat = GraniteChat(model_path)
granite_chat.load_model()
granite_chat.chat_loop()
if __name__ == "__main__":
app()
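To try it, assuming the script is saved as granite_chat.py and the model has been converted into ./granite-3.3-2b-mlx:

python granite_chat.py --model ./granite-3.3-2b-mlx

As for tools, the rough idea in the more advanced version is to pass tool definitions through the same chat template. This is only a hedged sketch: it assumes the Granite chat template (via a recent transformers) accepts a tools argument to apply_chat_template, and get_weather is a hypothetical tool, not something from the script above.

from mlx_lm import load

# Reuse the converted model's tokenizer; the path matches the script's default.
_, tokenizer = load("./granite-3.3-2b-mlx")

# Hypothetical tool definition in the JSON-schema style chat templates expect.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

messages = [{"role": "user", "content": "What's the weather in Winnipeg?"}]

# Assumption: the chat template accepts a `tools` argument.
prompt = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    tokenize=False,
    add_generation_prompt=True,
)
# The model would then reply with a structured tool call, which the script
# needs to parse, execute, and feed back as a tool-role message.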