Text Generation
Transformers
Safetensors
PyTorch
mistral
smarter
code
chemistry
biology
finance
legal
art
Mixture of Experts
Merge
text-generation-inference
music
climate
medical
673_trillion_parameters
Instructions to use ZeppelinCorp/Charm_15 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ZeppelinCorp/Charm_15 with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="ZeppelinCorp/Charm_15")# Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("ZeppelinCorp/Charm_15") model = AutoModelForCausalLM.from_pretrained("ZeppelinCorp/Charm_15") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use ZeppelinCorp/Charm_15 with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "ZeppelinCorp/Charm_15" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "ZeppelinCorp/Charm_15", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/ZeppelinCorp/Charm_15
- SGLang
How to use ZeppelinCorp/Charm_15 with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "ZeppelinCorp/Charm_15" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "ZeppelinCorp/Charm_15", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "ZeppelinCorp/Charm_15" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "ZeppelinCorp/Charm_15", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use ZeppelinCorp/Charm_15 with Docker Model Runner:
docker model run hf.co/ZeppelinCorp/Charm_15
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from typing import Optional, Dict, Any | |
| import logging | |
| import asyncio | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") | |
| logger = logging.getLogger(__name__) | |
| class Charm15Chatbot: | |
| def __init__( | |
| self, | |
| model_path: str, | |
| device: Optional[str] = None, | |
| tokenizer_kwargs: Optional[Dict[str, Any]] = None, | |
| model_kwargs: Optional[Dict[str, Any]] = None, | |
| ): | |
| """ | |
| Initialize the chatbot. | |
| Args: | |
| model_path (str): Path or name of the pre-trained model. | |
| device (str, optional): Device to run the model on (e.g., "cuda" or "cpu"). Defaults to "cuda" if available. | |
| tokenizer_kwargs (dict, optional): Additional arguments for the tokenizer. | |
| model_kwargs (dict, optional): Additional arguments for the model. | |
| """ | |
| self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") | |
| self.tokenizer_kwargs = tokenizer_kwargs or {} | |
| self.model_kwargs = model_kwargs or {} | |
| # Load tokenizer and model | |
| logger.info(f"Loading model and tokenizer from {model_path}...") | |
| self.tokenizer = AutoTokenizer.from_pretrained(model_path, **self.tokenizer_kwargs) | |
| self.model = AutoModelForCausalLM.from_pretrained(model_path, **self.model_kwargs).to(self.device) | |
| self.model.eval() | |
| logger.info("Model and tokenizer loaded successfully.") | |
| def generate_response( | |
| self, | |
| input_text: str, | |
| max_length: int = 512, | |
| temperature: float = 0.7, | |
| top_p: float = 0.9, | |
| top_k: Optional[int] = None, | |
| repetition_penalty: float = 1.0, | |
| **kwargs, | |
| ) -> str: | |
| """ | |
| Generate a response to the input text. | |
| Args: | |
| input_text (str): The input prompt. | |
| max_length (int): Maximum length of the generated text. | |
| temperature (float): Sampling temperature (higher = more random). | |
| top_p (float): Top-p (nucleus) sampling. | |
| top_k (int): Top-k sampling. | |
| repetition_penalty (float): Penalty for repeating tokens. | |
| **kwargs: Additional arguments for model.generate(). | |
| Returns: | |
| str: The generated response. | |
| """ | |
| try: | |
| inputs = self.tokenizer( | |
| input_text, | |
| return_tensors="pt", | |
| truncation=True, | |
| max_length=1024, | |
| ).to(self.device) | |
| with torch.no_grad(): | |
| output = self.model.generate( | |
| **inputs, | |
| max_length=max_length, | |
| temperature=temperature, | |
| top_p=top_p, | |
| top_k=top_k, | |
| repetition_penalty=repetition_penalty, | |
| pad_token_id=self.tokenizer.eos_token_id, | |
| **kwargs, | |
| ) | |
| response = self.tokenizer.decode(output[0], skip_special_tokens=True) | |
| logger.info("Response generated successfully.") | |
| return response | |
| except Exception as e: | |
| logger.error(f"Error generating response: {e}") | |
| raise | |
| async def async_generate( | |
| self, | |
| input_text: str, | |
| max_length: int = 512, | |
| temperature: float = 0.7, | |
| top_p: float = 0.9, | |
| top_k: Optional[int] = None, | |
| repetition_penalty: float = 1.0, | |
| **kwargs, | |
| ) -> str: | |
| """ | |
| Asynchronously generate a response to the input text. | |
| Args: | |
| input_text (str): The input prompt. | |
| max_length (int): Maximum length of the generated text. | |
| temperature (float): Sampling temperature (higher = more random). | |
| top_p (float): Top-p (nucleus) sampling. | |
| top_k (int): Top-k sampling. | |
| repetition_penalty (float): Penalty for repeating tokens. | |
| **kwargs: Additional arguments for model.generate(). | |
| Returns: | |
| str: The generated response. | |
| """ | |
| return await asyncio.to_thread( | |
| self.generate_response, | |
| input_text, | |
| max_length=max_length, | |
| temperature=temperature, | |
| top_p=top_p, | |
| top_k=top_k, | |
| repetition_penalty=repetition_penalty, | |
| **kwargs, | |
| ) |