Scaling Audio-Text Retrieval with Multimodal Large Language Models
Paper • 2602.18010 • Published
YAML Metadata Warning:empty or missing yaml metadata in repo card
Check out the documentation for more information.
This repo contains the checkpoint of the following paper:
Scaling Audio-Text Retrieval with Multimodal Large Language Models
This is a CLAP model trained on our AudioVerse dataset; it shows better generalization ability than the original LAION-CLAP.
Try the model to extract audio and text features.
import os
import re
import librosa
import torch
import torch.nn.functional as F
from transformers import AutoModel
def text_preprocess(sentence):
    """Normalize a caption before it is passed to the text encoder.

    Steps, in order:
      1. Lower-case the text.
      2. Drop a whitespace character that sits directly before one of
         ``, . ! ? ; : "`` when that mark is followed by whitespace or the
         end of the string.
      3. Replace each of ``( , . ! ? ; : | * " )`` with a space.
    After steps 2 and 3, every two-space pair is replaced by one space.

    Args:
        sentence: Raw caption / query string.

    Returns:
        The normalized string. Only a single ``str.replace`` pass is made per
        step, so runs of three or more spaces are shortened but not
        necessarily reduced to one, and leading/trailing spaces are kept.
    """
    sentence = sentence.lower()
    # Remove a stray space before punctuation, then squeeze double spaces.
    # The replace must target TWO spaces; replacing " " with " " is a no-op.
    sentence = re.sub(r'\s([,.!?;:"](?:\s|$))', r"\1", sentence).replace("  ", " ")
    # Punctuation becomes a space, which can create a double space again.
    sentence = re.sub(r'[(,.!?;:|*\")]', " ", sentence).replace("  ", " ")
    return sentence
def load_audio(path, sr=32000, max_len_sec=10.0):
    """Load an audio file and return it as a torch tensor, centre-cropped.

    Args:
        path: Location of the audio file; forwarded to ``librosa.load``.
        sr: Sample rate forwarded to ``librosa.load`` (with ``mono=True``).
        max_len_sec: Maximum clip length in seconds. A falsy value
            (``None`` or ``0``) disables cropping.

    Returns:
        ``torch.from_numpy`` of the loaded samples. If the clip holds more
        than ``max_len_sec * sr`` samples, only the middle
        ``int(max_len_sec * sr)`` samples are kept.
    """
    samples, _ = librosa.load(path, sr=sr, mono=True)
    total = samples.shape[-1]

    # Short clips (or cropping disabled) are returned untouched.
    needs_crop = bool(max_len_sec) and total > max_len_sec * sr
    if not needs_crop:
        return torch.from_numpy(samples)

    # Keep a window centred on the middle of the recording.
    keep = int(max_len_sec * sr)
    start = (total - keep) // 2
    return torch.from_numpy(samples[start:start + keep])
def batch_audio(wavs):
    """Right-pad waveforms to a common length and stack them into one batch.

    Args:
        wavs: Iterable of tensors whose last dimension is the sample axis.
            Any iterable is accepted, including a one-shot generator.

    Returns:
        A tensor with a new leading batch dimension. Every item is padded at
        the end of its last dimension (``F.pad`` default constant padding)
        up to the longest item's length.

    Raises:
        ValueError: If ``wavs`` yields no items.
    """
    # Materialize once: the items are traversed twice below (max, then pad),
    # which would silently exhaust a generator after the first pass.
    wavs = list(wavs)
    if not wavs:
        raise ValueError("batch_audio() requires at least one waveform")
    max_len = max(w.shape[-1] for w in wavs)
    return torch.stack([F.pad(w, [0, max_len - w.shape[-1]]) for w in wavs], dim=0)
# 1) Load model (single from_pretrained call, same style as Qwen2.5-Omni)
model_path = "Jazzcharles/clap-audioverse"  # HF repo id, or a local checkpoint directory
device = "cuda" if torch.cuda.is_available() else "cpu"
# trust_remote_code=True executes the modeling code shipped in the model repo.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(device).eval()
sr = model.config.sr  # sample rate read from the model config; audio is loaded at this rate

# 2) Prepare retrieval inputs
# These are example paths from the authors' machine -- replace with your own files.
# The two lists do not need to be the same length.
audio_files = [
    "/home/jilan_xu/data/audiocaps/audiocaps_for_wavcaps/Y-R69Fa-mCaY.wav",
    "/home/jilan_xu/data/audiocaps/audiocaps_for_wavcaps/YKvrcRMfFzOE.wav",
    "/home/jilan_xu/data/audiocaps/audiocaps_for_wavcaps/Y7fmOlUlwoNg.wav",
]
text_queries = [
    "A chainsaw cutting as wood is cracking",
    "An engine running and helicopter propellers spinning",
    "Constant rattling noise and sharp vibrations",
]

# 3) Pre-process: load + pad audio into one batch, normalize the query strings
audio_inputs = batch_audio([load_audio(p, sr=sr) for p in audio_files]).to(device)
text_inputs = [text_preprocess(t) for t in text_queries]

# 4) Forward and extract features
with torch.inference_mode():
    # Per the model card both outputs are L2-normalized -- confirm against the remote code.
    audio_feat = model.encode_audio(audio_inputs)  # [N_audio, D]
    text_feat = model.encode_text(text_inputs)  # [N_text, D]

# 5) Similarity + top-k retrieval
score = text_feat @ audio_feat.T  # [N_text, N_audio]
print(score.shape, score)

# Rank the audio clips for every text query (k is capped by the number of clips).
top_k = min(3, score.shape[-1])
top_scores, top_idx = score.topk(top_k, dim=-1)
for query, idxs, vals in zip(text_queries, top_idx.tolist(), top_scores.tolist()):
    ranked = ", ".join(
        f"{os.path.basename(audio_files[i])} ({v:.3f})" for i, v in zip(idxs, vals)
    )
    print(f"{query!r} -> {ranked}")
If you find our work helpful, please cite our paper.
@misc{xu2026scalingaudiotextretrievalmultimodal,
title={Scaling Audio-Text Retrieval with Multimodal Large Language Models},
author={Jilan Xu and Carl Thomé and Danijela Horak and Weidi Xie and Andrew Zisserman},
year={2026},
eprint={2602.18010},
archivePrefix={arXiv},
primaryClass={cs.SD},
url={https://arxiv.org/abs/2602.18010},
}