Skip to content
Merged

Dev #24

Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
86 commits
Select commit Hold shift + click to select a range
87744d2
[update] README.md add model list
dianjixz May 12, 2025
7d392a3
Merge branch 'dev' of github.com:m5stack/StackFlow into dev
dianjixz May 12, 2025
10e4bdf
Refactor SOLA component code
yuyun2000 May 15, 2025
ebf908a
Merge branch 'dev' into opt/melotts
yuyun2000 May 15, 2025
6a96f35
Merge pull request #1 from yuyun2000/opt/melotts
yuyun2000 May 15, 2025
74c41a3
Add text normalization for Chinese, Japanese, and English
yuyun2000 May 16, 2025
0619178
Merge pull request #2 from yuyun2000/opt/melotts
yuyun2000 May 16, 2025
e479b19
Merge pull request #18 from yuyun2000/dev
Abandon-ht May 16, 2025
9a20dd0
[update] update melotts, update static_lib verison
May 16, 2025
daeaf4b
[update] update lib-llm version, update melotts model version.
May 16, 2025
00c0533
[update] update libonnxruntime.so
May 16, 2025
f775786
[update] add en-au, en-br, en-india, en-us model. Format code.
May 20, 2025
8acb179
[fix] Handles the situation where Either tagger or verbalizer file do…
May 20, 2025
f67506c
[update] update melotts-es-es model
May 20, 2025
764bca1
[update] update model list
May 20, 2025
aa10381
add trigger method to llm_kws
nyasu3w Jun 3, 2025
2f63527
Merge pull request #21 from nyasu3w/pr/trigger_kws
dianjixz Jun 5, 2025
b9401b2
[update] llm trigger Standardization.
dianjixz Jun 5, 2025
61c69a3
[update] update docs
Jun 10, 2025
2d0cd69
[update] vlm add task_camera_data
Jun 18, 2025
357a6f1
[update] llm-camera axera camera add custom_config
dianjixz Jun 23, 2025
43906d9
Merge branch 'dev' of github.com:m5stack/StackFlow into dev
dianjixz Jun 23, 2025
57b1437
[update] depth_anything use async inference, move ax_engine init.fix …
Jun 24, 2025
b7e62dd
[update] update llm-depth-anything version, llm-yolo version. fix lib…
Jun 25, 2025
592fd9e
[update] update llm-camera version
Jun 25, 2025
cccddd2
[update] update llm-vlm version
Jun 26, 2025
b0743f0
[update] update model list & add npu1 model.
Jun 27, 2025
629e822
[update] update docs
Jun 27, 2025
d29e074
[update] update ax650 model config, melotts model.
Jun 27, 2025
90fae78
[update] main_audio add 630c kit default param && StackFlow add send_…
dianjixz Jul 1, 2025
5995886
[update] main_audio add tinyalsa API cap function.
Jul 1, 2025
9abe069
[update] KWS sets multiple keywords, fix melotts
Jul 18, 2025
c25b4f0
[fix] Fix caching causing audio issues
Jul 23, 2025
cfbfd62
[update] update docs
Aug 14, 2025
a916ca0
[update] Reduce buffer frames
Aug 21, 2025
73c4a49
[update] ModuleLLM support ctx model, add HomeAssistant model, add mo…
Aug 22, 2025
9167b6e
[update] update llm_vlm encoder. update audio cache.
Aug 26, 2025
9a14d45
[update] support ax650. add ax650 model.
Aug 27, 2025
9d816fe
[update] ensure that a frame is written
Aug 28, 2025
92b10ac
[update] add internvl3-1B-ax630c model update main_vlm
Aug 29, 2025
57404bc
[update] add internvl3-1B config file, update postprocess.
Aug 29, 2025
2de874c
[update] update llm & vlm
Sep 3, 2025
b6d6e95
[update] move public include into static_lib, update llm & vlm
Sep 4, 2025
1df8ab9
[update] update model list
Sep 4, 2025
e628093
[update] update asr kws llm vlm vad whisper melotts version
Sep 4, 2025
a7d82af
[fix] fix alsa audio cap
Sep 8, 2025
bb48236
[update] add cosyvioce2
Sep 15, 2025
2d064fd
[update] update cosy_voice
Sep 15, 2025
52a09b6
[update] add new kws unit
Sep 17, 2025
01d6715
[update] update cosy_voice & new kws
Sep 23, 2025
2a5c139
update static version
Abandon-ht Sep 23, 2025
d489723
[update] clean code
Sep 25, 2025
27a16a4
[update] llm-openai-version fix kws
Sep 28, 2025
7a97143
[update] update sdk version & chip name
Sep 29, 2025
0e14999
[fix] Fix inference issues caused by memory synchronization
Oct 16, 2025
f9de469
[update] update CosyVoice2
Nov 3, 2025
4e3d7f3
[update] fix llm generate bug
Nov 4, 2025
a3d0913
[update] update model config
Nov 5, 2025
423427d
[update] update model ctx len
Nov 5, 2025
3a16259
[fix] pzmq close wait
dianjixz Nov 7, 2025
07c964c
[update] update software version
Nov 7, 2025
bd14152
[update] update ax_msp kconfig bsp version,pzmq add NotAction dec,sta…
dianjixz Nov 10, 2025
6d7ae90
[add] ax650_ec_proxy
dianjixz Nov 10, 2025
0d3e36f
[update] vlm support qwen3-vl model, add qwen3-vl-2b model. update pz…
Nov 14, 2025
324f04d
[update] update llm-vlm version & model config
Nov 21, 2025
bd4c03e
[fix] Fix cosyvoice Deinit bug
Nov 21, 2025
9e70ce8
[update] update llm-llm llm-cosyvoice version
Nov 21, 2025
8712328
[update] Add qwen3-vl-2B-Init4-ax630c model
Nov 26, 2025
cc1087f
[update] fix postprocess Div zero bug, update llm-openai-api, update …
Nov 27, 2025
4bc10a7
[fix] pzmq creat error
dianjixz Dec 3, 2025
f12314e
[update] del ec_prox
dianjixz Dec 3, 2025
e96fcf4
[update] llm_asr suooported sensevoice, update llm_audio supported al…
Dec 9, 2025
a674665
[update] kws supported custom 'hi m5' keywords
Dec 9, 2025
cc9d1bc
[update] perf llm backend & add c tokenizer
Dec 18, 2025
cde5921
[update] add legacy llm backend
Dec 18, 2025
50e3609
[update] Reduce model loading time. Optimize model loading method
Dec 18, 2025
ea7ddd0
[update] Add CosyVoice tokenizer server timeout
Dec 18, 2025
d5685d4
[update] kws support axmodel
Dec 18, 2025
bea45ab
[update] llm_asr supported zipformer stream model.
Dec 18, 2025
3f608d2
[update] add asr, kws model config
Dec 19, 2025
eff3a47
[update] perf llm-asr, kws add buttons control.
Dec 19, 2025
7b671f2
[update] update melotts play stop cap
Dec 22, 2025
3ab3d87
[update] update package version
Dec 26, 2025
9c7ba31
[update] update llm-asr & model config
Dec 26, 2025
312791e
[update] update llm-openai-api version
Dec 26, 2025
9f34887
[update] llm-model-audio version
Jan 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[update] perf llm-asr, kws add buttons control.
  • Loading branch information
LittleMouse committed Dec 19, 2025
commit eff3a479280ca07323f6b0ee039617abd163473d
74 changes: 67 additions & 7 deletions projects/llm_framework/main_asr/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,23 @@ class llm_task {
ENGINE_ONLINE = 3,
} engine_type_ = ENGINE_NCNN;

static constexpr int kSampleRate = 16000;
static constexpr int kFrameSamples = 160;
int pre_roll_frames_ = 30;
std::deque<int16_t> pre_roll_pcm_;
bool prev_vad_detected_ = false;

private:
void PushPreRollPcm(const int16_t *pcm, size_t n)
{
pre_roll_pcm_.insert(pre_roll_pcm_.end(), pcm, pcm + n);

const size_t max_samples = (size_t)pre_roll_frames_ * kFrameSamples;
while (pre_roll_pcm_.size() > max_samples) {
pre_roll_pcm_.pop_front();
}
}

public:
std::string model_;
std::string response_format_;
Expand Down Expand Up @@ -529,48 +546,86 @@ class llm_task {

void sys_pcm_on_data_onnx(const std::string &raw)
{
if (raw.size() >= sizeof(int16_t)) {
const int16_t *pcm16 = reinterpret_cast<const int16_t *>(raw.data());
size_t n16 = raw.size() / sizeof(int16_t);
PushPreRollPcm(pcm16, n16);
}

static int count = 0;
if (count < delay_audio_frame_) {
buffer_write_char(pcmdata, raw.data(), raw.length());
count++;
return;
}

buffer_write_char(pcmdata, raw.data(), raw.length());
buffer_position_set(pcmdata, 0);

std::vector<float> floatSamples;
floatSamples.reserve((delay_audio_frame_ + 1) * kFrameSamples);

int16_t audio_val;
while (buffer_read_i16(pcmdata, &audio_val, 1)) {
float normalizedSample = static_cast<float>(audio_val) / INT16_MAX;
floatSamples.push_back(normalizedSample);
floatSamples.push_back(static_cast<float>(audio_val) / 32768.0f);
}

buffer_resize(pcmdata, 0);
count = 0;

vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());

bool detected = vad_->IsSpeechDetected();
bool speech_start = (!prev_vad_detected_ && detected);
prev_vad_detected_ = detected;

while (!vad_->Empty()) {
const auto &segment = vad_->Front();
if (!offline_stream_) offline_stream_ = onnx_recognizer_->CreateStream();
offline_stream_->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
segment.samples.size());

if (!offline_stream_) {
offline_stream_ = onnx_recognizer_->CreateStream();
}

if (speech_start && !pre_roll_pcm_.empty()) {
std::vector<float> pre;
pre.reserve(pre_roll_pcm_.size());
for (int16_t s : pre_roll_pcm_) {
pre.push_back(static_cast<float>(s) / 32768.0f);
}

std::vector<float> merged;
merged.reserve(pre.size() + segment.samples.size());
merged.insert(merged.end(), pre.begin(), pre.end());
merged.insert(merged.end(), segment.samples.begin(), segment.samples.end());

offline_stream_->AcceptWaveform(kSampleRate, merged.data(), merged.size());

pre_roll_pcm_.clear();
speech_start = false;
} else {
offline_stream_->AcceptWaveform(kSampleRate, segment.samples.data(), segment.samples.size());
}

onnx_recognizer_->DecodeStream(offline_stream_.get());

const auto &result = offline_stream_->GetResult();
if (!result.text.empty() && out_callback_) {
out_callback_(result.text, true);
}

vad_->Pop();

offline_stream_.reset();
}

{
bool detected = vad_->IsSpeechDetected();
float chunk_ms = (delay_audio_frame_ + 1) * 10.0f;

if (detected) {
silence_ms_accum_ = 0.0f;
} else {
silence_ms_accum_ += chunk_ms;
}

if (silence_ms_accum_ >= silence_timeout) {
if (ensleep_) {
if (pause) pause();
Expand Down Expand Up @@ -1034,6 +1089,11 @@ class llm_asr : public StackFlow {
++it;
}
}

if (data.find("sys") != std::string::npos) {
llm_task_obj->audio_flage_ = false;
}

send("None", "None", LLM_NO_ERROR, work_id);
}

Expand Down
162 changes: 157 additions & 5 deletions projects/llm_framework/main_kws/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class llm_task {
int count_frames_ = 0;
long long last_trigger_time_ms_ = -1e9;
long long frame_index_global_ = 0;
int last_btn_204_state = -1;

public:
inline const std::string &model() const
Expand Down Expand Up @@ -294,9 +295,9 @@ class llm_task {
#undef CONFIG_AUTO_SET_SHERPA

#define CONFIG_AUTO_SET_AXERA(obj, key) \
if (config_body.contains(#key)) \
if (config_body.contains(#key)) \
axera_config_.key = config_body[#key]; \
else if (obj.contains(#key)) \
else if (obj.contains(#key)) \
axera_config_.key = obj[#key];

#define OPTS_AUTO_SET(obj, key) \
Expand Down Expand Up @@ -537,9 +538,27 @@ class llm_task {
}
}

void trigger()
void trigger_wakeup()
{
if (out_callback_) out_callback_("", true);
if (enwake_audio_ && (!wake_wav_file_.empty()) && play_awake_wav) {
play_awake_wav(wake_wav_file_);
}
if (out_callback_) {
if (enoutput_json_)
out_callback_("{\"reason\":\"button_204\"}", true);
else
out_callback_("", true);
}
}

    // Record the most recent "code" field reported for button id 204
    // (observed values in task_buttons_data: 0 and 1 — presumably
    // released/pressed; confirm against the event producer).
    // Used for rising-edge detection in task_buttons_data().
    void set_btn_204_state(int state)
    {
        last_btn_204_state = state;
    }

    // Last observed state of button 204, or -1 if no event has been seen yet
    // (the member's initial value).
    int get_btn_204_state()
    {
        return last_btn_204_state;
    }

bool delete_model()
Expand Down Expand Up @@ -790,6 +809,40 @@ class llm_kws : public StackFlow {
llm_task_obj->sys_pcm_on_data((*next_data));
}

void task_buttons_data(const std::weak_ptr<llm_task> llm_task_obj_weak,
const std::weak_ptr<llm_channel_obj> llm_channel_weak, const std::string &object,
const std::string &data)
{
auto llm_task_obj = llm_task_obj_weak.lock();
auto llm_channel = llm_channel_weak.lock();
if (!(llm_task_obj && llm_channel)) {
return;
}
if (data.empty() || (data == "None")) return;

try {
std::string user_msg = sample_unescapeString(data);
nlohmann::json btn_json = nlohmann::json::parse(user_msg);

if (btn_json.contains("code") && btn_json.contains("vale")) {
int current_code = btn_json["code"];
int current_vale = btn_json["vale"];

if (current_vale == 204) {
int last_code = llm_task_obj->get_btn_204_state();

if (last_code == 0 && current_code == 1) {
llm_task_obj->trigger_wakeup();
}

llm_task_obj->set_btn_204_state(current_code);
}
}
} catch (const std::exception &e) {
SLOGE("Button data JSON parse error: %s", e.what());
}
}

int setup(const std::string &work_id, const std::string &object, const std::string &data) override
{
nlohmann::json error_body;
Expand Down Expand Up @@ -836,6 +889,17 @@ class llm_kws : public StackFlow {
llm_channel->subscriber_work_id("", std::bind(&llm_kws::task_user_data, this, _llm_task_obj,
std::weak_ptr<llm_channel_obj>(llm_channel),
std::placeholders::_1, std::placeholders::_2));
} else if (input.find("buttons_thread") != std::string::npos) {
std::string socket_url = "ipc:///tmp/llm/ec_prox.event.socket";
auto business_logic = std::bind(
&llm_kws::task_buttons_data, this, std::weak_ptr<llm_task>(llm_task_obj),
std::weak_ptr<llm_channel_obj>(llm_channel), std::placeholders::_1, std::placeholders::_2);

llm_channel->subscriber(
socket_url, [llm_channel, business_logic](StackFlows::pzmq *p,
const std::shared_ptr<StackFlows::pzmq_data> &d) {
llm_channel->subscriber_event_call(business_logic, p, d);
});
}
}
llm_task_[work_id_num] = llm_task_obj;
Expand All @@ -851,6 +915,94 @@ class llm_kws : public StackFlow {
}
}

void link(const std::string &work_id, const std::string &object, const std::string &data) override
{
SLOGI("llm_kws::link:%s", data.c_str());
int ret = 0;
nlohmann::json error_body;

int work_id_num = sample_get_work_id_num(work_id);
if (llm_task_.find(work_id_num) == llm_task_.end()) {
error_body["code"] = -6;
error_body["message"] = "Unit Does Not Exist";
send("None", "None", error_body, work_id);
return;
}

auto llm_channel = get_channel(work_id);
auto llm_task_obj = llm_task_[work_id_num];

if (data.find("sys") != std::string::npos) {
if (audio_url_.empty()) audio_url_ = unit_call("audio", "cap", data);

std::weak_ptr<llm_task> _llm_task_obj = llm_task_obj;
llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr<pzmq_data> &raw) {
if (auto p = _llm_task_obj.lock()) p->sys_pcm_on_data(raw->string());
});

llm_task_obj->audio_flage_ = true;
llm_task_obj->inputs_.push_back(data);
} else if (data.find("buttons_thread") != std::string::npos) {
std::string socket_url = "ipc:///tmp/llm/ec_prox.event.socket";
auto business_logic =
std::bind(&llm_kws::task_buttons_data, this, std::weak_ptr<llm_task>(llm_task_obj),
std::weak_ptr<llm_channel_obj>(llm_channel), std::placeholders::_1, std::placeholders::_2);

llm_channel->subscriber(
socket_url,
[llm_channel, business_logic](StackFlows::pzmq *p, const std::shared_ptr<StackFlows::pzmq_data> &d) {
llm_channel->subscriber_event_call(business_logic, p, d);
});

llm_task_obj->inputs_.push_back(data);
} else {
error_body["code"] = -22;
error_body["message"] = "unsupported link target";
send("None", "None", error_body, work_id);
return;
}

if (ret) {
error_body["code"] = -20;
error_body["message"] = "link false";
send("None", "None", error_body, work_id);
return;
}
send("None", "None", LLM_NO_ERROR, work_id);
}

void unlink(const std::string &work_id, const std::string &object, const std::string &data) override
{
SLOGI("llm_kws::unlink:%s", data.c_str());
nlohmann::json error_body;

int work_id_num = sample_get_work_id_num(work_id);
if (llm_task_.find(work_id_num) == llm_task_.end()) {
error_body["code"] = -6;
error_body["message"] = "Unit Does Not Exist";
send("None", "None", error_body, work_id);
return;
}

auto llm_channel = get_channel(work_id);
auto llm_task_obj = llm_task_[work_id_num];

llm_channel->stop_subscriber_work_id(data);

for (auto it = llm_task_obj->inputs_.begin(); it != llm_task_obj->inputs_.end();) {
if (*it == data)
it = llm_task_obj->inputs_.erase(it);
else
++it;
}

if (data.find("sys") != std::string::npos) {
llm_task_obj->audio_flage_ = false;
}

send("None", "None", LLM_NO_ERROR, work_id);
}

void taskinfo(const std::string &work_id, const std::string &object, const std::string &data) override
{
SLOGI("llm_kws::taskinfo:%s", data.c_str());
Expand Down Expand Up @@ -937,7 +1089,7 @@ class llm_kws : public StackFlow {
_zmq.send_data(out);
return LLM_NONE;
}
llm_task_[work_id_num]->trigger();
llm_task_[work_id_num]->trigger_wakeup();
return LLM_NONE;
}

Expand Down
2 changes: 1 addition & 1 deletion projects/llm_framework/tools/llm_pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep
'llm-model-qwen3-vl-2B-Int4-ax630c':[create_data_deb,'llm-model-qwen3-vl-2B-Int4-ax630c', '0.5', src_folder, revision],
## AX650
'llm-model-qwen2.5-0.5B-Int4-ax650':[create_data_deb,'llm-model-qwen2.5-0.5B-Int4-ax650', '0.4', src_folder, revision],
'llm-model-qwen2.5-HA-0.5B-ctx-ax650':[create_data_deb,'llm-model-qwen2.5-HA-0.5B-ctx-ax650', '0.5', src_folder, revision],
'llm-model-qwen2.5-HA-0.5B-ctx-ax650':[create_data_deb,'llm-model-qwen2.5-HA-0.5B-ctx-ax650', '0.6', src_folder, revision],
'llm-model-qwen2.5-1.5B-Int4-ax650':[create_data_deb,'llm-model-qwen2.5-1.5B-Int4-ax650', '0.4', src_folder, revision],
'llm-model-qwen2.5-3B-Int4-ax650':[create_data_deb,'llm-model-qwen2.5-3B-Int4-ax650', '0.4', src_folder, revision],
'llm-model-qwen2.5-7B-Int4-ax650':[create_data_deb,'llm-model-qwen2.5-7B-Int4-ax650', '0.4', src_folder, revision],
Expand Down
Loading