diff --git a/CMakeLists.txt b/CMakeLists.txt index a6ba292889269b74864eab563ac9330d296ca4cd..4a08cf12b7114e64ed41263a71a5a0ad4ef35922 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,48 +1,6 @@ - cmake_minimum_required(VERSION 3.21.1) - -add_subdirectory(aimodel) - project(QtAiInferenceApi LANGUAGES CXX) -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake") - -set(CMAKE_AUTOMOC ON) -set(CMAKE_INCLUDE_CURRENT_DIR ON) -set(QT_QML_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/qml) -set(QML_IMPORT_PATH ${QT_QML_OUTPUT_DIRECTORY} - CACHE STRING "Import paths for Qt Creator's code model" - FORCE -) - -find_package(Qt6 6.8 REQUIRED COMPONENTS Core Gui Qml Quick Multimedia) -qt_standard_project_setup(REQUIRES 6.8) -qt_add_executable(${CMAKE_PROJECT_NAME} - main.cpp -) - -qt_add_qml_module(${CMAKE_PROJECT_NAME} - URI qtaiinferenceapi - VERSION 1.0 - RESOURCES - qtquickcontrols2.conf - QML_FILES - App.qml - Screen01.ui.qml - ) - -target_link_libraries(${CMAKE_PROJECT_NAME} - PRIVATE - Qt6::Quick - Qt6::Multimedia - QtAiModelApi -) - - -include(GNUInstallDirs) -install(TARGETS ${CMAKE_PROJECT_NAME} - BUNDLE DESTINATION . - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) +add_subdirectory(aimodel) +add_subdirectory(tests) diff --git a/README.md b/README.md index 5b8e2e9c40bc3857e52847a1f6ece6239e778615..380ce8ea6cb96d662b355805e461252ba8335c6d 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,22 @@ This projects contains the proof-of-concept for a new Qt AI Inference API. The purpose of the API is to let you easily use different types of AI models for inference from your Qt code, either from C++ or directly from QML! The API abstracts the details of the underlying model and framework implementations, allowing you to just tell what type of input and output you would like to use, and Qt will set things up for you! You can also chain different models together for pipelines. 
+**Disclaimer** +This API is in proof-of-concept stage and under active development, and not yet a part of the Qt framework. Hence, Qt's compatibility promise does not apply; the API can still change in breaking ways. But, it is also a great time to impact the direction it will take! For suggestions feel free to create a ticket in the Qt project's [JIRA](https://bugreports.qt.io), please use the label "QtAiApi" so we can easily find them and collect them together. + ## How it works When you declare a model in your code, Qt will infer from the given input and output type what backend it will set up for the model. The backends are implemented as QPlugins. Currently, the backends are: -| Input type | Output type | Qt backend | Description | -|------------|-------------|---------------|-------------------------------------------------------------------------------| -| Text\|Image| Text | QtOllamaModel | Uses ollama to load LLM models and communicate to them with ollama's REST API | -| Speech | Text | QtAsrModel | Uses Whisper for Automatic Speech Recognition (ASR), or speech-to-text | -| Image | Json | QtTritonModel | Uses Triton to load a model for object detection from images | -| Image | Json | QtYoloModel | Uses a YOLO model for object detection from images | -| Text | Speech | QtPiperModel | Uses Piper TTS model to convert text into speech | +| Input type | Output type | Qt backend | Description | +|------------|-------------|-----------------|-------------------------------------------------------------------------------| +| Text\|Image| Text | QtOllamaModel | Uses ollama to load LLM models and communicate to them with ollama's REST API | +| Speech | Text | QtAsrModel | Uses Whisper for Automatic Speech Recognition (ASR), or speech-to-text | +| Image | Json | QtTritonModel | Uses Triton to load a model for object detection from images | +| Image | Json | QtYoloModel | Uses a YOLO model for object detection from images | +| Text | Speech | QtTtsModel | Uses 
QtTextToSpeech (QtSpeech) to convert text into speech | +| Text | Speech | QtPiperModel | Uses Piper TTS model to convert text into speech | +| Text | Image | QtDiffuserModel | Uses Diffusers to convert text into images | Note, the Qt backends expect the underlying backend implementation (ollama, Whisper...) to be running, and will not take care of starting them up for you. You need to start them yourself, e.g. in the case of QtOllamaModel, loading the intended model to ollama's memory by running: ``` @@ -51,10 +56,10 @@ A combination of AiModelType flags to tell what type of model to instantiate. Po | InputImage | 0x00008 | The model takes image as input | | InputJson | 0x00010 | The model takes JSON as input | | OutputText | 0x00100 | The model outputs text | -| OutputAudio | 0x00200 |The model outputs speech | -| OutputVideo | 0x00400 |The model outputs video | -| OutputImage | 0x00800 |The model outputs image | -| OutputJson | 0x01000 |The model outputs JSON | +| OutputAudio | 0x00200 | The model outputs speech | +| OutputVideo | 0x00400 | The model outputs video | +| OutputImage | 0x00800 | The model outputs image | +| OutputJson | 0x01000 | The model outputs JSON | For supported input-output combinations, see the table under "How it works" section. @@ -121,7 +126,7 @@ MultiModal { | Read method: | QString model() | | Notifier signal: | void modelChanged() | -**QVariantList rag** +**QVariantList documents** Retrieval-Augmented Generation data to use for the model, if it supports it. RAG supports currently only chromadb, which should be running on background. @@ -134,7 +139,7 @@ import qtaimodel type: (MultiModal.InputText | MultiModal.OutputText) model: "llama3.2" prompt: "Which item has best armor bonus?" 
- rag: ["Cloth of Authority | Armour Class +1", + documents: ["Cloth of Authority | Armour Class +1", "Drunken Cloth | Constitution +2 (up to 20)", "Icebite Robe | Resistance to Damage Types: Cold damage.", "Obsidian Laced Robe | Grants Resistance to Damage Types: Fire damage.", @@ -145,11 +150,34 @@ import qtaimodel | | | |------------------|-----------------------------------------------------| -| Write method: | void setRag(QByteArray) | -| Read method: | QByteArray rag() | -| Notifier signal: | void ragChanged() | +| Write method: | void setDocuments(QVariantList) | +| Read method: | QVariantList documents() | +| Notifier signal: | void documentsChanged() | + +**int seed** + +Seed to use with model prompts. Seed reduces randomness in model answers. + +Example: +``` +import qtaimodel + + MultiModal { + id: llamaModel + type: (MultiModal.InputText | MultiModal.OutputText) + model: "gemma3" + prompt: "Say hello?" + seed: 3453654 + } +``` + +| | | +|------------------|-----------------------------------------------------| +| Write method: | void setDocuments(QByteArray) | +| Read method: | QByteArray documents() | +| Notifier signal: | void documentsChanged() | -**QVector<QAiModel*> inputs** +**QVector<QAiModel\*> inputs** A list of models this model will use as its inputs. This allows for chaining models together to create pipelines. You can use the Optional flag with the model's type to tell whether it's an optional or mandatory input. For mandatory inputs, this model will not process any other inputs before the mandatory one has something to offer. For optional ones, other inputs will be processed regardless if that input has data available or not. diff --git a/Screen01.ui.qml b/Screen01.ui.qml deleted file mode 100644 index ebd7d30520b865b85d0b2b38d79108808efdb26e..0000000000000000000000000000000000000000 --- a/Screen01.ui.qml +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (C) 2025 The Qt Company Ltd. 
-// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only - -// TODO: check if all is really needed here -import QtQuick 2.15 -import QtQuick.Layouts -import QtQuick.Controls -import Qt.labs.platform // TODO: Is this used here? -import QtMultimedia - -import qtaimodel - -Rectangle { - id: rectangle - anchors.fill: parent - color: "#ffffff" // TODO: Use some better color? - - property string llamaPrompt: "You are an assistant.\n" - property string imageFile: "" - - ColumnLayout { - RowLayout { - id: buttonRow - Button { - text: "Record audio" - onClicked: { - recorder.record() - } - } - Button { - text: "Stop audio recording" - onClicked: { - recorder.stop() - if (recorder.actualLocation != "") { - speechToText.pushData(recorder.actualLocation) - } - if (imageFile != "") { - imageToText.pushData(imageFile) - } - } - } - } - - RowLayout { - Button { - text: qsTr("Open image") - onClicked: fileDialog.open() - } - Text { - id: result - text: rectangle.imageFile - } - } - - TextField { - text: llamaPrompt - implicitWidth: 300 - onEditingFinished: llamaModel.prompt = text - } - - TextArea { - placeholderText: "Enter context" - background: Rectangle { - color: "lightgreen" - } - - implicitWidth: 300 - implicitHeight: 200 - onEditingFinished: llamaModel.rag = [text] - } - - Image { - source: imageFile - } - } - - FileDialog { - id: fileDialog - folder: StandardPaths.standardLocations(StandardPaths.PicturesLocation)[0] - nameFilters: ["*.*"] - onAccepted: { - imageFile = fileDialog.file - } - onRejected: {} - } - - CaptureSession { - audioInput: AudioInput {} - recorder: MediaRecorder { - id: recorder - mediaFormat { - fileFormat: MediaFormat.Wave - } - } - } - - MultiModal { - id: imageToText - type: (MultiModal.InputImage | MultiModal.OutputText) - model: "llava-phi3" // TODO: replace with Janus model from DeepSeek - prompt: "What is in the picture?" 
- optional: true - buffered: true - } - - MultiModal { - id: speechToText - type: (MultiModal.InputAudio | MultiModal.OutputText) - model: "turbo" - } - - MultiModal { - id: llamaModel - type: (MultiModal.InputText | MultiModal.OutputText) - model: "gemma3:4b" - prompt: llamaPrompt - inputs: [ imageToText, speechToText ] - } - - MultiModal { - id: text2speech - type: (MultiModal.InputText | MultiModal.OutputAudio) - inputs: [ llamaModel ] - } -} diff --git a/aimodel/CMakeLists.txt b/aimodel/CMakeLists.txt index 605edc97337e3623adcad9dc524c549e207f7f77..5733bb4988745f3d4b5b38f13ad2ad4f16a93336 100644 --- a/aimodel/CMakeLists.txt +++ b/aimodel/CMakeLists.txt @@ -19,25 +19,26 @@ add_subdirectory(plugins) find_package(Qt6 6.8 REQUIRED COMPONENTS Core Qml Quick Network) qt_standard_project_setup(REQUIRES 6.8) -qt_add_library(QtAiModelPluginInterface - qaimodelinterface_p.h +qt_add_library(QtAiModelPluginInterface SHARED + qaimodelinterface_p.h qtaiapiexports_p.h chromadb.h chromadb.cpp ) + +target_compile_definitions(QtAiModelPluginInterface PRIVATE QTAIAPI_LIBRARY) + target_link_libraries(QtAiModelPluginInterface PRIVATE Qt6::Core Qt6::Network ) -qt_add_qml_module(QtAiModelApi +qt_add_qml_module(QtAiModelApi STATIC URI qtaimodel VERSION 1.0 - SHARED SOURCES qaimodel.h qaimodel.cpp ) -qt_import_qml_plugins(QtAiModelApi) target_link_libraries(QtAiModelApi PRIVATE diff --git a/aimodel/chromadb.cpp b/aimodel/chromadb.cpp index 22ee94f136145cb27352b3884549ee8131d1a273..294e38387e81d90ffbd61d9f5e8537ed6c730341 100644 --- a/aimodel/chromadb.cpp +++ b/aimodel/chromadb.cpp @@ -25,7 +25,7 @@ void ChromaDb::sendRequest( if (reply.isHttpStatusSuccess()) { lambda(json ? 
json.value() : QJsonDocument(), reply.httpStatus()); } else { - qDebug() << "JSON decode error:" << request.url() << "HTTP status:" << reply.httpStatus(); + qDebug() << request.url() << "responded with error:" << reply.errorString(); setError(true); } }); @@ -74,7 +74,7 @@ void ChromaDb::reset() connect(true); } } else { - qDebug() << url << "deleted"; + qDebug() << url << "responded with error:" << reply.errorString(); } }); diff --git a/aimodel/chromadb.h b/aimodel/chromadb.h index 11b29068d527fdcbe226319cfba671df5aa14ce3..efee875a663c77ee34b9ed389ab8780662673f11 100644 --- a/aimodel/chromadb.h +++ b/aimodel/chromadb.h @@ -6,8 +6,9 @@ #include <QObject> #include <QRestAccessManager> +#include "qtaiapiexports_p.h" -class ChromaDb : public QObject +class QTAIAPI_EXPORT ChromaDb : public QObject { Q_OBJECT Q_PROPERTY(bool connected READ connected WRITE connect NOTIFY connectedChanged FINAL) diff --git a/aimodel/diffuser_server/diffuser_server.py b/aimodel/diffuser_server/diffuser_server.py new file mode 100755 index 0000000000000000000000000000000000000000..e5ab83f1d2af062ae9667cfbeaf9e3f9b78f0f3c --- /dev/null +++ b/aimodel/diffuser_server/diffuser_server.py @@ -0,0 +1,82 @@ +#!/usr/bin/python3 + +# Copyright (C) 2025 The Qt Company Ltd. 
# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only + +from http.server import BaseHTTPRequestHandler,HTTPServer +from os import curdir, sep +import torch +from diffusers import AutoPipelineForText2Image, LCMScheduler +import simplejson +import base64 +from io import BytesIO +from PIL import Image + + + +PORT_NUMBER = 8005 + +#This class handles any incoming request from +#the browser +class myHandler(BaseHTTPRequestHandler): + model = '' + + + #Handler for the POST requests + def do_POST(self): + print("do_POST"); + if self.path=="/send": + self.data_string = self.rfile.read(int(self.headers['Content-Length'])) + print("data_string: " + self.data_string.decode()) + json_data = simplejson.loads(self.data_string) + #print("json_data: " + simplejson.dumps(json_data)) + print("MODEL: " + json_data["model"]) + if self.model != json_data["model"]: + self.pipeline = AutoPipelineForText2Image.from_pretrained( + json_data["model"], + #'black-forest-labs/FLUX.1-dev', + #'IDKiro/sdxs-512-dreamshaper', + #"stabilityai/stable-diffusion-2-1-base", + #'black-forest-labs/FLUX.1-schnell', + torch_dtype=torch.float32, + #variant="fp16", + use_safetensors=True).to('cpu') + self.model = json_data["model"] + + image = self.pipeline(json_data["prompt"], + height=512, + width=512, + guidance_scale=0.0, + target_size=(1024, 1024), + original_size=(4096, 4096), + num_inference_steps=1 + #max_sequence_length=256 + ).images[0] + + buffered = BytesIO() + image.save(buffered, format="PNG") + b64image = base64.b64encode(buffered.getvalue()) + print("Sending response") + self.send_response(200) + self.end_headers() + json_response = {} + json_response["image"] = b64image.decode() + self.wfile.write(simplejson.dumps(json_response).encode("utf-8")) + return + + +try: + #Create a web server and define the handler to manage the + #incoming request + server = HTTPServer(('', PORT_NUMBER), myHandler) + print('Started httpserver on port ' , PORT_NUMBER) + + + #Wait forever for incoming 
http requests + server.serve_forever() + +except KeyboardInterrupt: + print('^C received, shutting down the web server') + server.socket.close() + + diff --git a/aimodel/plugins/CMakeLists.txt b/aimodel/plugins/CMakeLists.txt index 0cf7b152013d91a73c116e49d995abb865cdc5fa..095e5540bc52449f4dcfc8e9dbee70bc92316efb 100644 --- a/aimodel/plugins/CMakeLists.txt +++ b/aimodel/plugins/CMakeLists.txt @@ -4,3 +4,4 @@ add_subdirectory(ollama) add_subdirectory(triton) add_subdirectory(yolo) add_subdirectory(piper-tts) +add_subdirectory(diffuser) diff --git a/aimodel/plugins/asr/CMakeLists.txt b/aimodel/plugins/asr/CMakeLists.txt index 9c19a88be73c79ce86dcab3893ee6045e9bafed3..4ecc6d9b42e7053f41ceeef44ad7fc052dd67ac3 100644 --- a/aimodel/plugins/asr/CMakeLists.txt +++ b/aimodel/plugins/asr/CMakeLists.txt @@ -5,7 +5,7 @@ qt_add_plugin(QtAsrModel qasraimodel_p.h qasraimodel_p.cpp ) set_target_properties(QtAsrModel PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/plugins" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/plugins/aimodel" ) target_link_libraries(QtAsrModel PRIVATE diff --git a/aimodel/plugins/asr/qasraimodel_p.cpp b/aimodel/plugins/asr/qasraimodel_p.cpp index a5f835cb91c861e1ebfd0f4d5fb6be470f2e4d08..ae1a33185944b6d7ba0772ac48564ae6c62942da 100644 --- a/aimodel/plugins/asr/qasraimodel_p.cpp +++ b/aimodel/plugins/asr/qasraimodel_p.cpp @@ -3,6 +3,7 @@ #include "qaimodel.h" #include "qasraimodel_p.h" +#include <QDir> #include <QJsonDocument> #include <QJsonObject> #include <QNetworkReply> @@ -15,12 +16,12 @@ QAsrAiModel::QAsrAiModel() { } -void QAsrAiModel::pushData(QVariantList data) +void QAsrAiModel::pushData(QVariantList data, int seed) { qDebug() << "QAsrAiModel::pushData(): data:" << data; if (data.isEmpty() || data.first().toUrl().isEmpty()) { - emit dataReceived(data.first().toUrl()); + emit dataReceived(data.first().toUrl().toLocalFile()); return; } @@ -29,7 +30,7 @@ void QAsrAiModel::pushData(QVariantList data) QJsonDocument doc; QJsonObject obj = 
doc.object(); obj["model"] = m_owner->model(); - obj["file"] = data.first().toUrl().path(); + obj["file"] = data.first().toUrl().toLocalFile(); //obj["stream"] = false; doc.setObject(obj); m_restApi.post(request, doc.toJson(), this, [this](QRestReply &reply) { diff --git a/aimodel/plugins/asr/qasraimodel_p.h b/aimodel/plugins/asr/qasraimodel_p.h index 44a57087569378bcf61ee3d73da9fc8209f70909..dd5307244b139365e79e811fe32ade673e7a689f 100644 --- a/aimodel/plugins/asr/qasraimodel_p.h +++ b/aimodel/plugins/asr/qasraimodel_p.h @@ -13,7 +13,7 @@ class QAsrAiModel : public AiModelPrivateInterface Q_OBJECT public: QAsrAiModel(); - void pushData(QVariantList data) override; + void pushData(QVariantList data, int seed) override; private: QNetworkAccessManager m_manager; diff --git a/aimodel/plugins/diffuser/CMakeLists.txt b/aimodel/plugins/diffuser/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ee0f90205fce5f6e4fbc401aa56431c68e798be --- /dev/null +++ b/aimodel/plugins/diffuser/CMakeLists.txt @@ -0,0 +1,16 @@ +find_package(Qt6 REQUIRED COMPONENTS Core Network Quick) + +qt_add_plugin(QtDiffuserModel + CLASS_NAME QAiModelPluginFactory + qdiffuseraimodel_p.h qdiffuseraimodel_p.cpp + ) +set_target_properties(QtDiffuserModel PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/plugins/aimodel" +) +target_link_libraries(QtDiffuserModel + PRIVATE + Qt6::Core + Qt6::Network + Qt6::Quick + QtAiModelPluginInterface) +include_directories(../..) 
diff --git a/aimodel/plugins/diffuser/plugin.json b/aimodel/plugins/diffuser/plugin.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a40a3afe91d654c50a5cad881ac0e5df5b147a --- /dev/null +++ b/aimodel/plugins/diffuser/plugin.json @@ -0,0 +1,3 @@ +{ "name": "diffuserplugin", + "supportedTypes": ["InputText", "OutputImage"] +} diff --git a/aimodel/plugins/diffuser/qdiffuseraimodel_p.cpp b/aimodel/plugins/diffuser/qdiffuseraimodel_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..49120f02e0b6d7859a87ccd379fbb5df746ba0c9 --- /dev/null +++ b/aimodel/plugins/diffuser/qdiffuseraimodel_p.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2025 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only + +#include "qaimodel.h" +#include "qdiffuseraimodel_p.h" +#include <QJsonDocument> +#include <QJsonObject> +#include <QNetworkReply> +#include <QRestReply> +#include <QImage> + +QDiffuserAiModel::QDiffuserAiModel() + : AiModelPrivateInterface(), + m_manager(this) + , m_restApi(&m_manager) +{ +} + +void QDiffuserAiModel::pushData(QVariantList data, int seed) +{ + qDebug() << "QDiffuserAiModel::pushData(): data:" << data; + + if (data.isEmpty() || data.first().toString().isEmpty()) { + emit dataReceived(data.first().toString()); + return; + } + + QNetworkRequest request(QUrl("http://localhost:8005/send")); + request.setRawHeader("Content-Type", "application/json"); + QJsonDocument doc; + QJsonObject obj = doc.object(); + obj["model"] = m_owner->model(); + obj["prompt"] = data.first().toString(); + doc.setObject(obj); + qDebug() << doc.toJson(); + + m_restApi.post(request, doc.toJson(), this, [this](QRestReply &reply) { + if (auto json = reply.readJson()) { + emit dataReceived(QUrl( + QString("data:image/png;base64,") + json->object()["image"].toString().toUtf8())); + } + }); +} diff --git a/aimodel/plugins/diffuser/qdiffuseraimodel_p.h b/aimodel/plugins/diffuser/qdiffuseraimodel_p.h new file mode 100644 
index 0000000000000000000000000000000000000000..4187de48ae9d7d387a5460dfdc299bb575156909 --- /dev/null +++ b/aimodel/plugins/diffuser/qdiffuseraimodel_p.h @@ -0,0 +1,33 @@ +// Copyright (C) 2025 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only + +#ifndef QDIFFUSERAIMODEL_P_H +#define QDIFFUSERAIMODEL_P_H + +#include <QObject> +#include <QRestAccessManager> +#include "qaimodelinterface_p.h" + +class QDiffuserAiModel : public AiModelPrivateInterface +{ + Q_OBJECT +public: + QDiffuserAiModel(); + void pushData(QVariantList data, int seed) override; + +private: + QNetworkAccessManager m_manager; + QRestAccessManager m_restApi; +}; + +class QDiffuserAiModelPlugin : public QAiModelPluginFactory +{ + Q_OBJECT + Q_PLUGIN_METADATA(IID "org.qt-project.Qt.QAiModelPluginFactory/1.0" FILE "plugin.json") + Q_INTERFACES(QAiModelPluginFactory) +public: + QDiffuserAiModelPlugin() {} + AiModelPrivateInterface* createInterface() { return new QDiffuserAiModel(); } +}; + +#endif // QDIFFUSERAIMODEL_P_H diff --git a/aimodel/plugins/ollama/CMakeLists.txt b/aimodel/plugins/ollama/CMakeLists.txt index 28a83e669c4166f68d267abf83fbf122550c7371..35122a7f849ee2c42d708def1b622a21806a9c91 100644 --- a/aimodel/plugins/ollama/CMakeLists.txt +++ b/aimodel/plugins/ollama/CMakeLists.txt @@ -6,7 +6,7 @@ qt_add_plugin(QtOllamaModel ) set_target_properties(QtOllamaModel PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/plugins" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/plugins/aimodel" ) target_link_libraries(QtOllamaModel PRIVATE diff --git a/aimodel/plugins/ollama/qllmaimodel_p.cpp b/aimodel/plugins/ollama/qllmaimodel_p.cpp index f1a57266fa4d3d75087af0a0f3735924be19528d..50eb4a2f751228106a10e93d8c788bd02c0d4b0d 100644 --- a/aimodel/plugins/ollama/qllmaimodel_p.cpp +++ b/aimodel/plugins/ollama/qllmaimodel_p.cpp @@ -6,6 +6,7 @@ #include <QJsonArray> #include <QJsonDocument> #include <QJsonObject> +#include <QJsonValue> #include <QNetworkReply> 
#include <QNetworkRequest> #include <QRestReply> @@ -35,7 +36,9 @@ static inline void sendRequest( QNetworkRequest request(url); request.setRawHeader("Content-Type", "application/json"); restApi->post(request, QJsonDocument(object).toJson(), owner, [=](QRestReply &reply) { - if (std::optional<QJsonDocument> json = reply.readJson()) { + if (!reply.isHttpStatusSuccess()) { + qDebug() << request.url() << "responded with error" << reply.errorString() << " and status:" << reply.httpStatus(); + } else if (std::optional<QJsonDocument> json = reply.readJson()) { lambda(json.value()); } else { qDebug() << "Error. No data received from" << request.url() << reply; @@ -43,7 +46,7 @@ static inline void sendRequest( }); } -void QLlmAiModel::pushData(QVariantList data) +void QLlmAiModel::pushData(QVariantList data, int seed) { QString query = m_owner->prompt(); QJsonArray images; @@ -52,7 +55,7 @@ void QLlmAiModel::pushData(QVariantList data) query.append(QString::fromLatin1(i.toByteArray())); if (i.canConvert<QUrl>()) { - QFile file(QUrl(i.toUrl()).path()); + QFile file(QUrl(i.toUrl()).toLocalFile()); if (file.open(QIODevice::ReadOnly) != 0) { QByteArray ba = file.readAll(); QByteArray ba2 = ba.toBase64(); @@ -63,13 +66,16 @@ void QLlmAiModel::pushData(QVariantList data) } qDebug() << this << "[\"prompt\"]: " << query << "[images]" << images.count(); - - - auto promptResponseReceived = [=](auto json) { emit dataReceived(json.object()["response"].toString().toUtf8()); }; + QPair<QString, QJsonValue> options; + if (seed != 0) { + options = {"options", QJsonObject({{"seed", {seed}}})}; + } + + if (m_chromadb.connected()) { connect(&m_chromadb, &ChromaDb::embeddingsFound, this, [=](auto embeddings) { QString documents; @@ -83,14 +89,19 @@ void QLlmAiModel::pushData(QVariantList data) qDebug() << q; sendRequest(&m_restApi, m_ollama_url_base + "generate", - QJsonObject({{"model", m_owner->model()}, {"prompt", q}, {"stream", false}}), + QJsonObject({{"model", m_owner->model()}, + 
{"prompt", q}, + {"stream", false}, + options}), this, promptResponseReceived); }, Qt::SingleShotConnection); sendRequest(&m_restApi, m_ollama_url_base + "embed", - QJsonObject({{"model", m_owner->model()}, {"input", query}}), + QJsonObject({{"model", m_owner->model()}, + {"input", query}, + options}), this, [this](auto json) { m_chromadb.fetchEmbeddings(json.object()["embeddings"].toArray().toVariantList()); @@ -101,7 +112,8 @@ void QLlmAiModel::pushData(QVariantList data) QJsonObject({{"model", m_owner->model()}, {"prompt", query}, {"stream", false}, - {"images", images}}), + {"images", images}, + options}), this, promptResponseReceived); } diff --git a/aimodel/plugins/ollama/qllmaimodel_p.h b/aimodel/plugins/ollama/qllmaimodel_p.h index e70180f17ae9abf8e9d138b5c0ca8504e8bf20e4..fc62d74204d8551e2ed0ae9068e8665aac94a8f0 100644 --- a/aimodel/plugins/ollama/qllmaimodel_p.h +++ b/aimodel/plugins/ollama/qllmaimodel_p.h @@ -14,7 +14,7 @@ class QLlmAiModel : public AiModelPrivateInterface Q_OBJECT public: QLlmAiModel(); - void pushData(QVariantList data) override; + void pushData(QVariantList data, int seed) override; void setRag(QVariantList data) override; private: diff --git a/aimodel/plugins/piper-tts/CMakeLists.txt b/aimodel/plugins/piper-tts/CMakeLists.txt index 651ef5be7c65d74728423dced40c6e23f78d5e0e..b00f4f34531a5169f10246cf69b5987480ab6d63 100644 --- a/aimodel/plugins/piper-tts/CMakeLists.txt +++ b/aimodel/plugins/piper-tts/CMakeLists.txt @@ -6,7 +6,7 @@ qt_add_plugin(QtPiperModel ) set_target_properties(QtPiperModel PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/plugins" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/plugins/aimodel" ) target_link_libraries(QtPiperModel PRIVATE diff --git a/aimodel/plugins/piper-tts/plugin.json b/aimodel/plugins/piper-tts/plugin.json index bb84b6f60d14db972274ccbcee444f96e56ffb26..a949a0e053f1f74d6341c0876ad1db617b9c386e 100644 --- a/aimodel/plugins/piper-tts/plugin.json +++ 
b/aimodel/plugins/piper-tts/plugin.json @@ -1,3 +1,3 @@ -{ "name": "ttsplugin", +{ "name": "ttspiperplugin", "supportedTypes": ["InputText", "OutputAudio"] } diff --git a/aimodel/plugins/piper-tts/qpiperaimodel_p.cpp b/aimodel/plugins/piper-tts/qpiperaimodel_p.cpp index 107f800324ebfb62da2ecfa4be62696d51493d37..197055411d99d8bff1d1c0b824aefb657283e707 100644 --- a/aimodel/plugins/piper-tts/qpiperaimodel_p.cpp +++ b/aimodel/plugins/piper-tts/qpiperaimodel_p.cpp @@ -1,23 +1,26 @@ -// Copyright (C) 2025 The Qt Company Ltd. -// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only - #include "qpiperaimodel_p.h" #include "qaimodel.h" #include <QJsonDocument> #include <QJsonObject> #include <QNetworkReply> #include <QRestReply> -#include <QFile> +#include <QMediaDevices> QPiperAiModel::QPiperAiModel() : AiModelPrivateInterface() , m_manager(this) , m_restApi(&m_manager) { - m_player.setAudioOutput(&m_audioOutput); + m_audioFormat.setSampleRate(22050); + m_audioFormat.setChannelCount(1); + m_audioFormat.setSampleFormat(QAudioFormat::Int16); + + m_audioSink.reset(new QAudioSink(QMediaDevices::defaultAudioOutput(),m_audioFormat)); + m_audioBuffer.reset(new QBuffer); } -void QPiperAiModel::pushData(QVariantList data) +void QPiperAiModel::pushData( + QVariantList data, int seed) { qDebug() << "QPiperAiModel::pushData(): data:" << data; @@ -34,17 +37,14 @@ void QPiperAiModel::pushData(QVariantList data) doc.setObject(obj); m_restApi.post(request, doc.toJson(), this, [this](QRestReply &reply) { if (auto json = reply.readJson()) { - //qDebug() << "[\"response\"]=" << json->object()["response"].toString(); - //emit dataReceived(json->object()["response"].toString().toUtf8()); - QFile file("test.wav"); - file.open(QIODevice::WriteOnly); - file.write(QByteArray::fromBase64(json->object()["response"].toString().toUtf8())); - file.close(); + m_audioSink->reset(); + m_audioBuffer->close(); + 
m_audioBuffer->setData(QByteArray::fromBase64(json->object()["response"].toString().toUtf8())); + m_audioBuffer->open(QIODevice::ReadOnly); - m_player.stop(); - m_player.setSource(QUrl::fromLocalFile("test.wav")); - m_player.play(); + // Start playback + m_audioSink->start(m_audioBuffer.data()); } }); - + emit dataReceived(data.first().toByteArray()); } diff --git a/aimodel/plugins/piper-tts/qpiperaimodel_p.h b/aimodel/plugins/piper-tts/qpiperaimodel_p.h index 5515abb6ceb181671b02cd918db197982ec206f0..bd60c6ae9bb89c5e0da26fd9cc2c67669f755b76 100644 --- a/aimodel/plugins/piper-tts/qpiperaimodel_p.h +++ b/aimodel/plugins/piper-tts/qpiperaimodel_p.h @@ -1,13 +1,12 @@ -// Copyright (C) 2025 The Qt Company Ltd. -// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only - #ifndef QTEXT2SPEEHCAIMODEL_P_H #define QTEXT2SPEEHCAIMODEL_P_H #include <QObject> #include <QRestAccessManager> -#include <QMediaPlayer> -#include <QAudioOutput> +#include <QAudioSink> +#include <QAudioFormat> +#include <QScopedPointer> +#include <QBuffer> #include "qaimodelinterface_p.h" class QPiperAiModel : public AiModelPrivateInterface @@ -15,17 +14,17 @@ class QPiperAiModel : public AiModelPrivateInterface Q_OBJECT public: QPiperAiModel(); - void pushData(QVariantList data) override; + void pushData(QVariantList data, int seed) override; private: QNetworkAccessManager m_manager; QRestAccessManager m_restApi; - QMediaPlayer m_player; - QAudioOutput m_audioOutput; + QScopedPointer<QBuffer> m_audioBuffer; + QScopedPointer<QAudioSink> m_audioSink; + QAudioFormat m_audioFormat; }; - -class QLlmAiModelPlugin : public QAiModelPluginFactory +class QPiperTtsAiModelPlugin : public QAiModelPluginFactory { Q_OBJECT Q_PLUGIN_METADATA(IID "org.qt-project.Qt.QAiModelPluginFactory/1.0" FILE "plugin.json") diff --git a/aimodel/plugins/triton/CMakeLists.txt b/aimodel/plugins/triton/CMakeLists.txt index fb1ca9435935a02d4dee91ecab7d78d7bf550b5d..69ecd1b99992054964653cee20c6c60d09c7a4aa 100644 --- 
a/aimodel/plugins/triton/CMakeLists.txt +++ b/aimodel/plugins/triton/CMakeLists.txt @@ -7,7 +7,7 @@ qt_add_plugin(QtTritonModel ) set_target_properties(QtTritonModel PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/plugins" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/plugins/aimodel" ) include_directories(../..) target_link_libraries(QtTritonModel diff --git a/aimodel/plugins/triton/qtritonmodel_p.cpp b/aimodel/plugins/triton/qtritonmodel_p.cpp index bd620bb512b4d2eca40afa7532f245fb77c16224..741e58c059030d2e52d2c8c27e51b37b6bba113a 100644 --- a/aimodel/plugins/triton/qtritonmodel_p.cpp +++ b/aimodel/plugins/triton/qtritonmodel_p.cpp @@ -151,7 +151,7 @@ Preprocess( // KServe (Open Inference Protocol API) -void QTritonModel::pushData(QVariantList data) +void QTritonModel::pushData(QVariantList data, int seed) { // Load the specified image. std::ifstream file(data.first().toByteArray().toStdString()); diff --git a/aimodel/plugins/triton/qtritonmodel_p.h b/aimodel/plugins/triton/qtritonmodel_p.h index 04f0127775cb0e87892ed989f9de2e60d5ae85c4..ae5c16c7e7b06e03a5772481c32778377c713da2 100644 --- a/aimodel/plugins/triton/qtritonmodel_p.h +++ b/aimodel/plugins/triton/qtritonmodel_p.h @@ -13,7 +13,7 @@ class QTritonModel : public AiModelPrivateInterface Q_OBJECT public: QTritonModel(); - void pushData(QVariantList data) override; + void pushData(QVariantList data, int seed) override; private: diff --git a/aimodel/plugins/tts/CMakeLists.txt b/aimodel/plugins/tts/CMakeLists.txt index 92dcf3b25c51bf2e43c5adcb42e33d8ca6c505fd..208fa726105006a663a4a42ada47a9a3073972cf 100644 --- a/aimodel/plugins/tts/CMakeLists.txt +++ b/aimodel/plugins/tts/CMakeLists.txt @@ -6,7 +6,7 @@ qt_add_plugin(QtTtsModel ) set_target_properties(QtTtsModel PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/plugins" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/plugins/aimodel" ) target_link_libraries(QtTtsModel PRIVATE diff --git a/aimodel/plugins/tts/qtext2speechaimodel_p.cpp 
b/aimodel/plugins/tts/qtext2speechaimodel_p.cpp index da1890861f6dd96b0fbbde59f07d67f5e7030eab..505e52dab67556309219650e5e6284558a32a9b0 100644 --- a/aimodel/plugins/tts/qtext2speechaimodel_p.cpp +++ b/aimodel/plugins/tts/qtext2speechaimodel_p.cpp @@ -18,7 +18,7 @@ QText2SpeechAiModel::QText2SpeechAiModel() }); } -void QText2SpeechAiModel::pushData(QVariantList data) +void QText2SpeechAiModel::pushData(QVariantList data, int seed) { m_speech->stop(); diff --git a/aimodel/plugins/tts/qtext2speechaimodel_p.h b/aimodel/plugins/tts/qtext2speechaimodel_p.h index 2666fa89c28a46179a6928de8febc1c0641b0cfb..0e7b438bbe00ca60f69419dc46c7bf52ef4685d7 100644 --- a/aimodel/plugins/tts/qtext2speechaimodel_p.h +++ b/aimodel/plugins/tts/qtext2speechaimodel_p.h @@ -15,7 +15,7 @@ class QText2SpeechAiModel : public AiModelPrivateInterface Q_OBJECT public: QText2SpeechAiModel(); - void pushData(QVariantList data) override; + void pushData(QVariantList data, int seed) override; QSharedPointer<QTextToSpeech> m_speech; }; diff --git a/aimodel/plugins/yolo/CMakeLists.txt b/aimodel/plugins/yolo/CMakeLists.txt index 039fe5f59deaa457b8bde3c581a860f105c386ce..872098842ff4237540d968283a539b38231d70fc 100644 --- a/aimodel/plugins/yolo/CMakeLists.txt +++ b/aimodel/plugins/yolo/CMakeLists.txt @@ -6,7 +6,7 @@ qt_add_plugin(QtYoloModel ) set_target_properties(QtYoloModel PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/plugins" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/plugins/aimodel" ) target_link_libraries(QtYoloModel PRIVATE diff --git a/aimodel/plugins/yolo/qyoloaimodel.cpp b/aimodel/plugins/yolo/qyoloaimodel.cpp index 59b722405be24c2d18098655ce175040be37a9a4..b367b98da505da86e590f71af9c65fa5a7c96d56 100644 --- a/aimodel/plugins/yolo/qyoloaimodel.cpp +++ b/aimodel/plugins/yolo/qyoloaimodel.cpp @@ -19,7 +19,7 @@ QYoloAiModel::QYoloAiModel() } -void QYoloAiModel::pushData(QVariantList data) +void QYoloAiModel::pushData(QVariantList data, int seed) { QImage image; if 
(data.first().canConvert<QImage>()) { diff --git a/aimodel/plugins/yolo/qyoloaimodel.h b/aimodel/plugins/yolo/qyoloaimodel.h index e32c3f83c7a7896e912c370b8b76e0481a303545..92f63fedbcb733298dc62f7361c0a1fec2b9cac0 100644 --- a/aimodel/plugins/yolo/qyoloaimodel.h +++ b/aimodel/plugins/yolo/qyoloaimodel.h @@ -13,7 +13,7 @@ class QYoloAiModel : public AiModelPrivateInterface Q_OBJECT public: QYoloAiModel(); - void pushData(QVariantList data) override; + void pushData(QVariantList data, int seed) override; private: QNetworkAccessManager m_manager; diff --git a/aimodel/qaimodel.cpp b/aimodel/qaimodel.cpp index 411691d94f9bbc72d43279c8cb29e3491d1ec77f..8f81cdb8a07fbc63b819b9166f024f5a5cd9fc28 100644 --- a/aimodel/qaimodel.cpp +++ b/aimodel/qaimodel.cpp @@ -8,10 +8,10 @@ #include <QDir> #include <QJsonArray> #include <QList> +#include <QCoreApplication> QAiModel::QAiModel() { - qRegisterMetaType<AiModelPrivateInterface::AiModelTypes>(); } AiModelPrivateInterface::AiModelTypes QAiModel::type() const @@ -42,38 +42,38 @@ void QAiModel::setType(const AiModelPrivateInterface::AiModelTypes &newType) for (auto *plugin : staticPlugins) qDebug() << "Static plugin: " << plugin; - m_interface.clear(); - QDir pluginsDir(QDir::currentPath() + "/qt-ai-inference-api/aimodel/plugins"); - //QDir pluginsDir(QDir::currentPath() + "/aimodel/plugins"); - qDebug() << "Plugins dir: " << pluginsDir.absolutePath(); - const auto entryList = pluginsDir.entryList(QDir::Files); - for (const QString &fileName : entryList) { - qDebug() << "Loading " << fileName << "..."; - QPluginLoader loader(pluginsDir.absoluteFilePath(fileName)); - QJsonObject object{ loader.metaData().value("MetaData").toObject() }; - qDebug() << "Metadata for " << fileName << ": " << object; - if (!object.value("supportedTypes").isArray()) { - qDebug() << "Incorrect json format in" << loader.metaData() - << "for plugin:" << fileName; - continue; - } - auto flagArray = object.value("supportedTypes").toArray().toVariantList(); - 
auto pluginFlags = constructAiModelTypeFlags(flagArray); - - qDebug() << pluginFlags; - if (pluginFlags.testFlags(newType)) { - auto *instance = loader.instance(); - QAiModelPluginFactory *plugin = qobject_cast<QAiModelPluginFactory*>(instance); - if (plugin) { - qDebug() << plugin << "created"; - m_interface.reset(plugin->createInterface()); - m_interface->init(this); - break; - } else { - qDebug() << "Could not convert" << instance << "to AiModelPrivateInterface*"; + for (auto &&pluginDir : QCoreApplication::libraryPaths()) { + QDir aiModelPluginsDir(pluginDir + "/aimodel"); + qDebug() << "Plugins dir: " << aiModelPluginsDir.absolutePath(); + const auto entryList = aiModelPluginsDir.entryList(QDir::Files); + for (const QString &fileName : entryList) { + qDebug() << "Loading " << fileName << "..."; + QPluginLoader loader(aiModelPluginsDir.absoluteFilePath(fileName)); + QJsonObject object{ loader.metaData().value("MetaData").toObject() }; + qDebug() << "Metadata for " << fileName << ": " << object; + if (!object.value("supportedTypes").isArray()) { + qDebug() << "Incorrect json format in" << loader.metaData() + << "for plugin:" << fileName; + continue; } - } + auto flagArray = object.value("supportedTypes").toArray().toVariantList(); + auto pluginFlags = constructAiModelTypeFlags(flagArray); + + qDebug() << pluginFlags; + if (pluginFlags.testFlags(newType)) { + auto *instance = loader.instance(); + QAiModelPluginFactory *plugin = qobject_cast<QAiModelPluginFactory*>(instance); + if (plugin) { + qDebug() << plugin << "created"; + m_interface.reset(plugin->createInterface()); + m_interface->init(this); + break; + } else { + qDebug() << "Could not convert" << instance << "to QAiModelPluginFactory*"; + } + } + } } @@ -163,9 +163,22 @@ void QAiModel::processCombinedData(QVariant data) } } +int QAiModel::seed() const +{ + return m_seed; +} + +void QAiModel::setSeed(int newSeed) +{ + if (m_seed == newSeed) + return; + m_seed = newSeed; + emit seedChanged(); +} + void 
QAiModel::dataReceived(QVariant data) { - qDebug() << interface() << ":" << __func__ << "(): data:" << data; + qDebug() << interface() << ":" << __func__ << "(): data:" << data.typeName(); m_buffer = data; m_processing = false; @@ -187,30 +200,30 @@ void QAiModel::pushData(QVariant data) m_buffer.clear(); m_processing = true; emit processingChanged(); + qDebug() << data.typeName() << data; if (QByteArrayView(data.typeName()) == "QVariantList") - interface()->pushData(data.value<QVariantList>()); + interface()->pushData(data.value<QVariantList>(), m_seed); else - interface()->pushData({data}); + interface()->pushData({data}, m_seed); } } - -QVariantList QAiModel::rag() const +QVariantList QAiModel::documents() const { - return m_rag; + return m_documents; } -void QAiModel::setRag(const QVariantList &newRag) +void QAiModel::setDocuments(const QVariantList &newDocuments) { - if (m_rag == newRag) + if (m_documents == newDocuments) return; - m_rag = newRag; - qDebug() << newRag << m_output; + m_documents = newDocuments; + qDebug() << newDocuments << m_output; if (!m_interface.isNull()) { - m_interface->setRag(newRag); + m_interface->setRag(newDocuments); } - emit ragChanged(); + emit documentsChanged(); } diff --git a/aimodel/qaimodel.h b/aimodel/qaimodel.h index 620bbc7fd74b63cb77e04799a1e8bdcaecb32cff..4df08b233df6a101c1103d7b7ba65a172726ab7b 100644 --- a/aimodel/qaimodel.h +++ b/aimodel/qaimodel.h @@ -23,7 +23,7 @@ class QAiModel : public QObject Q_PROPERTY( QString model READ model WRITE setModel NOTIFY modelChanged FINAL) Q_PROPERTY( - QVariantList rag READ rag WRITE setRag NOTIFY ragChanged FINAL) + QVariantList documents READ documents WRITE setDocuments NOTIFY documentsChanged FINAL) Q_PROPERTY( QVector<QAiModel*> inputs READ inputs WRITE setInputs NOTIFY inputsChanged FINAL) Q_PROPERTY( @@ -32,6 +32,8 @@ class QAiModel : public QObject bool buffered READ buffered WRITE setBuffered NOTIFY bufferedChanged FINAL) Q_PROPERTY( bool optional READ optional WRITE 
setOptional NOTIFY optionalChanged FINAL) + Q_PROPERTY( + int seed READ seed WRITE setSeed NOTIFY seedChanged FINAL) public: @@ -51,8 +53,8 @@ public: QVector<QAiModel*> inputs() const; void setInputs(QVector<QAiModel*>newInputs); - QVariantList rag() const; - void setRag(const QVariantList &newRag); + QVariantList documents() const; + void setDocuments(const QVariantList &newDocuments); Q_INVOKABLE void pushData(QVariant data); Q_INVOKABLE void clearBuffer(); @@ -66,7 +68,7 @@ signals: void inputsChanged(); - void ragChanged(); + void documentsChanged(); void processingChanged(); @@ -76,6 +78,8 @@ signals: void optionalChanged(); + void seedChanged(); + private Q_SLOTS: void dataReceived(QVariant data); @@ -88,6 +92,9 @@ public: bool optional() const; void setOptional(bool newOptional); + int seed() const; + void setSeed(int newSeed); + private: void processCombinedData(QVariant data); @@ -98,9 +105,10 @@ private: QString m_inputModel{}; QAiModel* m_output{nullptr}; QSharedPointer<AiModelPrivateInterface> m_interface; - QVariantList m_rag; + QVariantList m_documents; AiModelPrivateInterface::AiModelTypes m_type; QVariant m_buffer; // TODO: replace with QVariant + int m_seed{0}; bool m_processing {false}; bool m_buffered {false}; bool m_optional {false}; diff --git a/aimodel/qaimodelinterface_p.h b/aimodel/qaimodelinterface_p.h index 177cc11ffb4e462ad97f2b33d0713bd989db9173..4d7742e6091f4d09ed1088a8ddf4a66670baa31d 100644 --- a/aimodel/qaimodelinterface_p.h +++ b/aimodel/qaimodelinterface_p.h @@ -6,10 +6,11 @@ #include <QObject> #include <QVariant> +#include "qtaiapiexports_p.h" class QAiModel; -class AiModelPrivateInterface : public QObject +class QTAIAPI_EXPORT AiModelPrivateInterface : public QObject { Q_OBJECT public: @@ -33,7 +34,7 @@ public: void init(QAiModel *owner) { m_owner = owner;} virtual ~AiModelPrivateInterface() {} - virtual void pushData(QVariantList data) = 0; + virtual void pushData(QVariantList data, int seed) = 0; virtual void 
setRag(QVariantList data) {} @@ -47,7 +48,7 @@ public: }; Q_DECLARE_OPERATORS_FOR_FLAGS(AiModelPrivateInterface::AiModelTypes) -class QAiModelPluginFactory : public QObject +class QTAIAPI_EXPORT QAiModelPluginFactory : public QObject { Q_OBJECT public: diff --git a/aimodel/qtaiapiexports_p.h b/aimodel/qtaiapiexports_p.h new file mode 100644 index 0000000000000000000000000000000000000000..7b94d7890b06585517408b0ddc9fe767eed0288c --- /dev/null +++ b/aimodel/qtaiapiexports_p.h @@ -0,0 +1,7 @@ +#include <QtCore/QtGlobal> + +#if defined(QTAIAPI_LIBRARY) +# define QTAIAPI_EXPORT Q_DECL_EXPORT +#else +# define QTAIAPI_EXPORT Q_DECL_IMPORT +#endif diff --git a/aimodel/tts_server/piper_server.py b/aimodel/tts_server/piper_server.py index 97328a6495e11d119d53aabeba5895ee722477e6..53d9597a18f5d39956aac2ddb09fe1c66acd99ea 100644 --- a/aimodel/tts_server/piper_server.py +++ b/aimodel/tts_server/piper_server.py @@ -1,7 +1,25 @@ #!/usr/bin/env python3 - -# Copyright (C) 2025 The Qt Company Ltd. -# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only +# +# MIT License +# +# Copyright (c) 2022 Michael Hansen +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the +# Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import argparse import io @@ -113,15 +131,11 @@ def main() -> None: _LOGGER.debug("get text: %s", text["text"]) json_response = {} - with io.BytesIO() as wav_io: - with wave.open(wav_io, "wb") as wav_file: - voice.synthesize(text["text"], wav_file, **synthesize_args) - - json_response["response"] = base64.b64encode(wav_io.getvalue()).decode("utf-8") - #audio_str = ""; - #for audio_bytes in voice.synthesize_stream_raw(text["text"], **synthesize_args): - # audio_str += (base64.b64encode(audio_bytes).decode("utf-8")) - #json_response["response"] = audio_str + audio_str = b''; + for audio_bytes in voice.synthesize_stream_raw(text["text"], **synthesize_args): + _LOGGER.debug( len(audio_bytes) ) + audio_str = audio_str + audio_bytes + json_response["response"] = base64.b64encode(audio_str).decode("utf-8") return jsonify(json_response) app.run(host=args.host, port=args.port) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f46259127f916658f4b25051ace649c41327f9c --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,2 @@ + +add_subdirectory(app) diff --git a/tests/app/App.qml b/tests/app/App.qml new file mode 100644 index 0000000000000000000000000000000000000000..ab4337c57ac4bf58fed241ac86227dd461f61210 --- /dev/null +++ b/tests/app/App.qml @@ -0,0 +1,25 @@ +// Copyright (C) 2025 The Qt Company Ltd. 
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only + +import QtQuick 2.15 +import QtQuick.Controls 2.15 +import QtQuick.Window 2.15 + +ApplicationWindow { + id: mainWindow + visible: true + width: Screen.width / 3 + height: Screen.height / 3 + //width: Screen.width / 2 + //height: Screen.height / 2 + //flags: Qt.FramelessWindowHint | Qt.Window + //visibility: Window.FullScreen + color: "black" // Optional background color for the main window + + // @disable-check M300 + Screen01 { + anchors.fill: parent + } + +} + diff --git a/tests/app/CMakeLists.txt b/tests/app/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f5192382a209136a2ca7df6b9fddbc2a0462f71 --- /dev/null +++ b/tests/app/CMakeLists.txt @@ -0,0 +1,49 @@ +cmake_minimum_required(VERSION 3.21.1) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +set(CMAKE_AUTOMOC ON) +set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(QT_QML_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/qml) +set(QML_IMPORT_PATH ${QT_QML_OUTPUT_DIRECTORY} + CACHE STRING "Import paths for Qt Creator's code model" + FORCE +) + + +find_package(Qt6 6.8 REQUIRED COMPONENTS Core Gui Qml Quick Multimedia) +qt_standard_project_setup(REQUIRES 6.8) + +qt_add_executable(QtAiTestApp + main.cpp +) + +qt_add_qml_module(QtAiTestApp + URI qtaiinferenceapi + VERSION 1.0 + RESOURCES + qtquickcontrols2.conf + QML_FILES + App.qml + Screen01.ui.qml + ) + +target_link_libraries(QtAiTestApp + PRIVATE + Qt6::Quick + Qt6::Multimedia + QtAiModelApi +) + +set_target_properties(QtAiTestApp + PROPERTIES + QT_QML_ROOT_PATH ${QT_QML_OUTPUT_DIRECTORY} + QML_IMPORT_PATH ${QT_QML_OUTPUT_DIRECTORY} + QT_RESOURCE_PREFIX ${QT_QML_OUTPUT_DIRECTORY}) + +include(GNUInstallDirs) +install(TARGETS QtAiTestApp + BUNDLE DESTINATION . 
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/QtAiInferenceApi.qmlproject b/tests/app/QtAiInferenceApi.qmlproject similarity index 100% rename from QtAiInferenceApi.qmlproject rename to tests/app/QtAiInferenceApi.qmlproject diff --git a/tests/app/Screen01.ui.qml b/tests/app/Screen01.ui.qml new file mode 100644 index 0000000000000000000000000000000000000000..511cddee541040161859db638d342d1aef505239 --- /dev/null +++ b/tests/app/Screen01.ui.qml @@ -0,0 +1,216 @@ +// Copyright (C) 2025 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only + +// TODO: check if all is really needed here +import QtQuick 2.15 +import QtQuick.Layouts +import QtQuick.Controls +import Qt.labs.platform // TODO: Is this used here? +import QtMultimedia + +import qtaimodel + +Rectangle { + id: rectangle + anchors.fill: parent + color: "#ffffff" // TODO: Use some better color? + + property string llamaPrompt: "You are an assistant.\n" + property string imageFile: "" + + ColumnLayout { + RowLayout { + ColumnLayout { + RowLayout { + Rectangle { + width: 20 + height: 20 + color: speechToText.processing ? 
"red" : "green" + } + + Button { + visible: recorder.recorderState !== MediaRecorder.RecordingState + text: "Record audio" + onClicked: { + recorder.record() + } + } + Button { + visible: recorder.recorderState === MediaRecorder.RecordingState + text: "Stop audio recording" + onClicked: { + recorder.stop() + if (recorder.actualLocation != "") { + speechToText.pushData(recorder.actualLocation) + } + if (imageFile != "") { + imageToText.pushData(imageFile) + } + } + } + CaptureSession { + audioInput: AudioInput {} + recorder: MediaRecorder { + id: recorder + mediaFormat { + fileFormat: MediaFormat.Wave + } + } + } + + MultiModal { + id: speechToText + type: (MultiModal.InputAudio | MultiModal.OutputText) + model: "turbo" + } + } + + RowLayout { + Rectangle { + width: 20 + height: 20 + color: imageToText.processing ? "red" : "green" + } + + Button { + text: qsTr("Open image") + onClicked: fileDialog.open() + + + FileDialog { + id: fileDialog + folder: StandardPaths.standardLocations(StandardPaths.PicturesLocation)[0] + nameFilters: ["*.*"] + onAccepted: { + imageFile = fileDialog.file + } + onRejected: {} + } + } + Text { + id: result + text: rectangle.imageFile + } + MultiModal { + id: imageToText + type: (MultiModal.InputImage | MultiModal.OutputText) + model: "llava-phi3" // TODO: replace with Janus model from DeepSeek + prompt: "What is in the picture?" + optional: true + buffered: true + } + } + + RowLayout { + Rectangle { + width: 20 + height: 20 + color: diffuser.processing ? 
"red" : "green" + } + TextField { + placeholderText: "Text2Image" + implicitWidth: 300 + onEditingFinished: diffuser.pushData(text) + } + MultiModal { + id: diffuser + type: (MultiModal.InputText | MultiModal.OutputImage) + model: 'IDKiro/sdxs-512-dreamshaper' + } + + Connections { + target: diffuser + function onGotResult(result) { + imageLocation.source = result + } + } + } + + + TextField { + text: llamaPrompt + placeholderText: "Llama prompt" + implicitWidth: 300 + onEditingFinished: llamaModel.prompt = text + } + RowLayout { + Rectangle { + width: 20 + height: 20 + color: llamaModel.processing ? "red" : "green" + } + TextField { + placeholderText: "Text2Text" + implicitWidth: 300 + onEditingFinished: llamaModel.pushData(text) + } + } + + TextArea { + placeholderText: "Enter context" + background: Rectangle { + color: "lightgreen" + } + + implicitWidth: 300 + implicitHeight: 100 + onEditingFinished: llamaModel.documents = [text] + } + MultiModal { + id: llamaModel + type: (MultiModal.InputText | MultiModal.OutputText) + model: "gemma3:4b" + prompt: llamaPrompt + inputs: [ imageToText, speechToText ] + } + + RowLayout { + Rectangle { + width: 20 + height: 20 + color: text2speech.processing ? 
"red" : "green" + } + TextArea { + implicitWidth: 300 + implicitHeight: 100 + background: Rectangle { + color: "lightblue" + } + + } + + MultiModal { + id: text2speech + type: (MultiModal.InputText | MultiModal.OutputAudio) + inputs: [ llamaModel ] + } + } + + } + + Rectangle { + width: 300 + height: 300 + color: "red" + Image { + anchors.fill: parent + anchors.margins: 2 + id: imageLocation + source: imageFile + fillMode: Image.PreserveAspectFit + } + } + } + + + + + } + + + + + + + +} diff --git a/main.cpp b/tests/app/main.cpp similarity index 76% rename from main.cpp rename to tests/app/main.cpp index 52e40facce31fb86bb5f7a10582c4e96bc4da8bc..6edfc01621ffb8678ad8044168d61df3dd55f175 100644 --- a/main.cpp +++ b/tests/app/main.cpp @@ -20,9 +20,13 @@ int main(int argc, char *argv[]) }, Qt::QueuedConnection); engine.addImportPath("qml"); + engine.addImportPath("../../qml"); // For QML plugins + QCoreApplication::addLibraryPath("../../plugins"); // For backend plugins + engine.loadFromModule("qtaiinferenceapi", "App"); - qDebug() << "Standard path for pictures: " << QStandardPaths::standardLocations(QStandardPaths::PicturesLocation); + qDebug() << "Standard path for pictures: " << QStandardPaths::standardLocations(QStandardPaths::PicturesLocation) + << "QML import path" << engine.importPathList(); if (engine.rootObjects().isEmpty()) return -1; diff --git a/qtquickcontrols2.conf b/tests/app/qtquickcontrols2.conf similarity index 100% rename from qtquickcontrols2.conf rename to tests/app/qtquickcontrols2.conf