Use sqlite-vec extension instead of chromadb for embeddings (#14163)

* swap sqlite_vec for chroma in requirements

* load sqlite_vec in embeddings manager

* remove chroma and revamp Embeddings class for sqlite_vec

* manual minilm onnx inference

* remove chroma in clip model

* migrate api from chroma to sqlite_vec

* migrate event cleanup from chroma to sqlite_vec

* migrate embedding maintainer from chroma to sqlite_vec

* genai description for sqlite_vec

* load sqlite_vec in main thread db

* extend the SqliteQueueDatabase class and use peewee db.execute_sql

* search with Event type for similarity

* fix similarity search

* install and add comment about transformers

* fix normalization

* add id filter

* clean up

* clean up

* fully remove chroma and add transformers env var

* readd uvicorn for fastapi

* readd tokenizer parallelism env var

* remove chroma from docs

* remove chroma from UI

* try removing custom pysqlite3 build

* hard code limit

* optimize queries

* revert explore query

* fix query

* keep building pysqlite3

* single pass fetch and process

* remove unnecessary re-embed

* update deps

* move SqliteVecQueueDatabase to db directory

* make search thumbnail take up full size of results box

* improve typing

* improve model downloading and add status screen

* daemon downloading thread

* catch case when semantic search is disabled

* fix typing

* build sqlite_vec from source

* resolve conflict

* file permissions

* try build deps

* remove sources

* sources

* fix thread start

* include git in build

* reorder embeddings after detectors are started

* build with sqlite amalgamation

* non-platform specific

* use wget instead of curl

* remove unzip -d

* remove sqlite_vec from requirements and load the compiled version

* fix build

* avoid race in db connection

* add scale_factor and bias to description zscore normalization
This commit is contained in:
Josh Hawkins
2024-10-07 15:30:45 -05:00
committed by GitHub
parent 757150dec1
commit 24ac9f3e5a
42 changed files with 951 additions and 533 deletions

View File

@@ -8,6 +8,8 @@ from enum import Enum
from multiprocessing.synchronize import Event as MpEvent
from pathlib import Path
from playhouse.sqliteq import SqliteQueueDatabase
from frigate.config import FrigateConfig
from frigate.const import CLIPS_DIR
from frigate.embeddings.embeddings import Embeddings
@@ -22,16 +24,19 @@ class EventCleanupType(str, Enum):
class EventCleanup(threading.Thread):
def __init__(self, config: FrigateConfig, stop_event: MpEvent):
def __init__(
self, config: FrigateConfig, stop_event: MpEvent, db: SqliteQueueDatabase
):
super().__init__(name="event_cleanup")
self.config = config
self.stop_event = stop_event
self.db = db
self.camera_keys = list(self.config.cameras.keys())
self.removed_camera_labels: list[str] = None
self.camera_labels: dict[str, dict[str, any]] = {}
if self.config.semantic_search.enabled:
self.embeddings = Embeddings()
self.embeddings = Embeddings(self.db)
def get_removed_camera_labels(self) -> list[Event]:
"""Get a list of distinct labels for removed cameras."""
@@ -229,15 +234,8 @@ class EventCleanup(threading.Thread):
Event.delete().where(Event.id << chunk).execute()
if self.config.semantic_search.enabled:
for collection in [
self.embeddings.thumbnail,
self.embeddings.description,
]:
existing_ids = collection.get(ids=chunk, include=[])["ids"]
if existing_ids:
collection.delete(ids=existing_ids)
logger.debug(
f"Deleted {len(existing_ids)} embeddings from {collection.__class__.__name__}"
)
self.embeddings.delete_description(chunk)
self.embeddings.delete_thumbnail(chunk)
logger.debug(f"Deleted {len(events_to_delete)} embeddings")
logger.info("Exiting event cleanup...")