瀏覽代碼

0815 嵌入模型和向量数据库

Qing 8 月之前
父節點
當前提交
8a70d69ea3

+ 17 - 0
consumer-service-demo/spring-ai-demo/pom.xml

@@ -27,6 +27,23 @@
 			<artifactId>spring-ai-openai-spring-boot-starter</artifactId>
 		</dependency>
 
+<!--		<dependency>-->
+<!--			<groupId>org.springframework.ai</groupId>-->
+<!--			<artifactId>spring-ai-transformers-spring-boot-starter</artifactId>-->
+<!--		</dependency>-->
+
+		<!-- Redis vector store integration -->
+		<dependency>
+			<groupId>org.springframework.ai</groupId>
+			<artifactId>spring-ai-redis-store-spring-boot-starter</artifactId>
+		</dependency>
+
+		<!-- Tika document reader integration -->
+		<dependency>
+			<groupId>org.springframework.ai</groupId>
+			<artifactId>spring-ai-tika-document-reader</artifactId>
+		</dependency>
+
 		<dependency>
 			<groupId>org.projectlombok</groupId>
 			<artifactId>lombok</artifactId>

+ 17 - 0
consumer-service-demo/spring-ai-demo/src/main/java/com/sf/ai/config/DocumentConfig.java

@@ -0,0 +1,17 @@
+package com.sf.ai.config;
+
+import org.springframework.ai.document.DocumentTransformer;
+import org.springframework.ai.transformer.splitter.TokenTextSplitter;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+
+@Configuration // Declares the document-processing bean defined below
+public class DocumentConfig {
+
+    // Text splitter, exposed through the DocumentTransformer interface
+    @Bean
+    public DocumentTransformer documentTransformer() {
+        // Splits a long text into pieces by token count; constructed with no arguments
+        return new TokenTextSplitter();
+    }
+}

+ 33 - 0
consumer-service-demo/spring-ai-demo/src/main/java/com/sf/ai/controller/embedding/EmbeddingController.java

@@ -0,0 +1,33 @@
+package com.sf.ai.controller.embedding;
+
+import lombok.RequiredArgsConstructor;
+import org.springframework.ai.embedding.EmbeddingRequest;
+import org.springframework.ai.embedding.EmbeddingResponse;
+import org.springframework.ai.openai.OpenAiEmbeddingModel;
+import org.springframework.ai.openai.OpenAiEmbeddingOptions;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
+
+import java.util.List;
+
+// Embedding model demo: returns the embedding vector for a piece of text
+@RestController
+@RequiredArgsConstructor
+public class EmbeddingController {
+
+    private final OpenAiEmbeddingModel embeddingModel; // final, so supplied through the Lombok-generated constructor
+
+    // Example: http://localhost:8090/embedding?vectorStr=
+    @GetMapping("/embedding")
+    public List<Double> embedding(@RequestParam("vectorStr") String vectorStr) {
+        OpenAiEmbeddingOptions options = OpenAiEmbeddingOptions.builder() // options sent with this request
+                .withModel("text-embedding-ada-002")
+                .withEncodingFormat("float")
+                .build();
+        EmbeddingRequest embeddingRequest = new EmbeddingRequest(List.of(vectorStr), options); // one input text per request
+        EmbeddingResponse embeddingResponse = embeddingModel.call(embeddingRequest);
+        List<Double> output = embeddingResponse.getResult().getOutput(); // embedding of the submitted text
+        return output;
+    }
+}

+ 83 - 0
consumer-service-demo/spring-ai-demo/src/main/java/com/sf/ai/controller/embedding/VectorController.java

@@ -0,0 +1,83 @@
+package com.sf.ai.controller.embedding;
+
+import lombok.RequiredArgsConstructor;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentTransformer;
+import org.springframework.ai.reader.tika.TikaDocumentReader;
+import org.springframework.ai.vectorstore.RedisVectorStore;
+import org.springframework.ai.vectorstore.SearchRequest;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.core.io.Resource;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
+
+import java.util.List;
+import java.util.Map;
+
+@RestController // Demo endpoints that write documents to the vector store and run similarity searches
+@RequiredArgsConstructor
+public class VectorController {
+
+    private final RedisVectorStore vectorStore; // final, so supplied through the Lombok-generated constructor
+
+    // Example: http://localhost:8090/vector/init
+    @GetMapping("/vector/init")
+    public List<Document> init() {
+        List<Document> documents = List.of( // three English sample documents; the first and third carry metadata
+                new Document("Spring AI rocks!! Spring AI rocks!! Spring AI rocks!! Spring AI rocks!! Spring AI rocks!!", Map.of("meta1", "meta1")),
+                new Document("The World is Big and Salvation Lurks Around the Corner"),
+                new Document("You walk forward facing the past and you turn back toward the future.", Map.of("meta2", "meta2")));
+        vectorStore.add(documents); // executed on every request to this endpoint
+
+        // Similarity search: top 1 match for "Spring"
+        List<Document> result = vectorStore.similaritySearch(SearchRequest.query("Spring").withTopK(1));
+        return result;
+    }
+
+    // Example: http://localhost:8090/vector/init2
+    @GetMapping("/vector/init2")
+    public List<Document> init2() {
+        List<Document> documents = List.of( // five Chinese sample passages about Java collections and HashMap
+                new Document("Java 集合, 也叫作容器,主要是由两大接口派生而来:一个是 Collection接口,主要用于存放单一元素;另一个是 Map 接口,主要用于存放键值对。对于Collection 接口,下面又有三个主要的子接口:List、Set 和 Queue。"),
+                new Document("Vector:就比Arraylist多了个 synchronized (线程安全),因为效率较低,现在已经不太建议使用。 hashTable:就比hashMap多了个synchronized (线程安全),不建议使用。ConcurrentHashMap:是Java5中支持高并发、高吞吐量的线程安全HashMap实现。它由Segment数组结构和HashEntry数组结构组成。Segment数组在ConcurrentHashMap里扮演锁的角色,HashEntry则用于存储键-值对数据。一个ConcurrentHashMap里包含一个Segment数组,Segment的结构和HashMap类似,是一种数组和链表结构;一个Segment里包含一个HashEntry数组,每个HashEntry是一个链表结构的元素;每个Segment守护着一个HashEntry数组里的元 素,当对HashEntry数组的数据进行修改时,必须首先获得它对应的Segment锁。(推荐使用)"),
+                new Document("HashMap 主要用来存放键值对,它基于哈希表的 Map 接口实现,是常用的 Java 集合之一,是非线程安全的。HashMap 可以存储 null 的 key 和 value,但 null 作为键只能有一个,null 作为值可以有多个。"),
+                new Document("JDK1.8 之前 HashMap 由 数组+链表 组成的,数组是 HashMap 的主体,链表则是主要为了解决哈希冲突而存在的(“拉链法”解决冲突)。 JDK1.8 以后的 HashMap 在解决哈希冲突时有了较大的变化,当链表长度大于等于阈值(默认为 8)(将链表转换成红黑树前会判断,如果当前数组的长度小于 64,那么会选择先进行数组扩容,而不是转换为红黑树)时,将链表转化为红黑树,以减少搜索时间。"),
+                new Document("HashMap 默认的初始化大小为 16。之后每次扩充,容量变为原来的 2 倍。并且, HashMap 总是使用 2 的幂作为哈希表的大小。")
+        );
+        vectorStore.add(documents); // executed on every request to this endpoint
+
+        // Similarity search: top 3 matches for "HashMap"
+        List<Document> result = vectorStore.similaritySearch(SearchRequest.query("HashMap").withTopK(3));
+        return result;
+    }
+
+
+    @Value("classpath:question.pdf") // PDF resolved from the classpath
+    private Resource question; // non-final, so not part of the Lombok-generated constructor
+
+    private final DocumentTransformer tokenTextSplitter; // splitter applied to the documents read from the PDF
+
+    // Example: http://localhost:8090/document/handle
+    @GetMapping("/document/handle")
+    public String addDocument() {
+        TikaDocumentReader documentReader = new TikaDocumentReader(question); // reader over the classpath PDF
+        List<Document> documents = documentReader.get();
+        System.out.println("初步分割的大小:" + documents.size()); // prints the document count after the initial read
+        // Finer-grained splitting
+        List<Document> applied = tokenTextSplitter.apply(documents);
+        System.out.println("二次分割的大小:" + applied.size()); // prints the document count after the second split
+//        vectorStore.add(applied);
+        vectorStore.accept(applied); // NOTE(review): used in place of the add(...) call commented out above; presumably equivalent, confirm
+        return ""; // the response body is always an empty string
+    }
+
+    // Example: http://localhost:8090/document/search?query=HashMap&size=3
+    @GetMapping("/document/search")
+    public List<Document> search(@RequestParam("query") String query, @RequestParam("size") Integer size) {
+        // Build the search request; size is passed through unchanged as the top-K limit
+        SearchRequest searchRequest = SearchRequest.query(query).withTopK(size);
+        List<Document> result = vectorStore.similaritySearch(searchRequest);
+        return result;
+    }
+}

+ 11 - 1
consumer-service-demo/spring-ai-demo/src/main/resources/application.properties

@@ -1,5 +1,8 @@
 server.port=8090
 spring.application.name=spring-ai-demo
+
+#spring.ai.retry.max-attempts=0
+
 # OpenAI proxy URL
 spring.ai.openai.base-url=https://api.xty.app
 # API token for the proxy URL
@@ -10,4 +13,11 @@ spring.ai.openai.api-key=sk-aLTR8cque07DSSqVA072596f1d2e4365Ad8e27B733AaD12b
 # Baidu AI Cloud token
 baidu.app-id=92186539
 baidu.api-key=ADNygQQHZIL2gdP65iTPbsVr
-baidu.secret-key=GbnfopuRdqWckID4ZUAdBkwRadNanDNK
+baidu.secret-key=GbnfopuRdqWckID4ZUAdBkwRadNanDNK
+
+# Redis connection URI
+spring.ai.vectorstore.redis.uri=redis://:123456@10.211.55.14:6379
+# Index name
+spring.ai.vectorstore.redis.index=default-index
+# Data key prefix
+spring.ai.vectorstore.redis.prefix=default: