|
| 1 | +import os |
| 2 | +from typing import Any, Dict, List, Optional, Tuple, Union |
| 3 | + |
| 4 | +from pydantic.v1 import PrivateAttr |
| 5 | + |
| 6 | +from redisvl.utils.rerank.base import BaseReranker |
| 7 | + |
| 8 | + |
class CohereReranker(BaseReranker):
    """
    The CohereReranker class uses Cohere's API to rerank documents based on an
    input query.

    This reranker is designed to interact with Cohere's /rerank API,
    requiring an API key for authentication. The key can be provided
    directly in the `api_config` dictionary or through the `COHERE_API_KEY`
    environment variable. User must obtain an API key from Cohere's website
    (https://dashboard.cohere.com/). Additionally, the `cohere` python
    client must be installed with `pip install cohere`.

    .. code-block:: python

        # The API key is read from api_config["api_key"] or, failing that,
        # from the COHERE_API_KEY environment variable.
        reranker = CohereReranker(rank_by=["content"], limit=2)

        # Rerank dictionary docs by their "content" field.
        docs, scores = reranker.rank(
            query="your input query text here",
            docs=[
                {"content": "document 1"},
                {"content": "document 2"},
                {"content": "document 3"},
            ],
        )

    """

    # Sync and async Cohere clients; populated by _initialize_clients().
    _client: Any = PrivateAttr()
    _aclient: Any = PrivateAttr()

    def __init__(
        self,
        model: str = "rerank-english-v3.0",
        rank_by: Optional[List[str]] = None,
        limit: int = 5,
        return_score: bool = True,
        api_config: Optional[Dict] = None,
    ) -> None:
        """
        Initialize the CohereReranker with specified model, ranking criteria,
        and API configuration.

        Parameters:
            model (str): The identifier for the Cohere model used for reranking.
                Defaults to 'rerank-english-v3.0'.
            rank_by (Optional[List[str]]): Optional list of keys specifying the
                attributes in the documents that should be considered for
                ranking. Required (here or per call) when the documents are
                dictionaries; unused for plain string documents.
            limit (int): The maximum number of results to return after
                reranking. Must be a positive integer.
            return_score (bool): Whether to return scores alongside the
                reranked results.
            api_config (Optional[Dict], optional): Dictionary containing the
                API key under the "api_key" entry. Defaults to None.

        Raises:
            ImportError: If the cohere library is not installed.
            ValueError: If the API key is not provided.
        """
        super().__init__(
            model=model, rank_by=rank_by, limit=limit, return_score=return_score
        )
        self._initialize_clients(api_config)

    def _initialize_clients(self, api_config: Optional[Dict]):
        """
        Setup the Cohere clients using the provided API key or an
        environment variable.

        Parameters:
            api_config (Optional[Dict]): Optional dictionary whose "api_key"
                entry, when present and non-empty, takes precedence over the
                `COHERE_API_KEY` environment variable.

        Raises:
            ImportError: If the cohere library is not installed.
            ValueError: If no API key is found in either location.
        """
        # Imported lazily so that cohere stays an optional dependency.
        try:
            from cohere import AsyncClient, Client
        except ImportError as exc:
            raise ImportError(
                "Cohere reranker requires the cohere library. "
                "Please install with `pip install cohere`"
            ) from exc

        # Prefer the key from api_config, but still fall back to the
        # environment when api_config is given without an "api_key" entry.
        api_key = (api_config or {}).get("api_key") or os.getenv("COHERE_API_KEY")
        if not api_key:
            raise ValueError(
                "Cohere API key is required. "
                "Provide it in api_config or set the COHERE_API_KEY environment variable."
            )
        self._client = Client(api_key=api_key, client_name="redisvl")
        self._aclient = AsyncClient(api_key=api_key, client_name="redisvl")

    def _preprocess(
        self, query: str, docs: Union[List[Dict[str, Any]], List[str]], **kwargs
    ) -> Tuple[Dict[str, Any], bool]:
        """
        Prepare and validate reranking config based on provided input and
        optional overrides.

        Parameters:
            query (str): The user's search query.
            docs (Union[List[Dict[str, Any]], List[str]]): The documents to
                rerank.
            **kwargs: Per-call overrides: `limit`, `return_score`, `rank_by`
                (a string or list of strings) and `max_chunks_per_doc`.

        Returns:
            Tuple[Dict[str, Any], bool]: The keyword arguments for the client's
                `rerank` call, and the effective `return_score` flag.

        Raises:
            ValueError: If every document is a dictionary and no `rank_by`
                fields are available.
        """
        limit = kwargs.get("limit", self.limit)
        return_score = kwargs.get("return_score", self.return_score)
        max_chunks_per_doc = kwargs.get("max_chunks_per_doc")
        rank_by = kwargs.get("rank_by", self.rank_by) or []
        # Accept a single field name as shorthand for a one-element list.
        if isinstance(rank_by, str):
            rank_by = [rank_by]

        reranker_kwargs: Dict[str, Any] = {
            "model": self.model,
            "query": query,
            "top_n": limit,
            "documents": docs,
            "max_chunks_per_doc": max_chunks_per_doc,
        }
        # Dictionary docs need explicit rank fields. The `docs and` guard keeps
        # an empty list (for which all() is vacuously true) from raising a
        # misleading "provide rank_by" error.
        if docs and all(isinstance(doc, dict) for doc in docs):
            if not rank_by:
                raise ValueError(
                    "If reranking dictionary-like docs, "
                    "you must provide a list of rank_by fields"
                )
            reranker_kwargs["rank_fields"] = rank_by

        return reranker_kwargs, return_score

    @staticmethod
    def _postprocess(
        docs: Union[List[Dict[str, Any]], List[str]],
        rankings: Any,
    ) -> Tuple[List[Any], List[float]]:
        """
        Reorder the input documents according to the rerank response and
        collect the matching relevance scores.

        Parameters:
            docs (Union[List[Dict[str, Any]], List[str]]): The documents that
                were submitted for reranking.
            rankings (Any): The rerank response; its `results` items are read
                for `index` (position in `docs`) and `relevance_score`.

        Returns:
            Tuple[List[Any], List[float]]: The documents in response order and
                their scores, aligned by position.
        """
        reranked_docs: List[Any] = []
        scores: List[float] = []
        for item in rankings.results:
            scores.append(item.relevance_score)
            reranked_docs.append(docs[item.index])
        return reranked_docs, scores

    def rank(
        self, query: str, docs: Union[List[Dict[str, Any]], List[str]], **kwargs
    ) -> Union[Tuple[List[Dict[str, Any]], List[float]], List[Dict[str, Any]]]:
        """
        Rerank documents based on the provided query using the Cohere rerank API.

        This method processes the user's query and the provided documents to
        rerank them in a manner that is potentially more relevant to the
        query's context.

        Parameters:
            query (str): The user's search query.
            docs (Union[List[Dict[str, Any]], List[str]]): The list of documents
                to be ranked, either as dictionaries or strings.
            **kwargs: Per-call overrides: `limit`, `return_score`, `rank_by`
                and `max_chunks_per_doc`.

        Returns:
            Union[Tuple[Union[List[Dict[str, Any]], List[str]], List[float]], Union[List[Dict[str, Any]], List[str]]]:
                The reranked list of documents, together with the list of
                scores when `return_score` is true. An empty `docs` list
                yields an empty result without calling the API.

        Raises:
            ValueError: If dictionary documents are given without `rank_by`.
        """
        reranker_kwargs, return_score = self._preprocess(query, docs, **kwargs)
        if not docs:
            # Nothing to rerank, so skip the API round-trip.
            return ([], []) if return_score else []
        rankings = self._client.rerank(**reranker_kwargs)
        reranked_docs, scores = self._postprocess(docs, rankings)
        if return_score:
            return reranked_docs, scores
        return reranked_docs

    async def arank(
        self, query: str, docs: Union[List[Dict[str, Any]], List[str]], **kwargs
    ) -> Union[Tuple[List[Dict[str, Any]], List[float]], List[Dict[str, Any]]]:
        """
        Asynchronously rerank documents based on the provided query using the
        Cohere rerank API.

        This method processes the user's query and the provided documents to
        rerank them in a manner that is potentially more relevant to the
        query's context.

        Parameters:
            query (str): The user's search query.
            docs (Union[List[Dict[str, Any]], List[str]]): The list of documents
                to be ranked, either as dictionaries or strings.
            **kwargs: Per-call overrides: `limit`, `return_score`, `rank_by`
                and `max_chunks_per_doc`.

        Returns:
            Union[Tuple[Union[List[Dict[str, Any]], List[str]], List[float]], Union[List[Dict[str, Any]], List[str]]]:
                The reranked list of documents, together with the list of
                scores when `return_score` is true. An empty `docs` list
                yields an empty result without calling the API.

        Raises:
            ValueError: If dictionary documents are given without `rank_by`.
        """
        reranker_kwargs, return_score = self._preprocess(query, docs, **kwargs)
        if not docs:
            # Nothing to rerank, so skip the API round-trip.
            return ([], []) if return_score else []
        rankings = await self._aclient.rerank(**reranker_kwargs)
        reranked_docs, scores = self._postprocess(docs, rankings)
        if return_score:
            return reranked_docs, scores
        return reranked_docs
0 commit comments