|
| 1 | +# Copyright (c) 2025, Moodle HQ - Research |
| 2 | +# SPDX-License-Identifier: BSD-3-Clause |
| 3 | + |
| 4 | +"""Main entry point for the knowledge base MCP compatible server.""" |
| 5 | + |
| 6 | +import logging |
| 7 | +import os |
| 8 | +import sys |
| 9 | + |
| 10 | +from pathlib import Path |
| 11 | + |
| 12 | +from dotenv import load_dotenv |
| 13 | +from langchain_core.runnables import RunnableConfig |
| 14 | + |
| 15 | +import wiki_rag.index as index |
| 16 | +import wiki_rag.mcp_server as mcp_global |
| 17 | + |
| 18 | +from wiki_rag import LOG_LEVEL, ROOT_DIR, __version__, server |
| 19 | +from wiki_rag.search.util import ConfigSchema |
| 20 | +from wiki_rag.util import setup_logging |
| 21 | + |
| 22 | + |
def _require_env(name: str, label: str, logger: logging.Logger) -> str:
    """Return the value of environment variable *name*, exiting if it is unset or empty.

    Args:
        name: Name of the environment variable to read.
        label: Human readable name used in the error message.
        logger: Logger used to report the missing variable.

    Raises:
        SystemExit: With status 1 when the variable is missing or empty.
    """
    value = os.getenv(name)
    if not value:
        logger.error("%s not found in environment. Exiting.", label)
        sys.exit(1)
    return value


def _parse_int(raw: str, label: str, logger: logging.Logger) -> int:
    """Convert *raw* to ``int``, exiting with a logged error instead of a traceback.

    Args:
        raw: Text to convert (surrounding whitespace is accepted by ``int``).
        label: Human readable name used in the error message.
        logger: Logger used to report the invalid value.

    Raises:
        SystemExit: With status 1 when *raw* is not a valid integer.
    """
    try:
        return int(raw)
    except ValueError:
        logger.error("%s must be an integer, got %r. Exiting.", label, raw)
        sys.exit(1)


def main():
    """Run the MCP server with all the configuration in place.

    Reads the configuration from the environment (optionally loading
    ``ROOT_DIR / ".env"`` first), validates it and starts the MCP server
    using the "sse" transport.

    Required variables: ``MEDIAWIKI_URL``, ``MEDIAWIKI_NAMESPACES``,
    ``COLLECTION_NAME``, ``MILVUS_URL``, ``EMBEDDING_MODEL``,
    ``EMBEDDING_DIMENSIONS``, ``LLM_MODEL`` and ``MCP_API_BASE``
    (``host[:port]``, port defaults to 8081).

    Optional variables: ``LOADER_DUMP_PATH`` (defaults to ``ROOT_DIR / "data"``),
    ``USER_AGENT``, ``LANGSMITH_TRACING``, ``WRAPPER_CHAT_MAX_TURNS``,
    ``WRAPPER_CHAT_MAX_TOKENS`` and ``WRAPPER_MODEL_NAME``.

    Side effects: sets ``index.milvus_url``, ``mcp_global.res_file`` (when a
    dump file is found), ``server.config`` and, if tracing is enabled,
    ``os.environ["LANGSMITH_PROJECT"]``.

    Raises:
        SystemExit: With status 1 when a required setting is missing or invalid,
            or when the data directory cannot be created.
    """
    setup_logging(level=LOG_LEVEL)
    logger = logging.getLogger(__name__)
    logger.info("wiki_rag-server-mcp_server starting up...")

    # Report the running version (logged at warning level, as in the original code).
    logger.warning("Version: %s", __version__)

    dotenv_file = ROOT_DIR / ".env"
    if dotenv_file.exists():
        logger.warning("Loading environment variables from %s", dotenv_file)
        logger.warning("Note: .env files are not supposed to be used in production. Use env secrets instead.")
        load_dotenv(dotenv_file)

    mediawiki_url = _require_env("MEDIAWIKI_URL", "Mediawiki URL", logger)

    # Comma separated list of integers; validated here (whitespace stripped, duplicates
    # removed) although the resulting list is not consumed further in this function.
    raw_namespaces = _require_env("MEDIAWIKI_NAMESPACES", "Mediawiki namespaces", logger)
    mediawiki_namespaces = list(  # noqa: F841
        {_parse_int(ns.strip(), "Mediawiki namespace", logger) for ns in raw_namespaces.split(",")}
    )

    loader_dump_env = os.getenv("LOADER_DUMP_PATH")
    loader_dump_path = Path(loader_dump_env) if loader_dump_env else ROOT_DIR / "data"
    # If the directory does not exist, create it (including missing parents).
    if not loader_dump_path.is_dir():
        logger.warning("Data directory %s not found. Creating it.", loader_dump_path)
        try:
            loader_dump_path.mkdir(parents=True, exist_ok=True)
        except OSError:
            logger.error("Could not create data directory %s. Exiting.", loader_dump_path)
            sys.exit(1)

    collection_name = _require_env("COLLECTION_NAME", "Collection name", logger)
    # TODO: Validate that only numbers, letters and underscores are used.

    index.milvus_url = _require_env("MILVUS_URL", "Milvus URL", logger)

    # If tracing is enabled, put a name for the project.
    if os.getenv("LANGSMITH_TRACING", "false") == "true":
        os.environ["LANGSMITH_PROJECT"] = collection_name

    # The user agent may contain a "{version}" placeholder. It is computed here
    # but not consumed further in this function.
    user_agent = os.getenv("USER_AGENT")
    if not user_agent:
        logger.info("User agent not found in environment. Using default.")
        user_agent = "Moodle Research Crawler/{version} (https://git.in.moodle.com/research)"
    user_agent = user_agent.format(version=__version__)  # noqa: F841

    embedding_model = _require_env("EMBEDDING_MODEL", "Embedding model", logger)
    embedding_dimensions = _parse_int(
        _require_env("EMBEDDING_DIMENSIONS", "Embedding dimensions", logger),
        "Embedding dimensions",
        logger,
    )
    llm_model = _require_env("LLM_MODEL", "LLM model", logger)

    # MCP_API_BASE has the form "host[:port]"; the port defaults to 8081.
    mcp_host, *port_parts = _require_env("MCP_API_BASE", "MCP API base", logger).split(":")
    mcp_port = _parse_int(port_parts[0], "MCP API base port", logger) if port_parts else 8081

    # Calculate the file that we are going to use as source for the resources:
    # the last JSON dump (in sorted order) whose name starts with the collection name.
    candidates = sorted(
        entry
        for entry in loader_dump_path.iterdir()
        if entry.is_file() and entry.name.startswith(collection_name) and entry.name.endswith(".json")
    )
    if candidates:
        # iterdir() already yields paths prefixed with loader_dump_path, so the
        # entry is used as is. Joining it again would duplicate a relative prefix.
        mcp_global.res_file = candidates[-1]

    if not mcp_global.res_file:
        logger.warning("No input file found in %s with collection name %s.", loader_dump_path, collection_name)

    # These are optional, default to 0 (unlimited). An empty value counts as unset.
    wrapper_chat_max_turns = _parse_int(os.getenv("WRAPPER_CHAT_MAX_TURNS") or "0", "WRAPPER_CHAT_MAX_TURNS", logger)
    wrapper_chat_max_tokens = _parse_int(
        os.getenv("WRAPPER_CHAT_MAX_TOKENS") or "0", "WRAPPER_CHAT_MAX_TOKENS", logger
    )
    # Falls back to the collection name, which is guaranteed to be non-empty here.
    wrapper_model_name = os.getenv("WRAPPER_MODEL_NAME") or collection_name

    # Prepare the configuration schema.
    # TODO, make prompt name, task_def, kb_*, cutoff, max tokens, temperature, top_p
    # configurable. With defaults applied if not configured.
    config_items = ConfigSchema(
        prompt_name="moodlehq/wiki-rag",
        task_def="Moodle user documentation",
        kb_name="Moodle Docs",
        kb_url=mediawiki_url,
        collection_name=collection_name,
        embedding_model=embedding_model,
        embedding_dimension=embedding_dimensions,
        llm_model=llm_model,
        search_distance_cutoff=0.6,
        max_completion_tokens=768,
        temperature=0.05,
        top_p=0.85,
        stream=False,
        wrapper_chat_max_turns=wrapper_chat_max_turns,
        wrapper_chat_max_tokens=wrapper_chat_max_tokens,
        wrapper_model_name=wrapper_model_name,
    ).items()

    # Prepare the configuration.
    server.config = RunnableConfig(configurable=dict(config_items))

    # Start the MCP server.
    # NOTE(review): imported here rather than at module level, presumably so that the
    # server module is loaded only once the globals above are set - confirm.
    from wiki_rag.mcp_server.server import mcp

    mcp.settings.host = mcp_host
    mcp.settings.port = mcp_port
    mcp.run("sse")

    logger.info("wiki_rag-server-mcp_server finished.")
| 167 | + |
| 168 | + |
# Run the server only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
0 commit comments