#!/usr/bin/env python3
"""
Patch script to add fetch_and_save tool to Skill Seekers MCP.

This tool fetches web pages and saves directly to files WITHOUT passing
content through the Claude API, saving ~97% of tokens compared to WebFetch.

Usage:
    python add_fetch_and_save.py

After running:
    - Restart Claude Code to reload MCP servers
    - Use: mcp__skill-seeker__fetch_and_save(url="...", output="...")
"""

import re
import sys
from pathlib import Path

# Locate the installed skill_seekers package; patches are applied in-place
# to the files inside that installation.
try:
    import skill_seekers

    SKILL_SEEKERS_PATH = Path(skill_seekers.__file__).parent
except ImportError:
    print("❌ skill-seekers not installed. Run: pip install skill-seekers")
    sys.exit(1)

print(f"📁 Found skill-seekers at: {SKILL_SEEKERS_PATH}")

# ============================================================================
# 1. Add fetch_and_save_tool to scraping_tools.py
# ============================================================================

# NOTE: this is source text appended verbatim to scraping_tools.py, so the
# escaped "\\n" sequences below deliberately become "\n" in the written file.
# It relies on names already in scope there (TextContent, Path).
FETCH_AND_SAVE_CODE = '''

async def fetch_and_save_tool(args: dict) -> list[TextContent]:
    """
    Fetch web pages and save directly to files without passing content
    through Claude API.

    This tool is optimized for token efficiency - it downloads content using
    Python's httpx library and saves directly to disk. Only metadata (status,
    bytes, path) is returned, NOT the actual content. This saves ~97% of
    tokens compared to WebFetch.

    Supports:
    - Single URL or batch of URLs
    - Automatic markdown extraction from HTML
    - Raw markdown file preservation
    - Configurable output paths

    Args:
        args: Dictionary containing:
            - url (str, optional): Single URL to fetch
            - urls (list, optional): List of {"url": str, "output": str} objects for batch
            - output (str, optional): Output file path (required if using single url)
            - extract_markdown (bool, optional): Extract markdown from HTML (default: True)
            - timeout (int, optional): Request timeout in seconds (default: 30)
            - rate_limit (float, optional): Delay between requests in seconds (default: 0.5)

    Returns:
        List[TextContent]: Summary of fetched files (status, bytes, errors) - NOT content

    Example:
        # Single file
        fetch_and_save(url="https://docs.example.com/guide.md", output="docs/guide.md")

        # Batch mode
        fetch_and_save(urls=[
            {"url": "https://docs.example.com/intro.md", "output": "docs/intro.md"},
            {"url": "https://docs.example.com/api.md", "output": "docs/api.md"}
        ])
    """
    import asyncio

    try:
        import httpx
        from bs4 import BeautifulSoup
    except ImportError as e:
        return [TextContent(
            type="text",
            text=f"❌ Missing dependency: {e}\\nInstall with: pip install httpx beautifulsoup4",
        )]

    # Parse arguments
    single_url = args.get("url")
    urls_list = args.get("urls", [])
    single_output = args.get("output")
    extract_markdown = args.get("extract_markdown", True)
    timeout_val = args.get("timeout", 30)
    rate_limit = args.get("rate_limit", 0.5)

    # Build task list (single mode and batch mode may be combined)
    tasks = []
    if single_url and single_output:
        tasks.append({"url": single_url, "output": single_output})
    if urls_list:
        tasks.extend(urls_list)

    if not tasks:
        return [TextContent(type="text", text="❌ Error: Must provide 'url' + 'output' or 'urls' list")]

    # Results tracking
    results = {
        "success": [],
        "failed": [],
        "total_bytes": 0,
    }

    def extract_text_from_html(html_content: str) -> str:
        """Extract clean text content from HTML, preserving structure as markdown."""
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style elements (plus chrome that adds noise)
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        # Find main content area; fall back to <body> or the whole document
        main = soup.select_one("main, article, [role='main'], .content, .markdown-body")
        if not main:
            main = soup.body or soup

        lines = []

        # Extract title
        title = soup.select_one("title")
        if title:
            lines.append(f"# {title.get_text().strip()}\\n")

        # Process content
        for elem in main.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "pre", "code", "li", "blockquote"]):
            text = elem.get_text().strip()
            if not text:
                continue
            if elem.name.startswith("h"):
                level = int(elem.name[1])
                lines.append(f"\\n{'#' * level} {text}\\n")
            elif elem.name == "pre":
                code = elem.get_text()
                lang = ""
                code_elem = elem.find("code")
                if code_elem:
                    classes = code_elem.get("class", [])
                    for cls in classes:
                        if cls.startswith("language-"):
                            lang = cls.replace("language-", "")
                            break
                lines.append(f"\\n```{lang}\\n{code}\\n```\\n")
            elif elem.name == "li":
                lines.append(f"- {text}")
            elif elem.name == "blockquote":
                lines.append(f"> {text}")
            elif elem.name == "p":
                lines.append(f"\\n{text}\\n")

        return "\\n".join(lines)

    async def fetch_single(client, task: dict) -> dict:
        """Fetch a single URL and save it to its output path; never raises."""
        url = task["url"]
        output_path = Path(task["output"])
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            content = response.text
            # Convert to markdown only when the payload actually looks like
            # an HTML document; raw markdown files are preserved untouched.
            if extract_markdown and not url.endswith(".md"):
                if content.strip().startswith("<"):
                    content = extract_text_from_html(content)
            output_path.write_text(content, encoding="utf-8")
            return {
                "ok": True,
                "url": url,
                "output": str(output_path),
                "bytes": len(content.encode("utf-8")),
            }
        except Exception as e:
            return {"ok": False, "url": url, "error": str(e)}

    async def run_all():
        """Fetch all tasks sequentially, honoring the configured rate limit."""
        async with httpx.AsyncClient(timeout=timeout_val) as client:
            for i, task in enumerate(tasks):
                if i > 0 and rate_limit > 0:
                    await asyncio.sleep(rate_limit)
                result = await fetch_single(client, task)
                if result["ok"]:
                    results["success"].append(result)
                    results["total_bytes"] += result["bytes"]
                else:
                    results["failed"].append(result)

    await run_all()

    # Build the metadata-only summary (never includes page content)
    output_lines = [
        "📥 Fetch and Save Complete",
        "",
        f"✅ Success: {len(results['success'])} files",
        f"❌ Failed: {len(results['failed'])} files",
        f"📦 Total: {results['total_bytes']:,} bytes",
        "",
    ]
    if results["success"]:
        output_lines.append("### Saved Files:")
        for r in results["success"][:20]:
            output_lines.append(f"  - {r['output']} ({r['bytes']:,} bytes)")
        if len(results["success"]) > 20:
            output_lines.append(f"  ... and {len(results['success']) - 20} more")
    if results["failed"]:
        output_lines.append("\\n### Failed:")
        for r in results["failed"][:10]:
            output_lines.append(f"  - {r['url']}: {r['error']}")
        if len(results["failed"]) > 10:
            output_lines.append(f"  ... and {len(results['failed']) - 10} more")

    return [TextContent(type="text", text="\\n".join(output_lines))]
'''

# Source text for the FastMCP tool registration, injected into
# server_fastmcp.py just before the PACKAGING TOOLS section.
TOOL_REGISTRATION = '''

@safe_tool_decorator(
    description="Fetch web pages and save directly to files. Token-efficient: downloads via Python httpx, saves to disk, returns only metadata (NOT content). Saves ~97% tokens vs WebFetch. Supports batch mode and HTML-to-markdown conversion."
)
async def fetch_and_save(
    url: str | None = None,
    output: str | None = None,
    urls: list | None = None,
    extract_markdown: bool = True,
    timeout: int = 30,
    rate_limit: float = 0.5,
) -> str:
    """
    Fetch web pages and save directly to files without passing content through Claude API.

    Args:
        url: Single URL to fetch (use with 'output')
        output: Output file path for single URL mode
        urls: List of {"url": str, "output": str} dicts for batch mode
        extract_markdown: Extract markdown from HTML pages (default: true)
        timeout: Request timeout in seconds (default: 30)
        rate_limit: Delay between requests in seconds (default: 0.5)

    Returns:
        Summary with success/failure counts and file sizes - NOT content.
    """
    args = {
        "extract_markdown": extract_markdown,
        "timeout": timeout,
        "rate_limit": rate_limit,
    }
    if url:
        args["url"] = url
    if output:
        args["output"] = output
    if urls:
        args["urls"] = urls

    result = await fetch_and_save_impl(args)
    if isinstance(result, list) and result:
        return result[0].text if hasattr(result[0], "text") else str(result[0])
    return str(result)
'''


def patch_scraping_tools():
    """Append fetch_and_save_tool to mcp/tools/scraping_tools.py.

    Returns True on success or if the file is already patched.
    """
    file_path = SKILL_SEEKERS_PATH / "mcp" / "tools" / "scraping_tools.py"
    if not file_path.exists():
        print(f"❌ File not found: {file_path}")
        return False

    content = file_path.read_text(encoding="utf-8")

    # Idempotency: skip if a previous run already added the tool
    if "fetch_and_save_tool" in content:
        print("✓ scraping_tools.py already patched")
        return True

    # Add to end of file
    content = content.rstrip() + "\n" + FETCH_AND_SAVE_CODE
    file_path.write_text(content, encoding="utf-8")
    print("✅ Patched scraping_tools.py")
    return True


def patch_init():
    """Export fetch_and_save_impl from mcp/tools/__init__.py.

    Verifies each replace anchor exists before patching so a layout change
    upstream fails loudly instead of silently producing a half-patched file.
    """
    file_path = SKILL_SEEKERS_PATH / "mcp" / "tools" / "__init__.py"
    if not file_path.exists():
        print(f"❌ File not found: {file_path}")
        return False

    content = file_path.read_text(encoding="utf-8")

    if "fetch_and_save_impl" in content:
        print("✓ __init__.py already patched")
        return True

    # Insert the new import right after the existing scraping_tools import
    import_anchor = (
        "from .scraping_tools import (\n"
        "    build_how_to_guides_tool as build_how_to_guides_impl,\n"
        ")"
    )
    import_line = (
        "from .scraping_tools import (\n"
        "    fetch_and_save_tool as fetch_and_save_impl,\n"
        ")"
    )
    if import_anchor not in content:
        print("❌ Import anchor not found in __init__.py (layout changed?)")
        return False
    content = content.replace(import_anchor, import_anchor + "\n" + import_line)

    # Add to __all__
    all_anchor = '"extract_config_patterns_impl",'
    if all_anchor not in content:
        print("❌ __all__ anchor not found in __init__.py (layout changed?)")
        return False
    content = content.replace(all_anchor, all_anchor + '\n    "fetch_and_save_impl",')

    file_path.write_text(content, encoding="utf-8")
    print("✅ Patched __init__.py")
    return True


def patch_server():
    """Register the fetch_and_save tool in mcp/server_fastmcp.py.

    Like patch_init, validates anchors before replacing to avoid reporting
    success on a no-op patch.
    """
    file_path = SKILL_SEEKERS_PATH / "mcp" / "server_fastmcp.py"
    if not file_path.exists():
        print(f"❌ File not found: {file_path}")
        return False

    content = file_path.read_text(encoding="utf-8")

    if "fetch_and_save_impl" in content:
        print("✓ server_fastmcp.py already patched")
        return True

    # Add import of the implementation alongside the existing tool imports
    import_anchor = "extract_test_examples_impl,"
    if import_anchor not in content:
        print("❌ Import anchor not found in server_fastmcp.py (layout changed?)")
        return False
    content = content.replace(import_anchor, import_anchor + "\n    fetch_and_save_impl,")

    # Add tool registration before the PACKAGING TOOLS section
    section_anchor = (
        "# ============================================================================\n"
        "# PACKAGING TOOLS"
    )
    if section_anchor not in content:
        print("❌ PACKAGING TOOLS section not found in server_fastmcp.py (layout changed?)")
        return False
    content = content.replace(section_anchor, TOOL_REGISTRATION + "\n" + section_anchor)

    file_path.write_text(content, encoding="utf-8")
    print("✅ Patched server_fastmcp.py")
    return True


def main():
    """Apply all three patches and print usage instructions."""
    print("=" * 60)
    print("🔧 Adding fetch_and_save tool to Skill Seekers MCP")
    print("=" * 60)
    print()

    # Run every patch even if an earlier one fails, so all problems are
    # reported in a single pass.
    success = True
    success = patch_scraping_tools() and success
    success = patch_init() and success
    success = patch_server() and success

    print()
    if success:
        print("=" * 60)
        print("✅ Patch complete!")
        print()
        print("Next steps:")
        print("  1. Restart Claude Code to reload MCP servers")
        print("  2. Use the tool:")
        print()
        print("     mcp__skill-seeker__fetch_and_save(")
        print('         url="https://example.com/doc.md",')
        print('         output="local/path/doc.md"')
        print("     )")
        print()
        print("     Or batch mode:")
        print()
        print("     mcp__skill-seeker__fetch_and_save(urls=[")
        print('         {"url": "...", "output": "..."},')
        print('         {"url": "...", "output": "..."}')
        print("     ])")
        print("=" * 60)
    else:
        print("❌ Some patches failed. Check errors above.")
        sys.exit(1)


if __name__ == "__main__":
    main()