Files
wiki/skills/yandex-search/scripts/yandex_search.py
2026-04-12 21:55:33 +03:00

114 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""Yandex Search API v2 (async) — web search for Russian-language content."""
import os
import sys
import json
import time
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET
# Endpoint that accepts the asynchronous web-search request.
SEARCH_URL = "https://searchapi.api.cloud.yandex.net/v2/web/searchAsync"
# Template for the operation-status endpoint; "{}" is filled in with the
# operation id via str.format.
OPERATION_URL = "https://operation.api.cloud.yandex.net/operations/{}"
# API key is read once at import time; it is an empty string when the
# YANDEX_API_KEY environment variable is not set.
API_KEY = os.getenv("YANDEX_API_KEY", "")
def search(query: str, max_results: int = 5, lang: str = "ru") -> list[dict]:
    """Run a web search through the Yandex Search API and return parsed hits.

    The request is submitted to ``SEARCH_URL``, which answers with an
    operation object. That operation is then polled at ``OPERATION_URL``
    (one-second pause before each poll, at most 15 polls) until it reports
    ``done``. The base64-encoded ``response.rawData`` of the finished
    operation is decoded as UTF-8 and handed to ``parse_xml``.

    Args:
        query: Text of the search query.
        max_results: Number of groups requested per page; also passed to
            ``parse_xml`` as the cap on returned results.
        lang: ``"ru"`` selects ``SEARCH_TYPE_RU``; any other value selects
            ``SEARCH_TYPE_COM``.

    Returns:
        The list produced by ``parse_xml``, or a single-element list
        ``[{"error": ...}]`` when the operation does not finish within the
        polling window or finishes with an ``error`` member.

    Raises:
        urllib.error.URLError: On network or HTTP failures (``HTTPError`` is
            a subclass), including an unauthorised request.
        TimeoutError: When a single HTTP call stalls past the per-request
            timeout while reading the response.
        KeyError: When the submit response carries no ``"id"``.
    """
    # base64 is not among the module-level imports, so it is imported here.
    import base64

    # Seconds allowed per HTTP call; without it a stalled connection would
    # block forever regardless of the polling limit below.
    request_timeout = 10
    poll_attempts = 15

    search_type = "SEARCH_TYPE_RU" if lang == "ru" else "SEARCH_TYPE_COM"
    payload = json.dumps({
        "query": {
            "searchType": search_type,
            "queryText": query,
        },
        "sortSpec": {"sortType": "SORT_TYPE_RL"},
        "groupingSpec": {
            "groupMode": "GROUP_MODE_DEEP",
            "groupsOnPage": max_results,
            "docsInGroup": 1,
        },
    }).encode("utf-8")

    auth_headers = {"Authorization": f"Api-Key {API_KEY}"}
    req = urllib.request.Request(
        SEARCH_URL,
        data=payload,
        headers={**auth_headers, "Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=request_timeout) as r:
        status = json.loads(r.read())
    operation_id = status["id"]

    # Poll until done. The submit response is itself an operation object, so
    # an operation that is already finished skips polling entirely.
    for _ in range(poll_attempts):
        if status.get("done"):
            break
        time.sleep(1)
        poll_req = urllib.request.Request(
            OPERATION_URL.format(operation_id),
            headers=auth_headers,
        )
        with urllib.request.urlopen(poll_req, timeout=request_timeout) as r:
            status = json.loads(r.read())

    if not status.get("done"):
        return [{"error": "Search timed out"}]

    # A finished operation may have failed; report that explicitly instead of
    # letting the missing payload surface as an XML parse error.
    failure = status.get("error")
    if failure:
        detail = failure.get("message") if isinstance(failure, dict) else failure
        return [{"error": f"Search failed: {detail or 'unknown error'}"}]

    raw = status.get("response", {}).get("rawData", "")
    xml_data = base64.b64decode(raw).decode("utf-8", errors="ignore")
    return parse_xml(xml_data, max_results)
def parse_xml(xml_data: str, max_results: int) -> list[dict]:
    """Convert a Yandex XML search response into a list of result dicts.

    Args:
        xml_data: Text of the XML document.
        max_results: Maximum number of results to return. Values of zero or
            below yield an empty list.

    Returns:
        One dict per ``<group>`` element that has a direct ``<doc>`` child,
        in document order, with the keys ``title``, ``url``, ``domain`` and
        ``snippet``. The snippet is the first two non-empty ``<passage>``
        texts joined by a space and cut to 300 characters; when the doc has
        no passages the text of its ``<headline>`` child (if any) is used
        instead. Groups without a ``<doc>`` are skipped and do not count
        towards ``max_results``. If ``xml_data`` is not well-formed XML the
        return value is ``[{"error": "Failed to parse XML response"}]``.
    """
    try:
        root = ET.fromstring(xml_data)
    except ET.ParseError:
        return [{"error": "Failed to parse XML response"}]

    results: list[dict] = []
    if max_results <= 0:
        return results

    for group in root.iterfind(".//group"):
        doc = group.find("doc")
        if doc is None:
            continue

        # Titles and passages may contain nested highlight markup, so the
        # text is gathered from all descendants rather than from .text alone.
        title_el = doc.find("title")
        title = "".join(title_el.itertext()) if title_el is not None else ""

        passage_texts = ["".join(p.itertext()) for p in doc.iterfind(".//passage")]
        passage_texts = [text for text in passage_texts if text]
        snippet = " ".join(passage_texts[:2])[:300]
        if not snippet:
            headline_el = doc.find("headline")
            if headline_el is not None:
                snippet = "".join(headline_el.itertext())[:300]

        results.append({
            "title": title,
            "url": doc.findtext("url", ""),
            "domain": doc.findtext("domain", ""),
            "snippet": snippet,
        })
        # Count the limit against emitted results, not against raw groups.
        if len(results) >= max_results:
            break
    return results
if __name__ == "__main__":
    # CLI usage: yandex_search.py [query] [max_results] [lang]
    # Missing positional arguments are filled in from the defaults below;
    # anything past the third argument is ignored.
    defaults = ["openclaw", "5", "ru"]
    given = sys.argv[1:4]
    query, raw_max, lang = given + defaults[len(given):]
    max_r = int(raw_max)
    results = search(query, max_r, lang)
    # ensure_ascii=False keeps non-ASCII text readable in the printed JSON.
    print(json.dumps(results, ensure_ascii=False, indent=2))