114 lines
3.2 KiB
Python
114 lines
3.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Yandex Search API v2 (async) — web search for Russian-language content."""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
import urllib.request
|
|
import urllib.parse
|
|
import xml.etree.ElementTree as ET
|
|
|
|
# Credential sent in the "Api-Key" Authorization header; empty string when
# the YANDEX_API_KEY environment variable is not set.
API_KEY = os.getenv("YANDEX_API_KEY", "")

# Endpoint that receives the search request.
SEARCH_URL = "https://searchapi.api.cloud.yandex.net/v2/web/searchAsync"

# Template for the operation-status endpoint; "{}" is filled with the operation id.
OPERATION_URL = "https://operation.api.cloud.yandex.net/operations/{}"
|
|
|
|
|
|
def _fetch_json(req: urllib.request.Request, timeout: float) -> dict:
    """Send *req* and decode the JSON body, giving up after *timeout* seconds."""
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def search(
    query: str,
    max_results: int = 5,
    lang: str = "ru",
    *,
    timeout: float = 10.0,
    poll_attempts: int = 15,
    poll_interval: float = 1.0,
) -> list[dict]:
    """Search Yandex and return a list of results.

    Submits the query to ``SEARCH_URL``, then polls ``OPERATION_URL`` for the
    returned operation id until the operation reports ``done``. The
    base64-encoded ``rawData`` of the finished operation is decoded and
    handed to ``parse_xml``.

    Args:
        query: Text to search for.
        max_results: Number of result groups to request and return. Values
            of zero or less return an empty list without contacting the API.
        lang: ``"ru"`` selects ``SEARCH_TYPE_RU``; any other value selects
            ``SEARCH_TYPE_COM``.
        timeout: Per-request socket timeout in seconds, so a stalled
            connection cannot block the caller indefinitely.
        poll_attempts: Maximum number of status polls before giving up.
        poll_interval: Seconds to sleep before each status poll.

    Returns:
        The list produced by ``parse_xml`` on success, or a single-item list
        ``[{"error": "..."}]`` when the operation does not finish in time or
        finishes with an error payload.

    Raises:
        KeyError: If the initial response carries no ``"id"`` field.
        Exceptions raised by ``urllib`` for network/HTTP failures (including
        timeouts) and by ``json`` for malformed bodies propagate unchanged.
    """
    # Nothing to fetch; also avoids a negative slice bound later in parse_xml.
    if max_results <= 0:
        return []

    import base64

    search_type = "SEARCH_TYPE_RU" if lang == "ru" else "SEARCH_TYPE_COM"

    payload = json.dumps({
        "query": {
            "searchType": search_type,
            "queryText": query
        },
        "sortSpec": {"sortType": "SORT_TYPE_RL"},
        "groupingSpec": {
            "groupMode": "GROUP_MODE_DEEP",
            "groupsOnPage": max_results,
            "docsInGroup": 1
        }
    }).encode("utf-8")

    req = urllib.request.Request(
        SEARCH_URL,
        data=payload,
        headers={
            "Authorization": f"Api-Key {API_KEY}",
            "Content-Type": "application/json"
        },
        method="POST"
    )
    operation = _fetch_json(req, timeout)
    operation_id = operation["id"]

    # The poll request never changes, so build it once outside the loop.
    poll_req = urllib.request.Request(
        OPERATION_URL.format(operation_id),
        headers={"Authorization": f"Api-Key {API_KEY}"}
    )

    # Poll until done: at most `poll_attempts` polls, `poll_interval` apart.
    status = operation
    for _ in range(poll_attempts):
        if status.get("done"):
            break
        time.sleep(poll_interval)
        status = _fetch_json(poll_req, timeout)

    if not status.get("done"):
        return [{"error": "Search timed out"}]

    # A finished operation may carry an error instead of a response; report
    # it directly rather than letting it surface as an XML parse failure.
    failure = status.get("error")
    if failure:
        detail = failure.get("message", failure) if isinstance(failure, dict) else failure
        return [{"error": f"Search failed: {detail}"}]

    raw = status.get("response", {}).get("rawData", "")
    xml_data = base64.b64decode(raw).decode("utf-8", errors="ignore")

    return parse_xml(xml_data, max_results)
|
|
|
|
|
|
def parse_xml(xml_data: str, max_results: int) -> list[dict]:
    """Turn a Yandex XML search response into a list of result dicts.

    Only the first ``max_results`` ``<group>`` elements are examined, and
    groups without a ``<doc>`` child are skipped. Each result has the keys
    ``title``, ``url``, ``domain`` and ``snippet``; the snippet joins the
    first two non-empty passages with a space and is cut to 300 characters.

    If the XML cannot be parsed, the single-item list
    ``[{"error": "Failed to parse XML response"}]`` is returned instead.
    """
    try:
        tree = ET.fromstring(xml_data)
    except ET.ParseError:
        return [{"error": "Failed to parse XML response"}]

    def flatten(node) -> str:
        # Concatenate every nested text fragment so inline markup is dropped.
        return "".join(node.itertext()) if node is not None else ""

    def describe(doc) -> dict:
        fragments = [text for text in map(flatten, doc.findall(".//passage")) if text]
        return {
            "title": flatten(doc.find("title")),
            "url": doc.findtext("url", ""),
            "domain": doc.findtext("domain", ""),
            "snippet": " ".join(fragments[:2])[:300],
        }

    candidates = (grp.find("doc") for grp in tree.findall(".//group")[:max_results])
    return [describe(doc) for doc in candidates if doc is not None]
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line: [query] [max_results] [lang]; every argument is optional.
    argv = sys.argv[1:]
    query = argv[0] if len(argv) >= 1 else "openclaw"
    max_r = int(argv[1]) if len(argv) >= 2 else 5
    lang = argv[2] if len(argv) >= 3 else "ru"

    # Print the results as indented JSON, keeping non-ASCII text readable.
    results = search(query, max_r, lang)
    print(json.dumps(results, ensure_ascii=False, indent=2))
|