Конфиг-only активация двух новых источников GPS-треков поверх pipeline ET-008. Не вводит новых компонентов, БД-таблиц, endpoint'ов. Config: - config/gps_sources.yaml: enduro_russia enabled=true, base_url исправлен на endurorussia.ru (без дефиса); добавлена запись wikiloc с max_tracks_per_run=50, activity_filter=[motorcycle, enduro]. - config/gps_regions.yaml: wikiloc добавлен в tsfo_plus_chuvashia.sources. Parser: - wikiloc.py: добавлен soft-cap max_tracks_per_run в collect(), извлечение created_at из GPX metadata/первого trkpt — для корректной межисточниковой дедупликации с EnduroRussia. UI (src/web/gps_tracks.js): - GPS_SOURCE_COLORS: добавлен цвет wikiloc (#4363d8). - Дефолтный фильтр sources включает wikiloc. - GPS_SOURCE_ATTRIBUTIONS: маппинг source_id → строка атрибуции; _updateGpsAttribution() подтягивает /api/gps-tracks/health и выставляет attribution с теми источниками, у которых tracks > 0. - _buildGpsFiltersUI: чекбокс «Wikiloc» в #gps-source-grid. Tests: - Fixtures: 7 файлов в tests/fixtures/gps-tracks/. - Unit: 10 UT-ER + 10 UT-WL — парсеры, MAPPING, bbox-фильтр, pagination, 429/403 graceful-stop, rate-limit, max_tracks_per_run. - Integration: IT-ER-01, IT-WL-01, IT-WL-02, IT-DEDUP-01, IT-LIC-01 через scripts.gps_collect.main + httpx.MockTransport. - Contract: 2 CT-ER с маркером @pytest.mark.network (nightly only). - JS: 2 новых теста на наличие wikiloc в SOURCE_COLORS и в фильтрах. Linters/Tests: ruff clean (новые файлы), 166 pytest passed, 24 JS-tests passed. Refs: ET-009 Acceptance: AC-01..AC-08, AC-14..AC-17 (для AC-09..AC-13 — продакшн-прогон) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
400 lines
14 KiB
Python
400 lines
14 KiB
Python
"""Парсер Wikiloc — HTML-парсинг публичных треков (ET-009)."""
|
||
import asyncio
|
||
import math
|
||
import logging
|
||
import re
|
||
from typing import AsyncGenerator
|
||
|
||
import defusedxml.ElementTree as ET
|
||
import httpx
|
||
|
||
from src.api.gps_tracks.models import TrackInsert
|
||
from src.api.gps_tracks.sources.base import SourceParser
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# Wikiloc activity codes для поиска
|
||
_ACTIVITY_CODES = {
|
||
"motorcycle": 19, # Motorcycle
|
||
"enduro": 19,
|
||
"mtb": 3, # Mountain biking
|
||
}
|
||
|
||
# Паттерны для парсинга HTML
|
||
_TRACK_URL_RE = re.compile(r'href="(/trails/[^"]+/\d+)"')
|
||
_TRACK_ID_RE = re.compile(r'/trails/[^/]+/(\d+)')
|
||
_GPX_LINK_RE = re.compile(r'href="([^"]*download[^"]*\.gpx[^"]*|[^"]*\.gpx[^"]*download[^"]*)"' , re.IGNORECASE)
|
||
_TRAIL_JSON_RE = re.compile(r'wikiloc\.trail\s*=\s*(\{.*?\});', re.DOTALL)
|
||
|
||
|
||
class WikilocParser(SourceParser):
    """Wikiloc source parser based on scraping the public search pages.

    Wikiloc has no public API, so tracks are discovered by parsing HTML search
    results and then downloading each track's GPX file. Every request is
    preceded by a ``rate_limit_sec`` pause (10 s by default). A 403/429
    response from any endpoint ends the run gracefully instead of raising.
    """

    # Wikiloc activity name -> activity type stored on the yielded track.
    MAPPING = {
        "motorcycle": "moto",
        "enduro": "enduro",
        "mtb": "bicycle",
        "mountain biking": "bicycle",
        "hiking": "hike",
        "running": "hike",
        "trail running": "hike",
        "offroad": "offroad",
    }

    async def collect(self, bbox: tuple, ctx: dict) -> AsyncGenerator[TrackInsert, None]:
        """Yield tracks found by the Wikiloc search inside ``bbox``.

        For every configured activity the search pages are walked one by one;
        for each listed track the track page and then its GPX file are
        downloaded, parsed and filtered by bounding box.

        Args:
            bbox: ``(west, south, east, north)``.
            ctx: execution context; not used by this parser.

        Yields:
            ``TrackInsert`` objects whose bounding box intersects ``bbox``.
            When the ``max_tracks_per_run`` config key is set, at most that
            many tracks are yielded per call.
        """
        west, south, east, north = bbox
        base_url = self.config.get("base_url", "https://www.wikiloc.com").rstrip("/")
        rate_limit = self.config.get("rate_limit_sec", 10)
        user_agent = self.config.get("user_agent", "enduro-trails/1.0")
        source_id = self.config.get("id", "wikiloc")
        source_priority = self.config.get("source_priority", 70)
        activity_filter = self.config.get("activity_filter", ["motorcycle", "enduro"])
        max_tracks = self.config.get("max_tracks_per_run")
        yielded = 0

        # A non-positive cap means nothing may be yielded: skip all requests.
        if max_tracks is not None and max_tracks <= 0:
            logger.info(
                "Wikiloc: reached max_tracks_per_run=%d, stopping",
                max_tracks,
            )
            return

        headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }

        async with httpx.AsyncClient(
            timeout=30,
            headers=headers,
            follow_redirects=True,
        ) as client:
            # NOTE(review): "motorcycle" and "enduro" share search code 19, so
            # the default filter runs the same search twice and can yield the
            # same track under two activity types - confirm this is intended.
            for activity in activity_filter:
                act_code = _ACTIVITY_CODES.get(activity, 19)

                # Track ids already handled for this activity. Used to skip
                # repeats and to detect a server that ignores ``page``.
                seen_ids = set()
                page = 0
                while True:
                    # Search URL restricted to the bbox.
                    search_url = (
                        f"{base_url}/wikiloc/find.do"
                        f"?act={act_code}"
                        f"&sw={south},{west}"
                        f"&ne={north},{east}"
                        f"&page={page}"
                    )

                    try:
                        await asyncio.sleep(rate_limit)
                        resp = await client.get(search_url)
                    except Exception as exc:
                        logger.error("Wikiloc: failed to fetch search page: %s", exc)
                        return

                    if resp.status_code in (403, 429):
                        logger.warning(
                            "Wikiloc: received %d on search, graceful stop",
                            resp.status_code,
                        )
                        return

                    if resp.status_code != 200:
                        logger.warning("Wikiloc: search returned %d", resp.status_code)
                        break

                    html = resp.text
                    track_paths = _extract_track_paths(html)

                    if not track_paths:
                        logger.info("Wikiloc: no tracks on page %d for activity %s", page, activity)
                        break

                    new_on_page = 0
                    for path in track_paths:
                        track_id_match = _TRACK_ID_RE.search(path)
                        if not track_id_match:
                            continue
                        track_id = track_id_match.group(1)
                        if track_id in seen_ids:
                            continue
                        seen_ids.add(track_id)
                        new_on_page += 1
                        track_url = f"{base_url}{path}"

                        # Fetch the track page to find the GPX link.
                        try:
                            await asyncio.sleep(rate_limit)
                            track_resp = await client.get(track_url)
                        except Exception as exc:
                            logger.error("Wikiloc: failed to fetch track %s: %s", track_id, exc)
                            continue

                        if track_resp.status_code in (403, 429):
                            logger.warning(
                                "Wikiloc: received %d on track %s, graceful stop",
                                track_resp.status_code,
                                track_id,
                            )
                            return

                        if track_resp.status_code != 200:
                            logger.warning("Wikiloc: track %s returned %d", track_id, track_resp.status_code)
                            continue

                        track_html = track_resp.text

                        # Locate the GPX download link.
                        gpx_url = _extract_gpx_url(track_html, base_url, track_id)
                        if not gpx_url:
                            logger.debug("Wikiloc: no GPX link found for track %s", track_id)
                            continue

                        # Download the GPX file.
                        try:
                            await asyncio.sleep(rate_limit)
                            gpx_resp = await client.get(gpx_url)
                        except Exception as exc:
                            logger.error("Wikiloc: failed to fetch GPX %s: %s", track_id, exc)
                            continue

                        if gpx_resp.status_code in (403, 429):
                            logger.warning(
                                "Wikiloc: received %d on GPX %s, graceful stop",
                                gpx_resp.status_code,
                                track_id,
                            )
                            return

                        if gpx_resp.status_code != 200:
                            logger.warning("Wikiloc: GPX %s returned %d", track_id, gpx_resp.status_code)
                            continue

                        # Parse the GPX into a TrackInsert.
                        name = _extract_track_name(track_html)
                        track = _parse_gpx(
                            gpx_resp.content,
                            track_id=track_id,
                            name=name,
                            activity_type=self.MAPPING.get(activity, "moto"),
                            source_id=source_id,
                            track_url=track_url,
                            source_priority=source_priority,
                        )
                        if track is None:
                            continue

                        if not _bbox_intersects(
                            (track.min_lon, track.min_lat, track.max_lon, track.max_lat),
                            (west, south, east, north),
                        ):
                            continue

                        yield track
                        yielded += 1

                        # Stop right after the last allowed track so that no
                        # rate-limited request is spent on a track that would
                        # be thrown away.
                        if max_tracks is not None and yielded >= max_tracks:
                            logger.info(
                                "Wikiloc: reached max_tracks_per_run=%d, stopping",
                                max_tracks,
                            )
                            return

                    if new_on_page == 0:
                        # Every id on this page was already seen: the server is
                        # repeating results, so further pages would loop forever.
                        logger.info(
                            "Wikiloc: page %d for activity %s has no new tracks, stopping pagination",
                            page,
                            activity,
                        )
                        break

                    page += 1
|
||
|
||
|
||
def _extract_track_paths(html: str) -> list:
    """Return the unique track paths linked from a Wikiloc search page.

    Only hrefs matched by ``_TRACK_URL_RE`` that also contain a track id
    recognised by ``_TRACK_ID_RE`` are kept. First-occurrence order is
    preserved.
    """
    candidates = (
        href
        for href in _TRACK_URL_RE.findall(html)
        if _TRACK_ID_RE.search(href)
    )
    # dict keys keep insertion order, which gives an ordered de-duplication.
    return list(dict.fromkeys(candidates))
|
||
|
||
|
||
def _extract_gpx_url(html: str, base_url: str, track_id: str) -> str | None:
    """Build the GPX download URL for a track page.

    Three strategies are tried in order:

    1. an explicit download link found by ``_GPX_LINK_RE``;
    2. a ``downloadTrail.do?id=<digits>`` reference anywhere in the page;
    3. the standard download URL built from ``track_id``.

    Args:
        html: HTML of the track page.
        base_url: site root without a trailing slash.
        track_id: numeric track id used for the last-resort URL.

    Returns:
        The download URL. Because strategy 3 always succeeds, ``None`` is
        never actually returned; the annotation is kept for callers.
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import unescape

    # Strategy 1: direct GPX link.
    link = _GPX_LINK_RE.search(html)
    if link:
        # Attribute values are HTML-escaped ("&amp;" etc.); decode before use.
        href = unescape(link.group(1)).strip()
        if href.lower().startswith(("http://", "https://")):
            return href
        if href.startswith("//"):
            # Protocol-relative link: reuse the scheme of the configured site.
            scheme = base_url.split("://", 1)[0] if "://" in base_url else "https"
            return f"{scheme}:{href}"
        if not href.startswith("/"):
            # The page URL is not known here, so a relative link is resolved
            # against the site root rather than glued onto the host name.
            href = "/" + href
        return base_url + href

    # Strategy 2: standard Wikiloc download URL mentioned in the page, e.g.
    # https://www.wikiloc.com/wikiloc/downloadTrail.do?id=XXXXX
    download_ref = re.search(r'downloadTrail\.do\?id=(\d+)', html)
    if download_ref:
        return f"{base_url}/wikiloc/downloadTrail.do?id={download_ref.group(1)}"

    # Strategy 3: fall back to the id taken from the track URL.
    return f"{base_url}/wikiloc/downloadTrail.do?id={track_id}"
|
||
|
||
|
||
def _extract_track_name(html: str) -> str | None:
|
||
"""Извлекает название трека из HTML страницы."""
|
||
# Ищем <h1> или <title>
|
||
m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
|
||
if m:
|
||
return m.group(1).strip()
|
||
m = re.search(r'<title>([^<|]+)', html)
|
||
if m:
|
||
return m.group(1).strip()
|
||
return None
|
||
|
||
|
||
def _parse_gpx(
    content: bytes,
    track_id: str,
    name: str | None,
    activity_type: str,
    source_id: str,
    track_url: str,
    source_priority: int,
) -> "TrackInsert | None":
    """Parse a Wikiloc GPX file into a ``TrackInsert``.

    Args:
        content: raw GPX document.
        track_id: Wikiloc track id, stored as ``external_id``.
        name: name taken from the track page; when empty, the GPX
            ``<metadata><name>`` is used instead.
        activity_type: mapped activity type; replaced by ``"moto"`` when it is
            not one of ``ACTIVITY_TYPES``.
        source_id: id of the source, stored on the track.
        track_url: public URL of the track page, stored as ``external_url``.
        source_priority: priority stored on the track.

    Returns:
        The track, or ``None`` when the document cannot be parsed, has fewer
        than two usable points, is shorter than 10 m, or its geometry cannot
        be built.
    """
    try:
        root = ET.fromstring(content)
    except Exception as exc:
        logger.error("Wikiloc: failed to parse GPX %s: %s", track_id, exc)
        return None

    # GPX elements are namespaced; reuse the root's "{uri}" prefix to compare
    # local tag names.
    root_tag = root.tag
    ns = root_tag.split("}")[0] + "}" if root_tag.startswith("{") else ""

    def local_name(element) -> str:
        return element.tag.replace(ns, "") if ns else element.tag

    # Name and creation time from the first <metadata> element.
    created_at = None
    metadata = next((el for el in root if local_name(el) == "metadata"), None)
    if metadata is not None:
        for item in metadata:
            text = (item.text or "").strip()
            if not text:
                continue
            kind = local_name(item)
            if kind == "name" and not name:
                name = text
            elif kind == "time" and not created_at:
                created_at = text

    # Single pass over trk/trkseg/trkpt: collect coordinates and, when the
    # metadata had no time, fall back to the first <trkpt><time> found.
    coords = []
    for trk in root:
        if local_name(trk) != "trk":
            continue
        for trkseg in trk:
            if local_name(trkseg) != "trkseg":
                continue
            for trkpt in trkseg:
                if local_name(trkpt) != "trkpt":
                    continue

                if not created_at:
                    for sub in trkpt:
                        if local_name(sub) == "time" and sub.text:
                            created_at = sub.text.strip()
                            break

                lat_raw = trkpt.get("lat")
                lon_raw = trkpt.get("lon")
                if lat_raw is None or lon_raw is None:
                    # A missing attribute must not silently become 0.
                    continue
                try:
                    lat = float(lat_raw)
                    lon = float(lon_raw)
                except (TypeError, ValueError):
                    continue
                # Range check also rejects NaN/inf (comparisons are False),
                # which would otherwise slip past the length filter below.
                if not (-90.0 <= lat <= 90.0 and -180.0 <= lon <= 180.0):
                    continue
                if lat == 0 and lon == 0:
                    # (0, 0) is treated as a placeholder, not a real fix.
                    continue
                coords.append((lon, lat))

    if len(coords) < 2:
        logger.debug("Wikiloc: track %s has < 2 points, skipping", track_id)
        return None

    lons = [c[0] for c in coords]
    lats = [c[1] for c in coords]
    min_lon, max_lon = min(lons), max(lons)
    min_lat, max_lat = min(lats), max(lats)

    length_m = _calc_track_length(coords)
    if length_m < 10:
        return None

    try:
        # Imported lazily, inside the try, so that a missing or broken shapely
        # only skips this track.
        from shapely.geometry import LineString
        from shapely import wkb
        geom_wkb = wkb.dumps(LineString(coords))
    except Exception as exc:
        logger.error("Wikiloc: shapely error for track %s: %s", track_id, exc)
        return None

    from src.api.gps_tracks.models import ACTIVITY_TYPES
    if activity_type not in ACTIVITY_TYPES:
        activity_type = "moto"

    return TrackInsert(
        external_id=str(track_id),
        source_id=source_id,
        external_url=track_url,
        name=name,
        description=None,
        activity_type=activity_type,
        user=None,
        created_at=created_at,
        length_m=length_m,
        points_count=len(coords),
        geom_wkb=geom_wkb,
        min_lon=min_lon,
        min_lat=min_lat,
        max_lon=max_lon,
        max_lat=max_lat,
        tags=[],
        source_priority=source_priority,
    )
|
||
|
||
|
||
def _haversine_m(lon1: float, lat1: float, lon2: float, lat2: float) -> float:
|
||
"""Расстояние между двумя точками в метрах (Haversine)."""
|
||
R = 6371000
|
||
phi1, phi2 = math.radians(lat1), math.radians(lat2)
|
||
dphi = math.radians(lat2 - lat1)
|
||
dlam = math.radians(lon2 - lon1)
|
||
a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
|
||
return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
|
||
|
||
|
||
def _calc_track_length(coords: list) -> float:
    """Return the total length of a track in metres.

    ``coords`` is a list of ``(lon, lat, ...)`` items; the haversine distances
    between consecutive points are added up in order. Fewer than two points
    give ``0.0``.
    """
    length = 0.0
    # Pair every point with its successor.
    for start, end in zip(coords, coords[1:]):
        length += _haversine_m(start[0], start[1], end[0], end[1])
    return length
|
||
|
||
|
||
def _bbox_intersects(a: tuple, b: tuple) -> bool:
|
||
"""Проверяет пересечение двух bbox (west, south, east, north)."""
|
||
a_west, a_south, a_east, a_north = a
|
||
b_west, b_south, b_east, b_north = b
|
||
return not (
|
||
a_east < b_west or a_west > b_east or
|
||
a_north < b_south or a_south > b_north
|
||
)
|