mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-06-22 03:20:45 -04:00
148 lines
5.8 KiB
Python
148 lines
5.8 KiB
Python
import json
|
|
|
|
import pytest
|
|
|
|
from .conftest import (
|
|
api_client_request,
|
|
cli_env,
|
|
create_admin_and_token,
|
|
get_free_port,
|
|
init_archive,
|
|
live_api_request,
|
|
parse_jsonl_output,
|
|
run_archivebox_cmd,
|
|
start_archivebox_server,
|
|
stop_server,
|
|
wait_for_live_api,
|
|
)
|
|
|
|
# Module-level pytest mark: applies django_db(transaction=True) to every test in this file.
pytestmark = pytest.mark.django_db(transaction=True)
|
|
|
|
|
|
def test_cli_update_api_accepts_empty_json_without_traceback(client, tmp_path, api_headers):
    """POST an empty JSON object to ``/api/v1/cli/update`` and expect a clean success.

    The response must be HTTP 200, report ``success`` as ``True`` in its JSON
    body, and must not contain the text ``Traceback`` anywhere in its content.
    """
    init_archive(tmp_path)

    try:
        response = api_client_request(client, "post", "/api/v1/cli/update", payload={}, headers=api_headers)
    finally:
        # Teardown runs whether or not the request raised.
        stop_server(tmp_path)

    assert response.status_code == 200, response.content
    body = response.json()
    assert body["success"] is True
    raw_text = response.content.decode()
    assert "Traceback" not in raw_text
|
|
|
|
|
|
@pytest.mark.timeout(180)
def test_cli_update_api_supports_all_snapshot_list_filters_with_real_rows(tmp_path):
    """Check every filter accepted by ``POST /api/v1/cli/update`` against real snapshots.

    Three snapshots are created through the CLI, the first one is updated with
    ``--status=paused``, and a live server is started. Each filter body is then
    posted to the endpoint, and the reported ``snapshot_ids`` / ``matched_count``
    must equal the expected set of snapshots for that filter.
    """
    alpha_url = "https://alpha.example.com/articles/needle"
    beta_url = "https://beta.example.org/posts/haystack"
    gamma_url = "https://docs.archivebox.io/manual"

    cli_only_env = cli_env(disable_extractors=True)
    init_archive(tmp_path)

    def snapshots_by_url(run_env):
        """Run ``snapshot list --sort timestamp`` and key the Snapshot records by URL."""
        listing = run_archivebox_cmd(["snapshot", "list", "--sort", "timestamp"], cwd=tmp_path, env=run_env, check=True)
        return {row["url"]: row for row in parse_jsonl_output(listing.stdout) if row.get("type") == "Snapshot"}

    # (url, title, tags, timestamp, bookmarked_at) for each seeded snapshot.
    seed_rows = (
        (alpha_url, "Needle Alpha", "api-keep", "1700000000", "2023-11-14T22:13:20+00:00"),
        (beta_url, "Haystack Beta", "api-other", "1710000000", "2024-03-09T16:00:00+00:00"),
        (gamma_url, "Manual Gamma", "api-docs", "1720000000", "2024-07-03T09:46:40+00:00"),
    )
    seed_records = [
        {
            "type": "Snapshot",
            "url": url,
            "title": title,
            "tags": tags,
            "timestamp": timestamp,
            "bookmarked_at": bookmarked_at,
        }
        for url, title, tags, timestamp, bookmarked_at in seed_rows
    ]
    # One JSON document per line, each terminated by a newline.
    seed_jsonl = "".join(f"{json.dumps(record)}\n" for record in seed_records)
    run_archivebox_cmd(["snapshot", "create"], cwd=tmp_path, stdin=seed_jsonl, env=cli_only_env, check=True)

    # Feed the freshly listed alpha record back into `snapshot update` to change its status.
    alpha_before_update = snapshots_by_url(cli_only_env)[alpha_url]
    run_archivebox_cmd(
        ["snapshot", "update", "--status=paused"],
        cwd=tmp_path,
        stdin=json.dumps(alpha_before_update),
        env=cli_only_env,
        check=True,
    )

    port = get_free_port()
    # On key conflicts the earlier CLI env wins, because it is unpacked last.
    server_env = {**cli_env(port=port, server=True, PUBLIC_INDEX="True"), **cli_only_env}
    api_token = create_admin_and_token(tmp_path)

    def assert_update_filter(label, body, expected_records):
        """POST ``body`` (plus fixed batch options) and compare the matched snapshot ids."""
        request_body = {**body, "batch_size": 100, "migrate_only": True}
        response = live_api_request(
            port,
            "post",
            "/api/v1/cli/update",
            api_token=api_token,
            json=request_body,
            timeout=30,
        )
        assert response.status_code == 200, f"{label}: {response.text}"
        assert "Traceback" not in response.text
        payload = response.json()
        assert payload["success"] is True, label
        expected_ids = {record["id"] for record in expected_records}
        result = payload["result"]
        assert set(result["snapshot_ids"]) == expected_ids, label
        assert result["matched_count"] == len(expected_ids), label

    try:
        start_archivebox_server(tmp_path, env=server_env, port=port)
        wait_for_live_api(port)

        # Re-list so the records reflect the state after the status update.
        current = snapshots_by_url(server_env)
        alpha = current[alpha_url]
        beta = current[beta_url]
        gamma = current[gamma_url]

        # Expected rows for the status filter come from the CLI's own --status listing.
        status_listing = run_archivebox_cmd(
            ["snapshot", "list", "--status", alpha["status"]],
            cwd=tmp_path,
            env=server_env,
            check=True,
        )
        status_records = [row for row in parse_jsonl_output(status_listing.stdout) if row.get("type") == "Snapshot"]

        # (label, request body, snapshots expected to match)
        cases = [
            ("status", {"status": alpha["status"]}, status_records),
            ("filter_type exact", {"filter_type": "exact", "filter_patterns": [alpha["url"]]}, [alpha]),
            ("filter_type substring", {"filter_type": "substring", "filter_patterns": ["needle"]}, [alpha]),
            ("filter_type regex", {"filter_type": "regex", "filter_patterns": [r"alpha\.example\.com/.+needle"]}, [alpha]),
            ("filter_type domain", {"filter_type": "domain", "filter_patterns": ["alpha.example.com"]}, [alpha]),
            ("filter_type tag", {"filter_type": "tag", "filter_patterns": ["api-keep"]}, [alpha]),
            ("filter_type timestamp", {"filter_type": "timestamp", "filter_patterns": [alpha["timestamp"]]}, [alpha]),
            ("url__icontains", {"url__icontains": "needle"}, [alpha]),
            ("url__istartswith", {"url__istartswith": "https://alpha.example.com"}, [alpha]),
            ("tag", {"tag": "api-keep"}, [alpha]),
            ("crawl_id", {"crawl_id": alpha["crawl_id"]}, [alpha]),
            ("limit and sort", {"limit": 1, "sort": "timestamp"}, [alpha]),
            ("search", {"search": "meta", "filter_patterns": ["Needle Alpha"]}, [alpha]),
            ("before", {"before": 1715000000}, [alpha, beta]),
            ("after", {"after": 1715000000}, [gamma]),
            ("resume", {"resume": beta["timestamp"]}, [alpha, beta]),
        ]
        for case in cases:
            assert_update_filter(*case)
    finally:
        stop_server(tmp_path)
|