Relax skill package top-level normalization

This commit is contained in:
Yanli 盐粒 2026-06-25 22:13:06 +08:00
parent c55629c84d
commit b9c6ad82f9
2 changed files with 77 additions and 11 deletions

View File

@ -76,17 +76,28 @@ class SkillPackageService:
def validate_and_normalize(self, *, content: bytes, filename: str) -> NormalizedSkillPackage:
"""Return the canonical drive package for an uploaded skill archive.
The shallowest ``SKILL.md`` defines the skill root. The returned manifest
is normalized to archive-root ``SKILL.md`` and its hash describes the
rebuilt archive bytes. Member read/decompression failures while consuming
the archive are mapped to ``invalid_archive``.
The shallowest ``SKILL.md`` defines the skill root. When exactly one
depth-2 ``<folder>/SKILL.md`` exists, normalization strips that top-level
folder and silently discards all members outside it, including nested
foreign paths. When that unique depth-2 condition does not apply, files
outside the selected skill root still raise ``files_outside_skill_root``.
The returned manifest is normalized to archive-root ``SKILL.md`` and its
hash describes the rebuilt archive bytes. Member read/decompression
failures while consuming the archive are mapped to ``invalid_archive``.
"""
archive = self._open_archive(content=content, filename=filename)
with archive:
members, total_uncompressed = self._collect_file_members(archive)
entry_path = self._find_skill_md([safe_path for _, safe_path in members])
members = self._collect_file_members(archive)
member_paths = [safe_path for _, safe_path in members]
entry_path = self._find_skill_md(member_paths)
strip_prefix = self._skill_root_prefix(entry_path)
normalized_members = self._normalize_members(members=members, skill_root_prefix=strip_prefix)
normalized_members = self._normalize_members(
members=members,
skill_root_prefix=strip_prefix,
ignore_outside_selected_root=self._can_strip_single_top_level_folder(
paths=member_paths, entry_path=entry_path
),
)
skill_md_member = normalized_members[_SKILL_MD_NAME]
self._validate_skill_md_size(skill_md_member)
skill_md_bytes = self._read_member_bytes_from_archive(archive, member_info=skill_md_member)
@ -94,6 +105,7 @@ class SkillPackageService:
normalized_archive_bytes = self._build_normalized_archive(
archive=archive, normalized_members=normalized_members
)
normalized_size = sum(max(info.file_size, 0) for info in normalized_members.values())
name, description = self._parse_skill_md(skill_md)
manifest = SkillManifest(
@ -101,7 +113,7 @@ class SkillPackageService:
description=description,
entry_path=_SKILL_MD_NAME,
files=sorted(normalized_members),
size=total_uncompressed,
size=normalized_size,
hash=hashlib.sha256(normalized_archive_bytes).hexdigest(),
)
return NormalizedSkillPackage(
@ -123,7 +135,7 @@ class SkillPackageService:
except zipfile.BadZipFile as exc:
raise SkillPackageError("invalid_archive", "skill archive is not a valid zip", status_code=400) from exc
def _collect_file_members(self, archive: zipfile.ZipFile) -> tuple[list[tuple[zipfile.ZipInfo, str]], int]:
def _collect_file_members(self, archive: zipfile.ZipFile) -> list[tuple[zipfile.ZipInfo, str]]:
infos = [info for info in archive.infolist() if not info.is_dir()]
if len(infos) > _MAX_ENTRIES:
raise SkillPackageError("too_many_entries", "skill archive has too many files", status_code=400)
@ -139,7 +151,7 @@ class SkillPackageService:
"skill archive uncompressed size exceeds limit",
status_code=400,
)
return members, total_uncompressed
return members
@staticmethod
def _skill_root_prefix(entry_path: str) -> str | None:
@ -153,11 +165,14 @@ class SkillPackageService:
*,
members: list[tuple[zipfile.ZipInfo, str]],
skill_root_prefix: str | None,
ignore_outside_selected_root: bool = False,
) -> dict[str, zipfile.ZipInfo]:
normalized_members: dict[str, zipfile.ZipInfo] = {}
for info, safe_path in members:
if skill_root_prefix is not None:
if not safe_path.startswith(skill_root_prefix):
if ignore_outside_selected_root:
continue
raise SkillPackageError(
"files_outside_skill_root",
"skill archive contains files outside the selected skill root",
@ -229,6 +244,13 @@ class SkillPackageService:
# Prefer the shallowest SKILL.md (skill root).
return min(candidates, key=lambda p: (p.count("/"), len(p)))
@staticmethod
def _can_strip_single_top_level_folder(*, paths: list[str], entry_path: str) -> bool:
if entry_path.count("/") != 1:
return False
candidates = [path for path in paths if path.count("/") == 1 and posixpath.basename(path) == _SKILL_MD_NAME]
return len(candidates) == 1 and candidates[0] == entry_path
@staticmethod
def _read_member_bytes_from_archive(archive: zipfile.ZipFile, *, member_info: zipfile.ZipInfo) -> bytes:
try:

View File

@ -89,6 +89,50 @@ def test_validate_and_normalize_strips_single_top_level_folder():
assert _archive_members(package.archive_bytes) == ["SKILL.md", "scripts/run.py"]
def test_validate_and_normalize_strips_single_top_level_folder_ignoring_other_root_entries():
package = _normalize(
{
"pdf-toolkit/SKILL.md": _SKILL_MD.encode(),
"pdf-toolkit/scripts/run.py": b"print('hi')\n",
"README.md": b"bundle notes\n",
}
)
assert package.manifest.entry_path == "SKILL.md"
assert package.manifest.files == ["SKILL.md", "scripts/run.py"]
assert package.skill_md_bytes == _SKILL_MD.encode()
assert package.strip_prefix == "pdf-toolkit/"
assert _archive_members(package.archive_bytes) == ["SKILL.md", "scripts/run.py"]
def test_validate_and_normalize_strips_single_top_level_folder_dropping_nested_foreign_paths():
package = _normalize(
{
"pdf-toolkit/SKILL.md": _SKILL_MD.encode(),
"pdf-toolkit/scripts/run.py": b"print('hi')\n",
"bundle/other.txt": b"x",
}
)
assert package.manifest.entry_path == "SKILL.md"
assert package.manifest.files == ["SKILL.md", "scripts/run.py"]
assert package.skill_md_bytes == _SKILL_MD.encode()
assert package.strip_prefix == "pdf-toolkit/"
assert _archive_members(package.archive_bytes) == ["SKILL.md", "scripts/run.py"]
def test_validate_and_normalize_rejects_multiple_depth_2_skill_roots_with_sibling_skill_tree():
with pytest.raises(SkillPackageError) as exc_info:
_normalize(
{
"pdf-toolkit/SKILL.md": _SKILL_MD.encode(),
"pdf-toolkit/scripts/run.py": b"print('hi')\n",
"other-tool/SKILL.md": _SKILL_MD.encode(),
}
)
assert exc_info.value.code == "files_outside_skill_root"
def test_validate_and_normalize_strips_deeper_selected_skill_root():
members = {
"bundle/pdf-toolkit/SKILL.md": _SKILL_MD.encode(),
@ -188,7 +232,7 @@ def test_unterminated_frontmatter_falls_back_to_heading():
def test_validate_and_normalize_rejects_files_outside_selected_skill_root():
with pytest.raises(SkillPackageError) as exc_info:
_normalize({"pdf-toolkit/SKILL.md": _SKILL_MD.encode(), "README.md": b"x"})
_normalize({"bundle/pdf-toolkit/SKILL.md": _SKILL_MD.encode(), "README.md": b"x"})
assert exc_info.value.code == "files_outside_skill_root"