diff --git a/api/core/sandbox/inspector.py b/api/core/sandbox/inspector.py deleted file mode 100644 index a7e85c9a38..0000000000 --- a/api/core/sandbox/inspector.py +++ /dev/null @@ -1,462 +0,0 @@ -from __future__ import annotations - -import abc -import json -import logging -import os -import tempfile -from pathlib import Path, PurePosixPath -from uuid import UUID, uuid4 - -from core.sandbox.entities.files import SandboxFileDownloadTicket, SandboxFileNode -from core.sandbox.manager import SandboxManager -from core.sandbox.security.archive_signer import SandboxArchivePath -from core.sandbox.security.sandbox_file_signer import SandboxFileDownloadPath -from core.sandbox.storage import sandbox_file_storage -from core.virtual_environment.__base.exec import CommandExecutionError -from core.virtual_environment.__base.helpers import execute -from core.virtual_environment.__base.virtual_environment import VirtualEnvironment -from extensions.ext_storage import storage - -logger = logging.getLogger(__name__) - - -class SandboxFileSource(abc.ABC): - _LIST_TIMEOUT_SECONDS = 30 - _UPLOAD_TIMEOUT_SECONDS = 60 - _EXPORT_EXPIRES_IN_SECONDS = 60 * 5 - - def __init__(self, *, tenant_id: str, sandbox_id: str): - self._tenant_id = tenant_id - self._sandbox_id = sandbox_id - - @abc.abstractmethod - def list_files(self, *, path: str, recursive: bool) -> list[SandboxFileNode]: - raise NotImplementedError - - @abc.abstractmethod - def download_file(self, *, path: str) -> SandboxFileDownloadTicket: - raise NotImplementedError - - -class SandboxFileRuntimeSource(SandboxFileSource): - def __init__(self, *, tenant_id: str, sandbox_id: str, runtime: VirtualEnvironment): - super().__init__(tenant_id=tenant_id, sandbox_id=sandbox_id) - self._runtime = runtime - - def list_files(self, *, path: str, recursive: bool) -> list[SandboxFileNode]: - script = r""" -import json -import os -import sys - -path = sys.argv[1] -recursive = sys.argv[2] == "1" - -def norm(rel: str) -> str: - rel = 
rel.replace("\\\\", "/") - rel = rel.lstrip("./") - return rel or "." - -def stat_entry(full_path: str, rel_path: str) -> dict: - st = os.stat(full_path) - is_dir = os.path.isdir(full_path) - return { - "path": norm(rel_path), - "is_dir": is_dir, - "size": None if is_dir else int(st.st_size), - "mtime": int(st.st_mtime), - } - -entries = [] -if recursive: - for root, dirs, files in os.walk(path): - for d in dirs: - fp = os.path.join(root, d) - rp = os.path.relpath(fp, ".") - entries.append(stat_entry(fp, rp)) - for f in files: - fp = os.path.join(root, f) - rp = os.path.relpath(fp, ".") - entries.append(stat_entry(fp, rp)) -else: - if os.path.isfile(path): - rel_path = os.path.relpath(path, ".") - entries.append(stat_entry(path, rel_path)) - else: - for item in os.scandir(path): - rel_path = os.path.relpath(item.path, ".") - entries.append(stat_entry(item.path, rel_path)) - -print(json.dumps(entries)) -""" - - try: - result = execute( - self._runtime, - [ - "sh", - "-c", - 'if command -v python3 >/dev/null 2>&1; then py=python3; else py=python; fi; "$py" -c "$0" "$@"', - script, - path, - "1" if recursive else "0", - ], - timeout=self._LIST_TIMEOUT_SECONDS, - error_message="Failed to list sandbox files", - ) - except CommandExecutionError as exc: - raise RuntimeError(str(exc)) from exc - - try: - raw = json.loads(result.stdout.decode("utf-8")) - except Exception as exc: - raise RuntimeError("Malformed sandbox file list output") from exc - - entries: list[SandboxFileNode] = [] - for item in raw: - item_path = str(item.get("path")) - item_is_dir = bool(item.get("is_dir")) - extension = None - if not item_is_dir: - ext = os.path.splitext(item_path)[1] - extension = ext or None - entries.append( - SandboxFileNode( - path=item_path, - is_dir=item_is_dir, - size=item.get("size"), - mtime=item.get("mtime"), - extension=extension, - ) - ) - return entries - - def download_file(self, *, path: str) -> SandboxFileDownloadTicket: - kind = self._detect_path_kind(path) - - 
export_name = os.path.basename(path.rstrip("/")) or "workspace" - filename = f"{export_name}.tar.gz" if kind == "dir" else (os.path.basename(path) or "file") - export_id = uuid4().hex - export_path = SandboxFileDownloadPath( - tenant_id=UUID(self._tenant_id), - sandbox_id=UUID(self._sandbox_id), - export_id=export_id, - filename=filename, - ) - - upload_url = sandbox_file_storage.get_upload_url(export_path, expires_in=self._EXPORT_EXPIRES_IN_SECONDS) - - if kind == "dir": - archive_path = f"/tmp/{export_id}.tar.gz" - try: - execute( - self._runtime, - ["tar", "-czf", archive_path, "-C", ".", path], - timeout=self._UPLOAD_TIMEOUT_SECONDS, - error_message="Failed to archive directory in sandbox", - ) - execute( - self._runtime, - ["curl", "-s", "-f", "-X", "PUT", "-T", archive_path, upload_url], - timeout=self._UPLOAD_TIMEOUT_SECONDS, - error_message="Failed to upload directory archive from sandbox", - ) - except CommandExecutionError as exc: - raise RuntimeError(str(exc)) from exc - finally: - try: - execute( - self._runtime, - ["rm", "-f", archive_path], - timeout=self._LIST_TIMEOUT_SECONDS, - error_message="Failed to cleanup temp archive", - ) - except Exception as exc: - # Best-effort cleanup; do not fail the download on cleanup issues. 
- logger.debug("Failed to cleanup temp archive %s: %s", archive_path, exc) - else: - try: - execute( - self._runtime, - ["curl", "-s", "-f", "-X", "PUT", "-T", path, upload_url], - timeout=self._UPLOAD_TIMEOUT_SECONDS, - error_message="Failed to upload file from sandbox", - ) - except CommandExecutionError as exc: - raise RuntimeError(str(exc)) from exc - - download_url = sandbox_file_storage.get_download_url(export_path, expires_in=self._EXPORT_EXPIRES_IN_SECONDS) - return SandboxFileDownloadTicket( - download_url=download_url, - expires_in=self._EXPORT_EXPIRES_IN_SECONDS, - export_id=export_id, - ) - - def _detect_path_kind(self, path: str) -> str: - script = r""" -import os -import sys - -p = sys.argv[1] -if os.path.isdir(p): - print("dir") - raise SystemExit(0) -if os.path.isfile(p): - print("file") - raise SystemExit(0) -print("none") -raise SystemExit(2) -""" - - try: - result = execute( - self._runtime, - [ - "sh", - "-c", - 'if command -v python3 >/dev/null 2>&1; then py=python3; else py=python; fi; "$py" -c "$0" "$@"', - script, - path, - ], - timeout=self._LIST_TIMEOUT_SECONDS, - error_message="Failed to check path in sandbox", - ) - except CommandExecutionError as exc: - raise ValueError(str(exc)) from exc - - kind = result.stdout.decode("utf-8", errors="replace").strip() - if kind not in ("dir", "file"): - raise ValueError("File not found in sandbox") - return kind - - -class SandboxFileArchiveSource(SandboxFileSource): - def list_files(self, *, path: str, recursive: bool) -> list[SandboxFileNode]: - import tarfile - - archive_path = SandboxArchivePath(tenant_id=UUID(self._tenant_id), sandbox_id=UUID(self._sandbox_id)) - storage_key = archive_path.get_storage_key() - if not storage.exists(storage_key): - raise ValueError("Sandbox archive not found") - - with tempfile.TemporaryDirectory(prefix="dify-sandbox-archive-") as tmpdir: - local_archive = os.path.join(tmpdir, "workspace.tar.gz") - storage.download(storage_key, local_archive) - - entries_by_path: 
dict[str, SandboxFileNode] = {} - - def add_dir(dir_path: str) -> None: - if dir_path in ("", "."): - return - if dir_path not in entries_by_path: - entries_by_path[dir_path] = SandboxFileNode( - path=dir_path, is_dir=True, size=None, mtime=None, extension=None - ) - - def clean(member_name: str) -> str: - name = member_name.lstrip("./") - return name.rstrip("/") - - target_prefix = "" if path in (".", "") else f"{path}/" - - with tarfile.open(local_archive, mode="r:gz") as tf: - for m in tf.getmembers(): - mp = clean(m.name) - if mp in ("", "."): - continue - - if not recursive: - if path in (".", ""): - if "/" in mp: - add_dir(mp.split("/", 1)[0]) - continue - else: - if not mp.startswith(target_prefix): - continue - rest = mp[len(target_prefix) :] - if rest == "": - continue - if "/" in rest: - add_dir(f"{path}/{rest.split('/', 1)[0]}") - continue - else: - if path not in (".", "") and not (mp == path or mp.startswith(target_prefix)): - continue - - parent = os.path.dirname(mp) - while parent not in ("", "."): - if path not in (".", "") and parent == path: - break - add_dir(parent) - parent = os.path.dirname(parent) - - is_dir = m.isdir() - extension = None - if not is_dir: - ext = os.path.splitext(mp)[1] - extension = ext or None - entries_by_path[mp] = SandboxFileNode( - path=mp, - is_dir=is_dir, - size=None if is_dir else int(m.size), - mtime=int(m.mtime) if m.mtime else None, - extension=extension, - ) - - return sorted(entries_by_path.values(), key=lambda e: e.path) - - def download_file(self, *, path: str) -> SandboxFileDownloadTicket: - import tarfile - - archive_path = SandboxArchivePath(tenant_id=UUID(self._tenant_id), sandbox_id=UUID(self._sandbox_id)) - storage_key = archive_path.get_storage_key() - if not storage.exists(storage_key): - raise ValueError("Sandbox archive not found") - - export_name = os.path.basename(path.rstrip("/")) or "workspace" - export_id = uuid4().hex - - # Decide file vs directory inside archive. 
- is_dir_request = path in (".", "") - - with tempfile.TemporaryDirectory(prefix="dify-sandbox-archive-") as tmpdir: - local_archive = os.path.join(tmpdir, "workspace.tar.gz") - storage.download(storage_key, local_archive) - - with tarfile.open(local_archive, mode="r:gz") as tf: - member_name = path.lstrip("./").rstrip("/") - if not is_dir_request: - # If it is an explicit file in archive, treat as file download. - member = None - try: - member = tf.getmember(member_name) - except KeyError: - try: - member = tf.getmember(f"./{member_name}") - except KeyError: - member = None - - if member is not None and not member.isdir(): - export_path = SandboxFileDownloadPath( - tenant_id=UUID(self._tenant_id), - sandbox_id=UUID(self._sandbox_id), - export_id=export_id, - filename=os.path.basename(member_name) or "file", - ) - extracted = tf.extractfile(member) - if extracted is None: - raise ValueError("File not found in sandbox archive") - sandbox_file_storage.save(export_path, extracted.read()) - - download_url = sandbox_file_storage.get_download_url( - export_path, expires_in=self._EXPORT_EXPIRES_IN_SECONDS - ) - return SandboxFileDownloadTicket( - download_url=download_url, - expires_in=self._EXPORT_EXPIRES_IN_SECONDS, - export_id=export_id, - ) - - # Otherwise treat as directory (implied dir is common in tar). 
- is_dir_request = True - - if is_dir_request: - export_path = SandboxFileDownloadPath( - tenant_id=UUID(self._tenant_id), - sandbox_id=UUID(self._sandbox_id), - export_id=export_id, - filename=f"{export_name}.tar.gz", - ) - export_local = os.path.join(tmpdir, "export.tar.gz") - - prefix = "" if member_name in (".", "") else f"{member_name}/" - found_any = False - for m in tf.getmembers(): - src_name = m.name.lstrip("./").rstrip("/") - if member_name not in (".", ""): - if src_name != member_name and not src_name.startswith(prefix): - continue - found_any = True - break - - if not found_any: - raise ValueError("File not found in sandbox archive") - - with tarfile.open(export_local, mode="w:gz") as out: - if member_name not in (".", ""): - dir_info = tarfile.TarInfo(name=member_name) - dir_info.type = tarfile.DIRTYPE - dir_info.size = 0 - out.addfile(dir_info) - - for m in tf.getmembers(): - src_name = m.name.lstrip("./") - if member_name not in (".", ""): - if src_name != member_name and not src_name.startswith(prefix): - continue - ti = tarfile.TarInfo(name=src_name.rstrip("/")) - ti.mode = m.mode - ti.mtime = m.mtime - ti.uid = m.uid - ti.gid = m.gid - ti.uname = m.uname - ti.gname = m.gname - if m.isdir(): - ti.type = tarfile.DIRTYPE - ti.size = 0 - out.addfile(ti) - continue - extracted = tf.extractfile(m) - if extracted is None: - continue - ti.size = int(m.size) - out.addfile(ti, fileobj=extracted) - - sandbox_file_storage.save(export_path, Path(export_local).read_bytes()) - - download_url = sandbox_file_storage.get_download_url( - export_path, expires_in=self._EXPORT_EXPIRES_IN_SECONDS - ) - return SandboxFileDownloadTicket( - download_url=download_url, - expires_in=self._EXPORT_EXPIRES_IN_SECONDS, - export_id=export_id, - ) - - raise ValueError("File not found in sandbox archive") - - -class SandboxFileBrowser: - def __init__(self, *, tenant_id: str, sandbox_id: str): - self._tenant_id = tenant_id - self._sandbox_id = sandbox_id - - @staticmethod - def 
_normalize_workspace_path(path: str | None) -> str: - raw = (path or ".").strip() - if raw == "": - raw = "." - - p = PurePosixPath(raw) - if p.is_absolute(): - raise ValueError("path must be relative") - if any(part == ".." for part in p.parts): - raise ValueError("path must not contain '..'") - - normalized = str(p) - return "." if normalized in (".", "") else normalized - - def _backend(self) -> SandboxFileSource: - runtime = SandboxManager.get(self._sandbox_id) - if runtime is not None: - return SandboxFileRuntimeSource(tenant_id=self._tenant_id, sandbox_id=self._sandbox_id, runtime=runtime) - return SandboxFileArchiveSource(tenant_id=self._tenant_id, sandbox_id=self._sandbox_id) - - def list_files(self, *, path: str | None = None, recursive: bool = False) -> list[SandboxFileNode]: - workspace_path = self._normalize_workspace_path(path) - return self._backend().list_files(path=workspace_path, recursive=recursive) - - def download_file(self, *, path: str) -> SandboxFileDownloadTicket: - workspace_path = self._normalize_workspace_path(path) - return self._backend().download_file(path=workspace_path) diff --git a/api/core/sandbox/inspector/__init__.py b/api/core/sandbox/inspector/__init__.py new file mode 100644 index 0000000000..e259a158a2 --- /dev/null +++ b/api/core/sandbox/inspector/__init__.py @@ -0,0 +1,11 @@ +from core.sandbox.inspector.archive_source import SandboxFileArchiveSource +from core.sandbox.inspector.base import SandboxFileSource +from core.sandbox.inspector.browser import SandboxFileBrowser +from core.sandbox.inspector.runtime_source import SandboxFileRuntimeSource + +__all__ = [ + "SandboxFileArchiveSource", + "SandboxFileBrowser", + "SandboxFileRuntimeSource", + "SandboxFileSource", +] diff --git a/api/core/sandbox/inspector/archive_source.py b/api/core/sandbox/inspector/archive_source.py new file mode 100644 index 0000000000..8876d878cf --- /dev/null +++ b/api/core/sandbox/inspector/archive_source.py @@ -0,0 +1,218 @@ +from __future__ import 
class SandboxFileArchiveSource(SandboxFileSource):
    """Browse and export files from a sandbox's stored workspace archive.

    The archive is fetched through a signed URL into a throwaway ``ZipSandbox``,
    unpacked under ``workspace/`` and inspected with small Python helper scripts
    that are executed inside that sandbox VM.
    """

    # Shell launcher: prefer python3, fall back to python. The script text is
    # passed as $0 and its arguments as "$@".
    _PYTHON_EXEC_CMD = 'if command -v python3 >/dev/null 2>&1; then py=python3; else py=python; fi; "$py" -c "$0" "$@"'

    # Directory (relative to the ZipSandbox working directory) the archive is
    # unpacked into.
    _WORKSPACE_DIR = "workspace"

    # Prints a JSON list of {"path", "is_dir", "size", "mtime"} objects.
    # argv[1] is the path to list, argv[2] is "1" for a recursive walk.
    _LIST_SCRIPT = r"""
import json
import os
import sys

path = sys.argv[1]
recursive = sys.argv[2] == "1"

def norm(rel: str) -> str:
    # Only translate the platform separator; on POSIX a backslash is an
    # ordinary file-name character and must be preserved.
    if os.sep != "/":
        rel = rel.replace(os.sep, "/")
    # Remove literal "./" prefixes. str.lstrip("./") would strip a character
    # set and could eat the leading dot of hidden names such as ".env".
    while rel.startswith("./"):
        rel = rel[2:]
    return rel or "."

def stat_entry(full_path: str, rel_path: str) -> dict:
    try:
        st = os.stat(full_path)
    except OSError:
        # Dangling symlink: describe the link itself instead of aborting
        # the whole listing.
        st = os.lstat(full_path)
    is_dir = os.path.isdir(full_path)
    return {
        "path": norm(rel_path),
        "is_dir": is_dir,
        "size": None if is_dir else int(st.st_size),
        "mtime": int(st.st_mtime),
    }

entries = []
if recursive:
    for root, dirs, files in os.walk(path):
        for name in dirs + files:
            fp = os.path.join(root, name)
            entries.append(stat_entry(fp, os.path.relpath(fp, ".")))
else:
    if os.path.isfile(path):
        entries.append(stat_entry(path, os.path.relpath(path, ".")))
    else:
        for item in os.scandir(path):
            entries.append(stat_entry(item.path, os.path.relpath(item.path, ".")))

print(json.dumps(entries))
"""

    # Prints "dir" or "file" and exits 0; prints "none" and exits 2 otherwise.
    _DETECT_SCRIPT = r"""
import os
import sys

p = sys.argv[1]
if os.path.isdir(p):
    print("dir")
    raise SystemExit(0)
if os.path.isfile(p):
    print("file")
    raise SystemExit(0)
print("none")
raise SystemExit(2)
"""

    def _get_archive_download_url(self) -> str:
        """Return a signed download URL for this sandbox's archive.

        Raises:
            ValueError: if no archive exists in storage for this sandbox.
        """
        archive_path = SandboxArchivePath(tenant_id=UUID(self._tenant_id), sandbox_id=UUID(self._sandbox_id))
        storage_key = archive_path.get_storage_key()
        if not storage.exists(storage_key):
            raise ValueError("Sandbox archive not found")
        return SandboxArchiveSigner.build_signed_url(
            archive_path=archive_path,
            expires_in=self._EXPORT_EXPIRES_IN_SECONDS,
            action=SandboxArchiveSigner.OPERATION_DOWNLOAD,
        )

    def _create_zip_sandbox(self) -> ZipSandbox:
        """Create the ZipSandbox used to unpack and inspect the archive."""
        # Imported lazily; at module level the name is only imported for typing.
        from core.zip_sandbox import ZipSandbox

        return ZipSandbox(tenant_id=self._tenant_id, user_id="system", app_id="sandbox-archive-browser")

    @classmethod
    def _workspace_target(cls, path: str) -> str:
        """Map a workspace-relative path to its location under the unpack directory."""
        if path in (".", ""):
            return cls._WORKSPACE_DIR
        return f"{cls._WORKSPACE_DIR}/{path}"

    def list_files(self, *, path: str, recursive: bool) -> list[SandboxFileNode]:
        """List entries under ``path`` inside the archived workspace.

        Args:
            path: Workspace-relative path; ``"."`` or ``""`` means the workspace root.
            recursive: Walk the whole subtree instead of a single directory level.

        Returns:
            Nodes sorted by path, with paths relative to the workspace root.

        Raises:
            ValueError: if the sandbox archive does not exist.
            RuntimeError: if the listing command fails or prints malformed output.
        """
        archive_url = self._get_archive_download_url()
        workspace_prefix = f"{self._WORKSPACE_DIR}/"

        with self._create_zip_sandbox() as zs:
            archive_path = zs.download_archive(archive_url, path="workspace.tar.gz")
            zs.untar(archive_path=archive_path, dest_dir=self._WORKSPACE_DIR)

            try:
                result = execute(
                    zs.vm,
                    [
                        "sh",
                        "-c",
                        self._PYTHON_EXEC_CMD,
                        self._LIST_SCRIPT,
                        self._workspace_target(path),
                        "1" if recursive else "0",
                    ],
                    timeout=self._LIST_TIMEOUT_SECONDS,
                    error_message="Failed to list sandbox files",
                )
            except CommandExecutionError as exc:
                raise RuntimeError(str(exc)) from exc

            try:
                raw = json.loads(result.stdout.decode("utf-8"))
            except Exception as exc:
                raise RuntimeError("Malformed sandbox file list output") from exc
            if not isinstance(raw, list):
                # Valid JSON of the wrong shape is just as unusable.
                raise RuntimeError("Malformed sandbox file list output")

            entries: list[SandboxFileNode] = []
            for item in raw:
                item_path = str(item.get("path"))
                # The script reports paths relative to the sandbox working
                # directory; callers expect them relative to the workspace root.
                if item_path.startswith(workspace_prefix):
                    item_path = item_path[len(workspace_prefix) :]
                elif item_path == self._WORKSPACE_DIR:
                    continue  # the unpack directory itself is not an entry

                item_is_dir = bool(item.get("is_dir"))
                extension = None if item_is_dir else (os.path.splitext(item_path)[1] or None)
                entries.append(
                    SandboxFileNode(
                        path=item_path,
                        is_dir=item_is_dir,
                        size=item.get("size"),
                        mtime=item.get("mtime"),
                        extension=extension,
                    )
                )
            return sorted(entries, key=lambda e: e.path)

    def download_file(self, *, path: str) -> SandboxFileDownloadTicket:
        """Export a file, or a directory as ``.tar.gz``, from the archived workspace.

        Args:
            path: Workspace-relative path; ``"."`` or ``""`` exports the whole workspace.

        Returns:
            A ticket holding a download URL for the exported object.

        Raises:
            ValueError: if the archive is missing, the path check fails, or the
                path is neither a file nor a directory.
        """
        archive_url = self._get_archive_download_url()
        export_id = uuid4().hex

        with self._create_zip_sandbox() as zs:
            archive_path = zs.download_archive(archive_url, path="workspace.tar.gz")
            zs.untar(archive_path=archive_path, dest_dir=self._WORKSPACE_DIR)

            target_path = self._workspace_target(path)
            try:
                result = execute(
                    zs.vm,
                    [
                        "sh",
                        "-c",
                        self._PYTHON_EXEC_CMD,
                        self._DETECT_SCRIPT,
                        target_path,
                    ],
                    timeout=self._LIST_TIMEOUT_SECONDS,
                    error_message="Failed to check path in sandbox",
                )
            except CommandExecutionError as exc:
                raise ValueError(str(exc)) from exc

            kind = result.stdout.decode("utf-8", errors="replace").strip()
            if kind not in ("dir", "file"):
                raise ValueError("File not found in sandbox archive")

            if kind == "file":
                filename = os.path.basename(path) or "file"
                payload = zs.read_file(target_path)
            else:
                # Directories are shipped as a gzipped tarball that keeps the
                # directory name as the top-level entry.
                export_name = os.path.basename(path.rstrip("/")) or "workspace"
                filename = f"{export_name}.tar.gz"
                tar_file = zs.tar(target_path, include_base=True, compress=True)
                payload = zs.read_file(tar_file.path)

            export_path = SandboxFileDownloadPath(
                tenant_id=UUID(self._tenant_id),
                sandbox_id=UUID(self._sandbox_id),
                export_id=export_id,
                filename=filename,
            )
            sandbox_file_storage.save(export_path, payload)

        download_url = sandbox_file_storage.get_download_url(export_path, expires_in=self._EXPORT_EXPIRES_IN_SECONDS)
        return SandboxFileDownloadTicket(
            download_url=download_url,
            expires_in=self._EXPORT_EXPIRES_IN_SECONDS,
            export_id=export_id,
        )
class SandboxFileBrowser:
    """Facade for listing and downloading files of one sandbox.

    Requests are served by the live runtime when ``SandboxManager`` has one for
    the sandbox id, and by the stored workspace archive otherwise.
    """

    def __init__(self, *, tenant_id: str, sandbox_id: str):
        self._tenant_id = tenant_id
        self._sandbox_id = sandbox_id

    @staticmethod
    def _normalize_workspace_path(path: str | None) -> str:
        """Validate a caller-supplied path and return it in normalized relative form.

        Args:
            path: Workspace-relative POSIX path; ``None``, ``""`` and
                whitespace-only values mean the workspace root.

        Returns:
            ``"."`` for the workspace root, otherwise the normalized relative path.
            A result whose first component starts with ``-`` is prefixed with
            ``./`` so command-line tools that later receive it as an argument
            cannot parse it as an option.

        Raises:
            ValueError: if the path is absolute, contains a ``..`` component,
                or contains a NUL byte.
        """
        raw = (path or ".").strip() or "."
        if "\x00" in raw:
            # A NUL byte can never be part of a valid path and would only fail
            # later, deep inside command execution.
            raise ValueError("path must not contain NUL bytes")

        p = PurePosixPath(raw)
        if p.is_absolute():
            raise ValueError("path must be relative")
        if ".." in p.parts:
            raise ValueError("path must not contain '..'")

        normalized = str(p)
        if normalized in (".", ""):
            return "."
        if normalized.startswith("-"):
            # "./-name" addresses the same entry but is unambiguous as a CLI operand.
            return f"./{normalized}"
        return normalized

    def _backend(self) -> SandboxFileSource:
        """Pick the file source: the live runtime if available, else the archive."""
        runtime = SandboxManager.get(self._sandbox_id)
        if runtime is not None:
            return SandboxFileRuntimeSource(tenant_id=self._tenant_id, sandbox_id=self._sandbox_id, runtime=runtime)
        return SandboxFileArchiveSource(tenant_id=self._tenant_id, sandbox_id=self._sandbox_id)

    def list_files(self, *, path: str | None = None, recursive: bool = False) -> list[SandboxFileNode]:
        """List entries under ``path`` (workspace root by default).

        Raises:
            ValueError: if ``path`` fails validation.
        """
        workspace_path = self._normalize_workspace_path(path)
        return self._backend().list_files(path=workspace_path, recursive=recursive)

    def download_file(self, *, path: str) -> SandboxFileDownloadTicket:
        """Create a download ticket for the file or directory at ``path``.

        Raises:
            ValueError: if ``path`` fails validation.
        """
        workspace_path = self._normalize_workspace_path(path)
        return self._backend().download_file(path=workspace_path)
class SandboxFileRuntimeSource(SandboxFileSource):
    """Browse and export files from a live sandbox runtime.

    Listing and path checks run small Python helper scripts inside the runtime;
    downloads are pushed from the runtime to a signed upload URL with ``curl``.
    """

    # Shell launcher: prefer python3, fall back to python. The script text is
    # passed as $0 and its arguments as "$@".
    _PYTHON_EXEC_CMD = 'if command -v python3 >/dev/null 2>&1; then py=python3; else py=python; fi; "$py" -c "$0" "$@"'

    # Prints a JSON list of {"path", "is_dir", "size", "mtime"} objects.
    # argv[1] is the path to list, argv[2] is "1" for a recursive walk.
    _LIST_SCRIPT = r"""
import json
import os
import sys

path = sys.argv[1]
recursive = sys.argv[2] == "1"

def norm(rel: str) -> str:
    # Only translate the platform separator; on POSIX a backslash is an
    # ordinary file-name character and must be preserved.
    if os.sep != "/":
        rel = rel.replace(os.sep, "/")
    # Remove literal "./" prefixes. str.lstrip("./") would strip a character
    # set and turn hidden names such as ".env" into "env".
    while rel.startswith("./"):
        rel = rel[2:]
    return rel or "."

def stat_entry(full_path: str, rel_path: str) -> dict:
    try:
        st = os.stat(full_path)
    except OSError:
        # Dangling symlink: describe the link itself instead of aborting
        # the whole listing.
        st = os.lstat(full_path)
    is_dir = os.path.isdir(full_path)
    return {
        "path": norm(rel_path),
        "is_dir": is_dir,
        "size": None if is_dir else int(st.st_size),
        "mtime": int(st.st_mtime),
    }

entries = []
if recursive:
    for root, dirs, files in os.walk(path):
        for name in dirs + files:
            fp = os.path.join(root, name)
            entries.append(stat_entry(fp, os.path.relpath(fp, ".")))
else:
    if os.path.isfile(path):
        entries.append(stat_entry(path, os.path.relpath(path, ".")))
    else:
        for item in os.scandir(path):
            entries.append(stat_entry(item.path, os.path.relpath(item.path, ".")))

print(json.dumps(entries))
"""

    # Prints "dir" or "file" and exits 0; prints "none" and exits 2 otherwise.
    _DETECT_SCRIPT = r"""
import os
import sys

p = sys.argv[1]
if os.path.isdir(p):
    print("dir")
    raise SystemExit(0)
if os.path.isfile(p):
    print("file")
    raise SystemExit(0)
print("none")
raise SystemExit(2)
"""

    def __init__(self, *, tenant_id: str, sandbox_id: str, runtime: VirtualEnvironment):
        super().__init__(tenant_id=tenant_id, sandbox_id=sandbox_id)
        self._runtime = runtime

    @staticmethod
    def _as_operand(path: str) -> str:
        """Return ``path`` in a form command-line tools cannot parse as an option."""
        return f"./{path}" if path.startswith("-") else path

    def list_files(self, *, path: str, recursive: bool) -> list[SandboxFileNode]:
        """List entries under ``path`` in the running sandbox.

        Args:
            path: Path handed to the listing script as its first argument.
            recursive: Walk the whole subtree instead of a single directory level.

        Returns:
            Nodes in the order the script reported them (not sorted).

        Raises:
            RuntimeError: if the listing command fails or prints malformed output.
        """
        try:
            result = execute(
                self._runtime,
                [
                    "sh",
                    "-c",
                    self._PYTHON_EXEC_CMD,
                    self._LIST_SCRIPT,
                    path,
                    "1" if recursive else "0",
                ],
                timeout=self._LIST_TIMEOUT_SECONDS,
                error_message="Failed to list sandbox files",
            )
        except CommandExecutionError as exc:
            raise RuntimeError(str(exc)) from exc

        try:
            raw = json.loads(result.stdout.decode("utf-8"))
        except Exception as exc:
            raise RuntimeError("Malformed sandbox file list output") from exc
        if not isinstance(raw, list):
            # Valid JSON of the wrong shape is just as unusable.
            raise RuntimeError("Malformed sandbox file list output")

        entries: list[SandboxFileNode] = []
        for item in raw:
            item_path = str(item.get("path"))
            item_is_dir = bool(item.get("is_dir"))
            extension = None if item_is_dir else (os.path.splitext(item_path)[1] or None)
            entries.append(
                SandboxFileNode(
                    path=item_path,
                    is_dir=item_is_dir,
                    size=item.get("size"),
                    mtime=item.get("mtime"),
                    extension=extension,
                )
            )
        return entries

    def download_file(self, *, path: str) -> SandboxFileDownloadTicket:
        """Upload a file, or a directory as ``.tar.gz``, from the sandbox and return a ticket.

        Args:
            path: Path of the file or directory inside the sandbox.

        Returns:
            A ticket holding a download URL for the uploaded object.

        Raises:
            ValueError: if the path check fails or the path is neither file nor directory.
            RuntimeError: if archiving or uploading inside the sandbox fails.
        """
        kind = self._detect_path_kind(path)

        export_name = os.path.basename(path.rstrip("/")) or "workspace"
        filename = f"{export_name}.tar.gz" if kind == "dir" else (os.path.basename(path) or "file")
        export_id = uuid4().hex
        export_path = SandboxFileDownloadPath(
            tenant_id=UUID(self._tenant_id),
            sandbox_id=UUID(self._sandbox_id),
            export_id=export_id,
            filename=filename,
        )

        upload_url = sandbox_file_storage.get_upload_url(export_path, expires_in=self._EXPORT_EXPIRES_IN_SECONDS)
        # tar and curl receive the path as a plain argument; make sure a name
        # starting with "-" is not read as an option.
        operand = self._as_operand(path)

        if kind == "dir":
            archive_path = f"/tmp/{export_id}.tar.gz"
            try:
                execute(
                    self._runtime,
                    ["tar", "-czf", archive_path, "-C", ".", operand],
                    timeout=self._UPLOAD_TIMEOUT_SECONDS,
                    error_message="Failed to archive directory in sandbox",
                )
                execute(
                    self._runtime,
                    ["curl", "-s", "-f", "-X", "PUT", "-T", archive_path, upload_url],
                    timeout=self._UPLOAD_TIMEOUT_SECONDS,
                    error_message="Failed to upload directory archive from sandbox",
                )
            except CommandExecutionError as exc:
                raise RuntimeError(str(exc)) from exc
            finally:
                try:
                    execute(
                        self._runtime,
                        ["rm", "-f", archive_path],
                        timeout=self._LIST_TIMEOUT_SECONDS,
                        error_message="Failed to cleanup temp archive",
                    )
                except Exception as exc:
                    # Best-effort cleanup; do not fail the download on cleanup issues.
                    logger.debug("Failed to cleanup temp archive %s: %s", archive_path, exc)
        else:
            try:
                execute(
                    self._runtime,
                    ["curl", "-s", "-f", "-X", "PUT", "-T", operand, upload_url],
                    timeout=self._UPLOAD_TIMEOUT_SECONDS,
                    error_message="Failed to upload file from sandbox",
                )
            except CommandExecutionError as exc:
                raise RuntimeError(str(exc)) from exc

        download_url = sandbox_file_storage.get_download_url(export_path, expires_in=self._EXPORT_EXPIRES_IN_SECONDS)
        return SandboxFileDownloadTicket(
            download_url=download_url,
            expires_in=self._EXPORT_EXPIRES_IN_SECONDS,
            export_id=export_id,
        )

    def _detect_path_kind(self, path: str) -> str:
        """Return ``"dir"`` or ``"file"`` for ``path`` inside the sandbox.

        Raises:
            ValueError: if the check command fails or reports anything else.
        """
        try:
            result = execute(
                self._runtime,
                [
                    "sh",
                    "-c",
                    self._PYTHON_EXEC_CMD,
                    self._DETECT_SCRIPT,
                    path,
                ],
                timeout=self._LIST_TIMEOUT_SECONDS,
                error_message="Failed to check path in sandbox",
            )
        except CommandExecutionError as exc:
            # NOTE(review): the script exits 2 for a missing path; presumably
            # execute() reports that as CommandExecutionError - confirm.
            raise ValueError(str(exc)) from exc

        kind = result.stdout.decode("utf-8", errors="replace").strip()
        if kind not in ("dir", "file"):
            raise ValueError("File not found in sandbox")
        return kind
def tar(self, src: str = ".", *, include_base: bool = True, compress: bool = True) -> SandboxFile:
    """Create a tar archive of ``src`` under ``/tmp`` and return a handle to it.

    Args:
        src: Source path to archive (file or directory). ``"."`` or ``""``
            archives the contents of the current directory.
        include_base: If True, the archive keeps the last path component of
            ``src`` as its top-level entry; if False, only the contents of
            ``src`` are stored. Has no effect when ``src`` is ``"."``.
        compress: If True, create a gzipped archive (``.tar.gz``), else ``.tar``.

    Returns:
        SandboxFile handle pointing at the created archive.

    Raises:
        RuntimeError: if the tar command fails.
    """
    src = self._normalize_path(src)
    extension = ".tar.gz" if compress else ".tar"
    out_path = f"/tmp/{uuid4().hex}{extension}"
    create_flag = "-czf" if compress else "-cf"

    # The three modes differ only in the directory tar changes into and the
    # member it stores from there.
    if src in (".", ""):
        base_dir, member = ".", "."
    elif include_base:
        # A trailing slash would make basename() return "" and leave tar
        # with an empty member name.
        trimmed = src.rstrip("/") or src
        base_dir = posixpath.dirname(trimmed) or "."
        member = posixpath.basename(trimmed) or "."
        if member.startswith("-"):
            # Keep tar from parsing the member name as command-line options.
            member = f"./{member}"
    else:
        base_dir, member = src, "."

    try:
        execute(
            self.vm,
            ["tar", create_flag, out_path, "-C", base_dir, member],
            timeout=self._DEFAULT_TIMEOUT_SECONDS,
            error_message="Failed to create tar archive",
        )
    except CommandExecutionError as exc:
        raise RuntimeError(str(exc)) from exc

    return SandboxFile(path=out_path)