diff --git a/.github/workflows/jit.yml b/.github/workflows/jit.yml index 81db07fffa5eeb..9ef2052bab5dad 100644 --- a/.github/workflows/jit.yml +++ b/.github/workflows/jit.yml @@ -21,7 +21,7 @@ concurrency: env: FORCE_COLOR: 1 - LLVM_VERSION: 19 + LLVM_VERSION: 21 jobs: interpreter: @@ -168,7 +168,6 @@ jobs: fail-fast: false matrix: include: - - name: JIT without optimizations (Debug) configure_flags: --enable-experimental-jit --with-pydebug test_env: "PYTHON_UOPS_OPTIMIZE=0" diff --git a/.github/workflows/tail-call.yml b/.github/workflows/tail-call.yml index f1e342bbac28a7..42474fcc1c321e 100644 --- a/.github/workflows/tail-call.yml +++ b/.github/workflows/tail-call.yml @@ -20,7 +20,7 @@ concurrency: env: FORCE_COLOR: 1 - LLVM_VERSION: 20 + LLVM_VERSION: 21 jobs: windows: diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-19-10-32-28.gh-issue-136895.HfsEh0.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-19-10-32-28.gh-issue-136895.HfsEh0.rst new file mode 100644 index 00000000000000..0a615ed131127f --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-19-10-32-28.gh-issue-136895.HfsEh0.rst @@ -0,0 +1 @@ +Update JIT compilation to use LLVM 21 at build time. diff --git a/PCbuild/get_external.py b/PCbuild/get_external.py index a78aa6a23041ad..cc9087c792d318 100755 --- a/PCbuild/get_external.py +++ b/PCbuild/get_external.py @@ -3,6 +3,7 @@ import argparse import os import pathlib +import shutil import sys import time import urllib.error @@ -22,15 +23,13 @@ def retrieve_with_retries(download_location, output_path, reporthook, ) except (urllib.error.URLError, ConnectionError) as ex: if attempt == max_retries: - msg = f"Download from {download_location} failed." 
- raise OSError(msg) from ex + raise OSError(f'Download from {download_location} failed.') from ex time.sleep(2.25**attempt) else: return resp - def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose): - repo = f'cpython-{"bin" if binary else "source"}-deps' + repo = 'cpython-bin-deps' if binary else 'cpython-source-deps' url = f'https://github.com/{org}/{repo}/archive/{commit_hash}.zip' reporthook = None if verbose: @@ -44,6 +43,29 @@ def fetch_zip(commit_hash, zip_dir, *, org='python', binary=False, verbose): return filename +def fetch_release(tag, tarball_dir, *, org='python', verbose=False): + url = f'https://github.com/{org}/cpython-bin-deps/releases/download/{tag}/{tag}.tar.xz' + reporthook = None + if verbose: + reporthook = print + tarball_dir.mkdir(parents=True, exist_ok=True) + output_path = tarball_dir / f'{tag}.tar.xz' + retrieve_with_retries(url, output_path, reporthook) + return output_path + + +def extract_tarball(externals_dir, tarball_path, tag): + output_path = externals_dir / tag + try: + shutil.unpack_archive(os.fspath(tarball_path), os.fspath(output_path)) + except Exception as ex: + raise OSError( + f'Failed to extract {tarball_path}. The archive may be ' + f'corrupted; try deleting it and re-running.' 
+ ) from ex + return output_path + + def extract_zip(externals_dir, zip_path): with zipfile.ZipFile(os.fspath(zip_path)) as zf: zf.extractall(os.fspath(externals_dir)) @@ -55,6 +77,8 @@ def parse_args(): p.add_argument('-v', '--verbose', action='store_true') p.add_argument('-b', '--binary', action='store_true', help='Is the dependency in the binary repo?') + p.add_argument('-r', '--release', action='store_true', + help='Download from GitHub release assets instead of branch') p.add_argument('-O', '--organization', help='Organization owning the deps repos', default='python') p.add_argument('-e', '--externals-dir', type=pathlib.Path, @@ -67,15 +91,36 @@ def parse_args(): def main(): args = parse_args() - zip_path = fetch_zip( - args.tag, - args.externals_dir / 'zips', - org=args.organization, - binary=args.binary, - verbose=args.verbose, - ) final_name = args.externals_dir / args.tag - extracted = extract_zip(args.externals_dir, zip_path) + + # Check if the dependency already exists in externals/ directory + # (either already downloaded/extracted, or checked into the git tree) + if final_name.exists(): + if args.verbose: + print(f'{args.tag} already exists at {final_name}, skipping download.') + return + + # Determine download method: release artifacts for large deps (like LLVM), + # otherwise zip download from GitHub branches + if args.release: + tarball_path = fetch_release( + args.tag, + args.externals_dir / 'tarballs', + org=args.organization, + verbose=args.verbose, + ) + extracted = extract_tarball(args.externals_dir, tarball_path, args.tag) + else: + # Use zip download from GitHub branches + # (cpython-bin-deps if --binary, cpython-source-deps otherwise) + zip_path = fetch_zip( + args.tag, + args.externals_dir / 'zips', + org=args.organization, + binary=args.binary, + verbose=args.verbose, + ) + extracted = extract_zip(args.externals_dir, zip_path) for wait in [1, 2, 3, 5, 8, 0]: try: extracted.replace(final_name) diff --git a/PCbuild/get_externals.bat 
b/PCbuild/get_externals.bat index 215b1c9f781d91..cea53ce9c5e0ee 100644 --- a/PCbuild/get_externals.bat +++ b/PCbuild/get_externals.bat @@ -1,5 +1,5 @@ @echo off -setlocal +setlocal EnableDelayedExpansion rem Simple script to fetch source for external libraries if NOT DEFINED PCBUILD (set PCBUILD=%~dp0) @@ -82,7 +82,7 @@ if NOT "%IncludeLibffi%"=="false" set binaries=%binaries% libffi-3.4.4 if NOT "%IncludeSSL%"=="false" set binaries=%binaries% openssl-bin-3.0.19 if NOT "%IncludeTkinter%"=="false" set binaries=%binaries% tcltk-8.6.15.0 if NOT "%IncludeSSLSrc%"=="false" set binaries=%binaries% nasm-2.11.06 -if NOT "%IncludeLLVM%"=="false" set binaries=%binaries% llvm-19.1.7.0 +if NOT "%IncludeLLVM%"=="false" set binaries=%binaries% llvm-21.1.4.0 for %%b in (%binaries%) do ( if exist "%EXTERNALS_DIR%\%%b" ( @@ -92,7 +92,9 @@ for %%b in (%binaries%) do ( git clone --depth 1 https://github.com/%ORG%/cpython-bin-deps --branch %%b "%EXTERNALS_DIR%\%%b" ) else ( echo.Fetching %%b... - %PYTHON% -E "%PCBUILD%\get_external.py" -b -O %ORG% -e "%EXTERNALS_DIR%" %%b + set _fetch_args=--binary + echo %%b | findstr /B "llvm-" >nul && set _fetch_args=--release + %PYTHON% -E "%PCBUILD%\get_external.py" !_fetch_args! --organization %ORG% --externals-dir "%EXTERNALS_DIR%" %%b ) ) diff --git a/Python/jit.c b/Python/jit.c index 9fbd8a18590411..4a10a99f476c3d 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -419,15 +419,43 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value) } void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state); +void patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state); #include "jit_stencils.h" #if defined(__aarch64__) || defined(_M_ARM64) #define TRAMPOLINE_SIZE 16 + #define DATA_ALIGN 8 +#elif defined(__x86_64__) || defined(_M_X64) + // x86_64 trampolines: 14 bytes (jmp *(%rip) + 8-byte addr) + 2 bytes padding. 
+ // Currently used on macOS where LLVM 21 GOT entries may exceed ±2GB + // PC-relative range, but enabled on all x86_64 platforms defensively. + #define TRAMPOLINE_SIZE 16 + #define DATA_ALIGN 8 #else #define TRAMPOLINE_SIZE 0 + #define DATA_ALIGN 1 #endif +// Get the trampoline memory location for a given symbol ordinal. +static unsigned char * +get_trampoline_slot(int ordinal, jit_state *state) +{ + const uint32_t symbol_mask = 1 << (ordinal % 32); + const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; + assert(symbol_mask & trampoline_mask); + + // Count the number of set bits in the trampoline mask lower than ordinal. + int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); + for (int i = 0; i < ordinal / 32; i++) { + index += _Py_popcount32(state->trampolines.mask[i]); + } + + unsigned char *trampoline = state->trampolines.mem + index * TRAMPOLINE_SIZE; + assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); + return trampoline; +} + // Generate and patch AArch64 trampolines. The symbols to jump to are stored // in the jit_stencils.h in the symbols_map. void @@ -444,20 +472,8 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) return; } - // Masking is done modulo 32 as the mask is stored as an array of uint32_t - const uint32_t symbol_mask = 1 << (ordinal % 32); - const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; - assert(symbol_mask & trampoline_mask); - - // Count the number of set bits in the trampoline mask lower than ordinal, - // this gives the index into the array of trampolines. 
- int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); - for (int i = 0; i < ordinal / 32; i++) { - index += _Py_popcount32(state->trampolines.mask[i]); - } - - uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); - assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); + // Out of range - need a trampoline + uint32_t *p = (uint32_t *)get_trampoline_slot(ordinal, state); /* Generate the trampoline @@ -474,6 +490,37 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) patch_aarch64_26r(location, (uintptr_t)p); } +// Generate and patch x86_64 trampolines. +void +patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state) +{ + uint64_t value = (uintptr_t)symbols_map[ordinal]; + int64_t range = (int64_t)value - 4 - (int64_t)location; + + // If we are in range of 32 signed bits, we can patch directly + if (range >= -(1LL << 31) && range < (1LL << 31)) { + patch_32r(location, value - 4); + return; + } + + // Out of range - need a trampoline + unsigned char *trampoline = get_trampoline_slot(ordinal, state); + + /* Generate the trampoline (14 bytes, padded to 16): + 0: ff 25 00 00 00 00 jmp *(%rip) + 6: XX XX XX XX XX XX XX XX (64-bit target address) + + Reference: https://wiki.osdev.org/X86-64_Instruction_Encoding#FF (JMP r/m64) + */ + trampoline[0] = 0xFF; + trampoline[1] = 0x25; + memset(trampoline + 2, 0, 4); + memcpy(trampoline + 6, &value, 8); + + // Patch the call site to call the trampoline instead + patch_32r(location, (uintptr_t)trampoline - 4); +} + static void combine_symbol_mask(const symbol_mask src, symbol_mask dest) { @@ -515,8 +562,13 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz // Round up to the nearest page: size_t page_size = get_page_size(); assert((page_size & (page_size - 1)) == 0); - size_t padding = page_size - ((code_size + state.trampolines.size + data_size) & (page_size - 1)); - size_t total_size = 
code_size + state.trampolines.size + data_size + padding; + size_t code_padding = + DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1)); + size_t padding = page_size - + ((code_size + state.trampolines.size + code_padding + data_size) & + (page_size - 1)); + size_t total_size = + code_size + state.trampolines.size + code_padding + data_size + padding; unsigned char *memory = jit_alloc(total_size); if (memory == NULL) { return -1; @@ -535,7 +587,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz // Loop again to emit the code: unsigned char *code = memory; state.trampolines.mem = memory + code_size; - unsigned char *data = memory + code_size + state.trampolines.size; + unsigned char *data = memory + code_size + state.trampolines.size + code_padding; // Compile the shim, which handles converting between the native // calling convention and the calling convention used by jitted code // (which may be different for efficiency reasons). @@ -557,7 +609,9 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz code += group->code_size; data += group->data_size; assert(code == memory + code_size); - assert(data == memory + code_size + state.trampolines.size + data_size); + assert( + data == + memory + code_size + state.trampolines.size + code_padding + data_size); if (mark_executable(memory, total_size)) { jit_free(memory, total_size); return -1; diff --git a/Tools/jit/README.md b/Tools/jit/README.md index 8e817574b4d72b..85ba8bd806ef72 100644 --- a/Tools/jit/README.md +++ b/Tools/jit/README.md @@ -9,32 +9,32 @@ Python 3.11 or newer is required to build the JIT. The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon). -LLVM version 19 is required. 
Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-19`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. +LLVM version 21 is the officially supported version. You can override it, if needed, by setting the `LLVM_VERSION` environment variable during configure. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-21`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. It's easy to install all of the required tools: ### Linux -Install LLVM 19 on Ubuntu/Debian: +Install LLVM 21 on Ubuntu/Debian: ```sh wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh -sudo ./llvm.sh 19 +sudo ./llvm.sh 21 ``` -Install LLVM 19 on Fedora Linux 40 or newer: +Install LLVM 21 on Fedora Linux 40 or newer: ```sh -sudo dnf install 'clang(major) = 19' 'llvm(major) = 19' +sudo dnf install 'clang(major) = 21' 'llvm(major) = 21' ``` ### macOS -Install LLVM 19 with [Homebrew](https://brew.sh): +Install LLVM 21 with [Homebrew](https://brew.sh): ```sh -brew install llvm@19 +brew install llvm@21 ``` Homebrew won't add any of the tools to your `$PATH`. That's okay; the build script knows how to find them. @@ -43,14 +43,18 @@ Homebrew won't add any of the tools to your `$PATH`. That's okay; the build scri LLVM is downloaded automatically (along with other external binary dependencies) by `PCbuild\build.bat`. -Otherwise, you can install LLVM 19 [by searching for it on LLVM's GitHub releases page](https://github.com/llvm/llvm-project/releases?q=19), clicking on "Assets", downloading the appropriate Windows installer for your platform (likely the file ending with `-win64.exe`), and running it.
**When installing, be sure to select the option labeled "Add LLVM to the system PATH".** +Otherwise, you can install LLVM 21 [by searching for it on LLVM's GitHub releases page](https://github.com/llvm/llvm-project/releases?q=21), clicking on "Assets", downloading the appropriate Windows installer for your platform (likely the file ending with `-win64.exe`), and running it. **When installing, be sure to select the option labeled "Add LLVM to the system PATH".** Alternatively, you can use [chocolatey](https://chocolatey.org): ```sh -choco install llvm --version=19.1.0 +choco install llvm --version=21.1.8 ``` +### Dev Containers + +If you are working on CPython in a [Codespaces instance](https://devguide.python.org/getting-started/setup-building/#using-codespaces), there's no +need to install LLVM as the Fedora 42 base image includes LLVM 21 out of the box. ## Building diff --git a/Tools/jit/_llvm.py b/Tools/jit/_llvm.py index f09a8404871b24..1447b5f07a6332 100644 --- a/Tools/jit/_llvm.py +++ b/Tools/jit/_llvm.py @@ -10,9 +10,9 @@ import _targets -_LLVM_VERSION = 19 +_LLVM_VERSION = 21 _LLVM_VERSION_PATTERN = re.compile(rf"version\s+{_LLVM_VERSION}\.\d+\.\d+\S*\s+") -_EXTERNALS_LLVM_TAG = "llvm-19.1.7.0" +_EXTERNALS_LLVM_TAG = "llvm-21.1.4.0" _P = typing.ParamSpec("_P") _R = typing.TypeVar("_R") @@ -38,6 +38,13 @@ async def wrapper( _CORES = asyncio.BoundedSemaphore(os.cpu_count() or 1) +def _candidate_names(tool: str) -> list[str]: + candidates = [tool] + if os.name == "nt": + candidates.append(f"{tool}.exe") + return candidates + + async def _run(tool: str, args: typing.Iterable[str], echo: bool = False) -> str | None: command = [tool, *args] async with _CORES: @@ -70,24 +77,26 @@ async def _get_brew_llvm_prefix(*, echo: bool = False) -> str | None: @_async_cache async def _find_tool(tool: str, *, echo: bool = False) -> str | None: # Unversioned executables: - path = tool - if await _check_tool_version(path, echo=echo): - return path + for path in 
_candidate_names(tool): + if await _check_tool_version(path, echo=echo): + return path # Versioned executables: - path = f"{tool}-{_LLVM_VERSION}" - if await _check_tool_version(path, echo=echo): - return path + for path in _candidate_names(f"{tool}-{_LLVM_VERSION}"): + if await _check_tool_version(path, echo=echo): + return path # PCbuild externals: externals = os.environ.get("EXTERNALS_DIR", _targets.EXTERNALS) - path = os.path.join(externals, _EXTERNALS_LLVM_TAG, "bin", tool) - if await _check_tool_version(path, echo=echo): - return path + for name in _candidate_names(tool): + path = os.path.join(externals, _EXTERNALS_LLVM_TAG, "bin", name) + if await _check_tool_version(path, echo=echo): + return path # Homebrew-installed executables: prefix = await _get_brew_llvm_prefix(echo=echo) if prefix is not None: - path = os.path.join(prefix, "bin", tool) - if await _check_tool_version(path, echo=echo): - return path + for name in _candidate_names(tool): + path = os.path.join(prefix, "bin", name) + if await _check_tool_version(path, echo=echo): + return path # Nothing found: return None diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 5977a7a30502ba..d5a94c98a20661 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -302,6 +302,23 @@ def process_relocations( self._trampolines.add(ordinal) hole.addend = ordinal hole.symbol = None + # x86_64 Darwin trampolines for external symbols + elif ( + hole.kind == "X86_64_RELOC_BRANCH" + and hole.value is HoleValue.ZERO + and hole.symbol not in self.symbols + ): + hole.func = "patch_x86_64_trampoline" + hole.need_state = True + assert hole.symbol is not None + if hole.symbol in known_symbols: + ordinal = known_symbols[hole.symbol] + else: + ordinal = len(known_symbols) + known_symbols[hole.symbol] = ordinal + self._trampolines.add(ordinal) + hole.addend = ordinal + hole.symbol = None self.code.remove_jump() self.code.add_nops(nop=nop, alignment=alignment) self.data.pad(8) diff --git 
a/Tools/jit/_targets.py b/Tools/jit/_targets.py index f1085cc9bf081d..b1e8b9892ba0ef 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -150,10 +150,6 @@ async def _compile( "-fno-asynchronous-unwind-tables", # Don't call built-in functions that we can't find or patch: "-fno-builtin", - # Emit relaxable 64-bit calls/jumps, so we don't have to worry about - # about emitting in-range trampolines for out-of-range targets. - # We can probably remove this and emit trampolines in the future: - "-fno-plt", # Don't call stack-smashing canaries that we can't find or patch: "-fno-stack-protector", "-std=c11", @@ -523,7 +519,7 @@ def get_target(host: str) -> _COFF | _ELF | _MachO: condition = "defined(__aarch64__) && defined(__APPLE__)" target = _MachO(host, condition, alignment=8, prefix="_") elif re.fullmatch(r"aarch64-pc-windows-msvc", host): - args = ["-fms-runtime-lib=dll", "-fplt"] + args = ["-fms-runtime-lib=dll"] condition = "defined(_M_ARM64)" target = _COFF(host, condition, alignment=8, args=args) elif re.fullmatch(r"aarch64-.*-linux-gnu", host): @@ -532,6 +528,7 @@ def get_target(host: str) -> _COFF | _ELF | _MachO: # On aarch64 Linux, intrinsics were being emitted and this flag # was required to disable them. "-mno-outline-atomics", + "-fno-plt", ] condition = "defined(__aarch64__) && defined(__linux__)" target = _ELF(host, condition, alignment=8, args=args) @@ -551,7 +548,7 @@ def get_target(host: str) -> _COFF | _ELF | _MachO: condition = "defined(_M_X64)" target = _COFF(host, condition, args=args) elif re.fullmatch(r"x86_64-.*-linux-gnu", host): - args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0"] + args = ["-fno-pic", "-mcmodel=medium", "-mlarge-data-threshold=0", "-fno-plt"] condition = "defined(__x86_64__) && defined(__linux__)" target = _ELF(host, condition, args=args) else: