diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 000000000..533eb3cca
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,50 @@
+name: python-pypi-publish
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - "*.*.*"
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/graphframes-py
+    permissions:
+      id-token: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up JDK
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: actions/setup-java@v3
+        with:
+          java-version: '11'
+          distribution: 'zulu'
+
+      - name: Set up Python
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Set up Poetry
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: snok/install-poetry@v1
+
+      - name: Build GraphFrames python
+        if: startsWith(github.ref, 'refs/tags/')
+        working-directory: python
+        run: |
+          poetry version ${{ github.ref_name }}
+          poetry build
+
+      - name: PyPI publish
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: python/dist
diff --git a/python/dev/build_jar.py b/python/dev/build_jar.py
index 03e3e0171..8c9ee1a39 100644
--- a/python/dev/build_jar.py
+++ b/python/dev/build_jar.py
@@ -1,54 +1,62 @@
 import shutil
 import subprocess
 import sys
+from collections.abc import Sequence
 from pathlib import Path
 
 
-def build(spark_version: str = "3.5.4"):
-    print("Building GraphFrames JAR...")
-    print(f"SPARK_VERSION: {spark_version[:3]}")
-    assert spark_version[:3] in {"3.3", "3.4", "3.5"}, "Unsopported spark version!"
-    project_root = Path(__file__).parent.parent.parent
-    sbt_executable = project_root.joinpath("build").joinpath("sbt").absolute().__str__()
-    sbt_build_command = [sbt_executable, f"-Dspark.version={spark_version}", "assembly"]
-    sbt_build = subprocess.Popen(
-        sbt_build_command,
-        stdout=subprocess.PIPE,
-        universal_newlines=True,
-        cwd=project_root,
-    )
-    while sbt_build.poll() is None:
-        assert sbt_build.stdout is not None  # typing stuff
-        line = sbt_build.stdout.readline()
-        print(line.rstrip(), flush=True)
-
-    if sbt_build.returncode != 0:
-        print("Error during the build of GraphFrames JAR!")
-        print("stdout: ", sbt_build.stdout)
-        print("stderr: ", sbt_build.stderr)
-        sys.exit(1)
-    else:
-        print("Building DONE successfully!")
-
-    python_resources = (
-        project_root.joinpath("python").joinpath("graphframes").joinpath("resources")
-    )
-    target_dir = project_root.joinpath("target").joinpath("scala-2.12")
-    gf_jar = None
-
-    for pp in target_dir.glob("*.jar"):
-        if "graphframes-assembly" in pp.name:
-            gf_jar = pp
-            break
-
-    assert gf_jar is not None, "Missing JAR!"
-    python_resources.mkdir(parents=True, exist_ok=True)
-    shutil.copy(gf_jar, python_resources.joinpath(gf_jar.name))
+def build(spark_versions: Sequence[str] = ("3.5.5",)):
+    project_root = Path(__file__).parent.parent.parent
+    python_resources = (
+        project_root.joinpath("python").joinpath("graphframes").joinpath("resources")
+    )
+    # Clean the resources directory once, before the loop, so that JARs
+    # built for earlier Spark versions survive later iterations.
+    shutil.rmtree(python_resources, ignore_errors=True)
+    python_resources.mkdir(parents=True, exist_ok=True)
+    for spark_version in spark_versions:
+        print("Building GraphFrames JAR...")
+        print(f"SPARK_VERSION: {spark_version[:3]}")
+        assert spark_version[:3] in {"3.5"}, "Unsupported spark version!"
+        sbt_executable = str(project_root.joinpath("build").joinpath("sbt").absolute())
+        sbt_build_command = [
+            sbt_executable,
+            f"-Dspark.version={spark_version}",
+            "clean",
+            "assembly",
+        ]
+        sbt_build = subprocess.Popen(
+            sbt_build_command,
+            stdout=subprocess.PIPE,
+            universal_newlines=True,
+            cwd=project_root,
+        )
+        while sbt_build.poll() is None:
+            assert sbt_build.stdout is not None  # typing stuff
+            line = sbt_build.stdout.readline()
+            print(line.rstrip(), flush=True)
+
+        if sbt_build.returncode != 0:
+            # stderr is not piped, so sbt's own error output is already on the console
+            print("Error during the build of GraphFrames JAR!")
+            sys.exit(1)
+        else:
+            print("Building DONE successfully!")
+
+        target_dir = project_root.joinpath("target").joinpath("scala-2.12")
+        gf_jar = None
+
+        for pp in target_dir.glob("*.jar"):
+            if "graphframes-assembly" in pp.name:
+                gf_jar = pp
+                break
+
+        assert gf_jar is not None, "Missing JAR!"
+        shutil.copy(gf_jar, python_resources.joinpath(f"spark-{spark_version}-{gf_jar.name}"))
 
 
 if __name__ == "__main__":
     if len(sys.argv) > 1:
-        spark_version = sys.argv[1]
-        build(spark_version)
+        build(sys.argv[1:])
     else:
         build()
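For reviewers, a sketch of how the updated script is driven, assuming it is invoked from the `python/` directory (the version list is illustrative; the assert above only accepts 3.5.x):

```python
# Equivalent to `python dev/build_jar.py 3.5.5` on the command line.
from dev.build_jar import build  # assumes python/ is the working directory

# One assembly JAR per requested Spark version lands in
# python/graphframes/resources as spark-<version>-graphframes-assembly-*.jar.
build(["3.5.5"])
```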
diff --git a/python/graphframes/__init__.py b/python/graphframes/__init__.py
index bded262bc..271a994c1 100644
--- a/python/graphframes/__init__.py
+++ b/python/graphframes/__init__.py
@@ -1,3 +1,32 @@
+import pathlib
+from importlib import resources
+
+from pyspark.version import __version__
+
 from .graphframe import GraphFrame
 
-__all__ = ["GraphFrame"]
+
+def get_gf_jar_location() -> str:
+    """
+    Returns the location of the GraphFrames JAR that is bundled
+    with the graphframes-py distribution.
+
+    Usage: add the returned value to `spark.jars`:
+    `SparkSession.builder.master(...).config("spark.jars", get_gf_jar_location()).getOrCreate()`.
+
+    If your version of PySpark is not compatible with this version of
+    GraphFrames, this function raises an exception!
+    """
+    resources_root = resources.files("graphframes").joinpath("resources")
+
+    for pp in resources_root.iterdir():
+        assert isinstance(pp, pathlib.Path)  # type checking
+        if pp.is_file() and pp.name.endswith(".jar") and __version__[:5] in pp.name:
+            return str(pp.absolute())
+
+    raise ValueError(
+        f"Your version of Spark {__version__} is not supported by this version of graphframes!"
+    )
+
+
+__all__ = ["GraphFrame", "get_gf_jar_location"]
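The helper's docstring already names the intended pattern; spelled out as a runnable sketch (the master URL and the toy graph are placeholders):

```python
from pyspark.sql import SparkSession

from graphframes import GraphFrame, get_gf_jar_location

# get_gf_jar_location() raises ValueError when no bundled JAR matches the
# installed PySpark version, so incompatibilities fail fast at startup.
spark = (
    SparkSession.builder.master("local[4]")  # placeholder master
    .config("spark.jars", get_gf_jar_location())
    .getOrCreate()
)

vertices = spark.createDataFrame([("a",), ("b",)], ["id"])
edges = spark.createDataFrame([("a", "b")], ["src", "dst"])
print(GraphFrame(vertices, edges).edges.count())
```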
+ ) + + +__all__ = ["GraphFrame", "get_gf_jar_location"] diff --git a/python/poetry.lock b/python/poetry.lock index 72bc686b7..cf7ebf8d0 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -503,13 +503,13 @@ pyflakes = ">=3.3.0,<3.4.0" [[package]] name = "googleapis-common-protos" -version = "1.69.2" +version = "1.70.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" files = [ - {file = "googleapis_common_protos-1.69.2-py3-none-any.whl", hash = "sha256:0b30452ff9c7a27d80bfc5718954063e8ab53dd3697093d3bc99581f5fd24212"}, - {file = "googleapis_common_protos-1.69.2.tar.gz", hash = "sha256:3e1b904a27a33c821b4b749fd31d334c0c9c30e6113023d495e48979a3dc9c5f"}, + {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, + {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, ] [package.dependencies] @@ -1377,12 +1377,12 @@ test = ["coverage[toml] (>=5.2)", "hypothesis", "pytest (>=6.0)", "pytest-benchm [[package]] name = "pyspark" -version = "3.5.4" +version = "3.5.5" description = "Apache Spark Python API" optional = false python-versions = ">=3.8" files = [ - {file = "pyspark-3.5.4.tar.gz", hash = "sha256:1c2926d63020902163f58222466adf6f8016f6c43c1f319b8e7a71dbaa05fc51"}, + {file = "pyspark-3.5.5.tar.gz", hash = "sha256:6effc9ce98edf231f4d683fd14f7270629bf8458c628d6a2620ded4bb34f3cb9"}, ] [package.dependencies] @@ -1786,13 +1786,13 @@ files = [ [[package]] name = "typing-extensions" -version = "4.13.1" +version = "4.13.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.13.1-py3-none-any.whl", hash = "sha256:4b6cf02909eb5495cfbc3f6e8fd49217e6cc7944e145cdda8caa3734777f9e69"}, - {file = "typing_extensions-4.13.1.tar.gz", hash = "sha256:98795af00fb9640edec5b8e31fc647597b4691f099ad75f469a2616be1a76dff"}, + {file = "typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c"}, + {file = "typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef"}, ] [[package]] @@ -1808,13 +1808,13 @@ files = [ [[package]] name = "urllib3" -version = "2.3.0" +version = "2.4.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 43985c8f9..51fb9ccc7 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -23,7 +23,7 @@ include = [
 ]
 
 [tool.poetry.build]
-script = "dev/build_jar.py"
+script = "dev/build_jar.py"  # Spark versions must be kept aligned inside the script!
 
 
 [tool.poetry.urls]
@@ -35,14 +35,14 @@ script = "dev/build_jar.py"
 
 [tool.poetry.dependencies]
 python = ">=3.10 <3.13"
 nose = "1.3.7"
-pyspark = ">=3.4 <4.0"
+pyspark = ">=3.5 <4.0"
 numpy = ">= 1.7"
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.12.1"
 flake8 = "^7.1.1"
 isort = "^6.0.0"
-pyspark = { version = "3.5.4", extras = ["connect"] }
+pyspark = { version = "3.5.5", extras = ["connect"] }
 grpcio = "<=1.67.1"
 pytest = "^8.3.4"
diff --git a/python/tests/conftest.py b/python/tests/conftest.py
index ba6fa97ad..33d6ed627 100644
--- a/python/tests/conftest.py
+++ b/python/tests/conftest.py
@@ -1,13 +1,12 @@
 import pathlib
 import shutil
 import warnings
-from importlib import resources
 
 import pytest
 from pyspark.sql import SparkSession
 from pyspark.version import __version__
 
-from graphframes import GraphFrame
+from graphframes import GraphFrame, get_gf_jar_location
 from graphframes.classic.graphframe import _java_api
 
 if __version__[:3] >= "3.4":
@@ -43,15 +42,7 @@ def spark():
     spark_builder = SparkSession.builder.master("local[4]").config(
         "spark.sql.shuffle.partitions", 4
     )
-    resources_root = resources.files("graphframes").joinpath("resources")
-    spark_jars = []
-    for pp in resources_root.iterdir():
-        assert isinstance(pp, pathlib.PosixPath)  # type checking
-        if pp.is_file() and pp.name.endswith(".jar"):
-            spark_jars.append(pp.absolute().__str__())
-    if spark_jars:
-        jars_str = ",".join(spark_jars)
-        spark = spark_builder.config("spark.jars", jars_str)
+    spark = spark_builder.config("spark.jars", get_gf_jar_location())
     spark = spark_builder.getOrCreate()
     spark.sparkContext.setCheckpointDir(checkpointDir)
     yield spark
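With JAR discovery delegated to `get_gf_jar_location()`, tests keep using the `spark` fixture unchanged; a minimal sketch of such a test (the file name and graph data are hypothetical):

```python
# tests/test_smoke.py -- hypothetical; exercises the `spark` fixture from conftest.py
from graphframes import GraphFrame


def test_graphframe_smoke(spark):
    v = spark.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])
    e = spark.createDataFrame([(1, 2, "follows")], ["src", "dst", "relationship"])
    g = GraphFrame(v, e)
    assert g.vertices.count() == 2
    assert g.edges.count() == 1
```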