feat: random walks and embeddings #752
Merged
SemyonSinchenko merged 87 commits into graphframes:main from SemyonSinchenko:726-sampling-api on Mar 10, 2026.

Changes from all commits (87 commits):
All commits authored by SemyonSinchenko:

3dcd569 edges sampling API (scala)
06f6af6 add seed to z-estimation
e4ede62 wip
08ee0eb Merge remote-tracking branch 'graphframes/main' into 726-sampling-api
d050a7a WIP
4a0a14f scalfix
579a5c3 docstrings to RandomWalkBase and RandomWalkWithRestart
16dd46c Fix RandomWalk implementation bugs and add example
965dea8 Add Word2VecHashingTrick implementation for graph embeddings
3260cc9 Implement reservoir sampling for neighbor selection in random walks
aafcbc6 fix scalastyle?
f27462d fix reservoir
28b8ca5 add hash2vec
b8255ac fixes
2113376 docstrings + scalfix
6244ec2 remove sampling as not needed
a9ce665 fixes in build and code
0d5da92 Big update
84306b6 Tests and updates
5084d88 workaround scala 2.13 deprecation of Searching.search
0a9dc90 Fixes
f7a9aeb Fix access
acc5a2c fallback to java Serialization
8b20f00 Fix some bugs
538a72d Sampling Convolution tests and docstrings
3548bb1 docstrings for RW Emebddings
3df9282 Python API
af97cac hash2vec tests
5d724b5 Explicit types
526330c hash2vec and random walks with restart tests
061fab3 Merge remote-tracking branch 'graphframes/main' into 726-sampling-api
7e77bc9 performance
02c22f8 ignore unused nowarn
97d529a performance + cached walks support + continous mode
cc7f908 Merge remote-tracking branch 'graphframes/main' into 726-sampling-api
48e981a fix rw and update the branch
de63a45 fix
e1cf2a4 protobuf & connect
d2780a4 public API for embeddings and small refactoring of methods
a294c03 initial Py API for embeddings
60ca7e5 fix
6583c20 tests + docs
54ee3d7 fix connect tests
867ae47 Merge remote-tracking branch 'graphframes/main' into 726-sampling-api
c7ef2d9 decrease GC pressure
ff107a2 further optimizations
7f01b9c refactor: optimize Hash2Vec string hashing and partitioning logic
b7d95f8 refactor: replace generic hash function with type-specific implementa…
5def505 refactor: simplify hash function logic and improve performance in Has…
5f27b92 refactor: inline generic processPartitionGeneric into specialized Str…
ed07856 test: add tests for PagedMatrixDouble helper covering page extension,…
5cc9f1a refactor: replace unsafe hash functions with MurmurHash3 and optimize…
895a319 refactor: update processStringPartition to use PagedMatrixDouble for …
66809bd docs: add internal documentation for PagedMatrixDouble explaining mem…
548a17a chore: remove commented helper section from Hash2Vec
70d96cb refactor: replace case class with class for PagedMatrixDouble and rem…
842cb7e fix: correct LongMap type parameter in Hash2Vec vocabIndex initializa…
c2c6ff4 refactor: reduce PAGE_BITS from 16 to 12 and update related constants…
54c32b3 feat: add max vectors per partition limit and batched processing for …
e689b86 refactor: process long partitions in batches respecting max vectors l…
6d92b2c test: add Hash2Vec tests for co-occurrence patterns and cosine simila…
ae60afd chore: clean up code formatting and improve test readability in Hash2Vec
c0f3ca3 fix: correct typo in error message from 'gor' to 'got' in Hash2Vec ex…
b0dc568 docs: add docstrings for Hash2Vec setters setDoNormalization and setM…
95d73c3 fix: correct typo in NOTICE and KMinSampling, update Hash2Vec default…
de2d6a3 fix: skip seeds for previous batches to maintain consistency when sta…
3a0a302 fix: add overwrite mode when writing batch results to allow re-runnin…
af7f8f4 feat: add cleanUp method to remove temporary files for a walk ID usin…
579a2b5 docs: improve documentation for cleanUp method in RandomWalkBase trait
5e2a462 test: add cleanUp call in RandomWalkWithRestart test
d3cb1a2 test: move walks execution inside try block in RandomWalkWithRestartS…
f5bb5ca refactor: set walkID default to UUID and remove redundant runID variable
4b25649 docs: update comment to clarify walkID retrieval method behavior
0c9262d refactor: rename walkID to runID for clarity in random walk operations
c3b8d86 refactor: remove runId parameter from cleanUp method in RandomWalkBase
3309269 refactor: move cleanUp method to companion object with parameters and…
a9ff184 refactor: improve documentation formatting and remove redundant log s…
9eb4dea test: verify temporary files are deleted after RandomWalkWithRestart …
5b349c8 refactor: use numBatches variable and fix run path in RandomWalkWithR…
5c12113 test: add test for RandomWalkWithRestart resuming from middle iteration
483c3be style: Format RandomWalkWithRestartSuite with consistent spacing and …
5efc5dd refactor: use getSeq instead of getAs[Seq[String]] in RandomWalkWithR…
001a2bf fix: correct typo in error message and parameter name for gaussian sigma
64aee65 feat: add clean-up option for temporary random walk files in embeddin…
e45381d docs: improve formatting and consistency in graph-ml documentation ta…
d4f9f50 feat: add clean_up_after_run field to RandomWalkEmbeddings proto defi…
79be29d chore: add clean_up_after_run parameter to _RandomWalksEmbeddingsPara…
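Per the commit messages above, the PR's core primitive is a random walk with restart over the graph's adjacency structure. As a rough, language-agnostic illustration of the idea (a toy Python model on an in-memory adjacency list, not the GraphFrames implementation, which runs distributed over DataFrames):

```python
import random

def random_walk_with_restart(adj, start, walk_length, restart_prob, rng):
    """One walk: at each step, jump back to `start` with probability
    `restart_prob` (or when stuck at a sink), otherwise hop to a
    uniformly sampled out-neighbor."""
    walk = [start]
    current = start
    for _ in range(walk_length - 1):
        if rng.random() < restart_prob or not adj[current]:
            current = start  # restart (also used as the sink fallback)
        else:
            current = rng.choice(adj[current])
        walk.append(current)
    return walk

# Toy directed graph as an adjacency list.
adj = {0: [1, 2], 1: [0, 2], 2: [0]}
rng = random.Random(42)
walk = random_walk_with_restart(adj, start=0, walk_length=5,
                                restart_prob=0.2, rng=rng)
print(walk)  # a length-5 walk beginning at vertex 0
```

Collections of such walks are then fed to an embedding stage (the PR's Hash2Vec, a hashing-trick variant of word2vec-style training) as if each walk were a sentence of vertex "tokens".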
@@ -79,3 +79,8 @@ spark-*
+
+# Zed
+.zed
+
+# Emacs
+.dir-locals.el
+*~
+.aider*
core/src/main/scala/org/apache/spark/sql/graphframes/expressions/KMinSampling.scala
165 changes: 165 additions & 0 deletions
package org.apache.spark.sql.graphframes.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udaf
import org.apache.spark.sql.types.*
import org.apache.spark.sql.types.DataType
import org.graphframes.GraphFramesUnsupportedVertexTypeException

import scala.annotation.nowarn
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

case class KMinAccum[T](values: Array[T], weights: Array[Long], var cnt: Int) extends Serializable

case class KMinSampling[T: ClassTag](size: Int)(implicit
    @nowarn tag: TypeTag[T],
    ord: Ordering[T])
    extends Aggregator[Row, KMinAccum[T], Seq[T]]
    with Serializable {

  override def zero: KMinAccum[T] = KMinAccum(Array.ofDim[T](size), Array.ofDim[Long](size), 0)

  override def reduce(b: KMinAccum[T], a: Row): KMinAccum[T] = {
    val newWeight = a.getLong(1)
    val newValue = a.getAs[T](0)
    // fast-path: buffer is already full of "strong" elements
    // the case of "influencer" vertex
    if (b.cnt == size) {
      val lastWeight = b.weights.last
      if ((lastWeight < newWeight) || ((lastWeight == newWeight) && (ord.compare(
          newValue,
          b.values.last) >= 0))) {
        return b
      }
    }

    // slow-path: custom binary search for (Weight, Value)
    // We want to find the first index where (b.w, b.v) > (newWeight, newValue)
    var low = 0
    var high = b.cnt - 1
    var idx = b.cnt // Default insertion point is at the end

    while (low <= high) {
      val mid = (low + high) / 2
      val midWeight = b.weights(mid)

      // Compare (midWeight, midValue) vs (newWeight, newValue)
      val res =
        if (midWeight < newWeight) -1
        else if (midWeight > newWeight) 1
        else ord.compare(b.values(mid), newValue)

      if (res <= 0) {
        // mid is smaller or equal: we must insert after mid
        low = mid + 1
      } else {
        // mid is larger: potential insertion point here
        idx = mid
        high = mid - 1
      }
    }

    if (idx < size) {
      val newCount = math.min(b.cnt + 1, size)
      if (idx < newCount - 1) {
        // shift to the right if needed
        System.arraycopy(b.weights, idx, b.weights, idx + 1, newCount - idx - 1)
        System.arraycopy(b.values, idx, b.values, idx + 1, newCount - idx - 1)
      }

      b.weights(idx) = newWeight
      b.values(idx) = newValue
      b.cnt = newCount
    }

    b
  }

  override def merge(b1: KMinAccum[T], b2: KMinAccum[T]): KMinAccum[T] = {
    if (b1.cnt == 0) {
      return b2
    }

    if (b2.cnt == 0) {
      return b1
    }

    val resultSize = math.min(b1.cnt + b2.cnt, size)
    val newValues = Array.ofDim[T](resultSize)
    val newWeights = Array.ofDim[Long](resultSize)

    var i = 0
    var j = 0
    var r = 0

    while (r < resultSize) {
      val useLeft = if (i >= b1.cnt) {
        false
      } else if (j >= b2.cnt) {
        true
      } else {
        val wLeft = b1.weights(i)
        val wRight = b2.weights(j)

        if (wLeft < wRight) {
          true
        } else if (wLeft > wRight) {
          false
        } else {
          ord.compare(b1.values(i), b2.values(j)) <= 0
        }
      }

      if (useLeft) {
        newWeights(r) = b1.weights(i)
        newValues(r) = b1.values(i)
        i += 1
      } else {
        newWeights(r) = b2.weights(j)
        newValues(r) = b2.values(j)
        j += 1
      }

      r += 1
    }

    KMinAccum(newValues, newWeights, resultSize)
  }

  override def finish(reduction: KMinAccum[T]): Seq[T] =
    reduction.values.slice(0, reduction.cnt).toSeq

  // TODO: replace by Kryo after 4.0.2 is released, see SPARK-52819
  override def bufferEncoder: Encoder[KMinAccum[T]] = Encoders.product
  override def outputEncoder: Encoder[Seq[T]] = ExpressionEncoder[Seq[T]]()
}

object KMinSampling extends Serializable {
  def getEncoder(spark: SparkSession, dataType: DataType, colNames: Seq[String]): Encoder[Row] = {
    // That is very stupid way actually. But it is the only way with public API
    spark
      .createDataFrame(
        java.util.List.of[Row](),
        StructType(
          StructField(colNames(0), dataType) :: StructField(colNames(1), LongType) :: Nil))
      .encoder
  }

  def fromSparkType(dataType: DataType, size: Int, encoder: Encoder[Row]): UserDefinedFunction = {
    dataType match {
      case StringType => udaf(KMinSampling[java.lang.String](size), encoder)
      case ShortType => udaf(KMinSampling[java.lang.Short](size), encoder)
      case ByteType => udaf(KMinSampling[java.lang.Byte](size), encoder)
      case IntegerType => udaf(KMinSampling[java.lang.Integer](size), encoder)
      case LongType => udaf(KMinSampling[java.lang.Long](size), encoder)
      case _ => throw new GraphFramesUnsupportedVertexTypeException("unsupported vertex type")
    }
  }
}
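The aggregator above keeps, per group, only the `size` lexicographically smallest (weight, value) pairs in a sorted bounded buffer: `reduce` inserts a new pair via binary search (with a fast path that rejects heavy elements when the buffer is full), and `merge` combines two sorted buffers while keeping only the k smallest pairs overall. A minimal Python model of that same logic (an illustration only, not the GraphFrames code path, which runs as a Spark UDAF over flat arrays):

```python
import bisect

def kmin_reduce(buf, weight, value, size):
    """Insert (weight, value) into a sorted, bounded buffer (smallest first)."""
    if len(buf) == size and (weight, value) >= buf[-1]:
        return buf  # fast path: new pair is too "heavy" for a full buffer
    bisect.insort(buf, (weight, value))  # binary-search insertion
    del buf[size:]  # keep only the k smallest pairs
    return buf

def kmin_merge(b1, b2, size):
    """Merge two sorted buffers, keeping only the k smallest pairs overall."""
    # The Scala version does a linear two-pointer merge; sorting the
    # concatenation gives the same result for small buffers.
    return sorted(b1 + b2)[:size]

buf = []
for w, v in [(5, "a"), (1, "b"), (3, "c"), (2, "d")]:
    kmin_reduce(buf, w, v, size=3)
print(buf)                                   # [(1, 'b'), (2, 'd'), (3, 'c')]
print(kmin_merge(buf, [(0, "e")], size=3))   # [(0, 'e'), (1, 'b'), (2, 'd')]
```

With edge weights drawn per-row as random longs, keeping the k minimal weights per source vertex yields a uniform k-sample of its neighbors, which is presumably why the UDAF takes a `(value, weight: Long)` row and why ties break on the value ordering.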