Last Updated: April 30, 2017
·
8.432K
· ahmad_ragab

SBT Assembly, Spark, and You!

Tested with Scala 2.11.11, sbt 0.13.15 and Spark 2.1.0

The build.sbt below accomplishes the following:
* Lets you execute sbt run from the sbt shell or a terminal
* Uses the sbt-assembly plugin to properly package a jar for use with spark-submit against a Spark instance running inside a Docker container

// Single source of truth for the Spark release; reused by every Spark module below.
val sparkVersion = "2.1.0"

// Scala version the project is compiled with (also selects the suffix for %% dependencies).
scalaVersion := "2.11.11"

// Project coordinates.
name := "spark-mllib-test"
version := "1.0"

// Spark modules are scoped "provided": the build compiles against them, but they are
// expected to be supplied by the Spark runtime rather than bundled by this project.
libraryDependencies ++= Seq("spark-core", "spark-streaming", "spark-mllib").map { module =>
  "org.apache.spark" %% module % sparkVersion % "provided"
}

// Not Spark-related: command-line option parsing for the job.
// NOTE(review): args4j-helpers_2.10 is pinned to a Scala 2.10 artifact while the build
// uses scalaVersion 2.11.11 -- confirm it is binary compatible, or switch to a 2.11
// build if one is published.
libraryDependencies ++= Seq(
  "args4j" % "args4j" % "2.33",
  "com.bizo" % "args4j-helpers_2.10" % "1.0.0"
)

/* Explicit merge rules for paths that appear in more than one dependency jar.
   Without them sbt-assembly reports a lot of deduplication conflicts. Paths that
   share the same top-level package and the same strategy are grouped into one
   pattern; anything not listed here falls through to the previously configured
   strategy. */
assemblyMergeStrategy in assembly := {
  case PathList("org", "aopalliance" | "apache", _*) => MergeStrategy.last
  case PathList("javax", "inject" | "servlet" | "activation", _*) => MergeStrategy.last
  case PathList("com", "google" | "esotericsoftware" | "codahale" | "yammer", _*) =>
    MergeStrategy.last
  case "about.html" => MergeStrategy.rename
  case "META-INF/ECLIPSEF.RSA" | "META-INF/mailcap" | "META-INF/mimetypes.default" =>
    MergeStrategy.last
  // overview.html was added for Spark 2.1.0 (per the original author's note)
  case "plugin.properties" | "log4j.properties" | "overview.html" => MergeStrategy.last
  case path =>
    // Delegate everything else to the strategy that was in effect before this override.
    val fallback = (assemblyMergeStrategy in assembly).value
    fallback(path)
}

/* Leave the Scala library out of the assembly: bundling it bloats the jar
   unnecessarily and may interfere with the Spark runtime. */
assemblyOption in assembly ~= { options =>
  options.copy(includeScala = false)
}

// Fixed file name for the assembled jar.
assemblyJarName in assembly := "spark-mllib-test.jar"

/* Dependencies scoped "provided" are left off the runtime classpath, so running the
   Spark program locally (i.e. `sbt run`) would not find Spark. Only the run tasks
   are rewired here to use the Compile classpath, which still contains the "provided"
   dependencies.

   fullClasspath in Runtime is deliberately NOT overridden: sbt-assembly takes its
   default classpath from it, so replacing it with the Compile classpath would pull
   the "provided" Spark jars back into the assembled jar. */
run in Compile := Defaults.runTask(
  fullClasspath in Compile,
  mainClass in (Compile, run),
  runner in (Compile, run)
).evaluated

// Same treatment for `sbt "runMain <class>"`, so both entry points behave alike.
runMain in Compile := Defaults.runMainTask(
  fullClasspath in Compile,
  runner in (Compile, run)
).evaluated