SBT Assembly, Spark, and You!
Tested with Scala 2.11.11, sbt 0.13.15 and Spark 2.1.0
build.sbt
accomplished the following things:
* execute sbt run
from sbt shell or terminal
* Use sbt-assembly plugin to properly package jar for usage with spark-submit
for a spark instance running inside docker container
name := "spark-mllib-test"
version := "1.0"
scalaVersion := "2.11.11"
val sparkVersion = "2.1.0"
libraryDependencies ++= Seq (
"org.apache.spark" %% "spark-core" % sparkVersion % "provided", // spark runtime already provides jars
"org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
// not relevant, just allows me to pass command line options to spark job
"args4j" % "args4j" % "2.33",
"com.bizo" % "args4j-helpers_2.10" % "1.0.0"
)
/* without this explicit merge strategy code you get a lot of noise from sbt-assembly
complaining about not being able to dedup files */
assemblyMergeStrategy in assembly := {
case PathList("org","aopalliance", xs @ _*) => MergeStrategy.last
case PathList("javax", "inject", xs @ _*) => MergeStrategy.last
case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
case PathList("org", "apache", xs @ _*) => MergeStrategy.last
case PathList("com", "google", xs @ _*) => MergeStrategy.last
case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
case "about.html" => MergeStrategy.rename
case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
case "META-INF/mailcap" => MergeStrategy.last
case "META-INF/mimetypes.default" => MergeStrategy.last
case "plugin.properties" => MergeStrategy.last
case "log4j.properties" => MergeStrategy.last
case "overview.html" => MergeStrategy.last // Added this for 2.1.0 I think
case x =>
val oldStrategy = (assemblyMergeStrategy in assembly).value
oldStrategy(x)
}
/* including scala bloats your assembly jar unnecessarily, and may interfere with
spark runtime */
assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
assemblyJarName in assembly := "spark-mllib-test.jar"
/* you need to be able to undo the "provided" annotation on the deps when running your spark
programs locally i.e. from sbt; this bit reincludes the full classpaths in the compile and run tasks. */
fullClasspath in Runtime := (fullClasspath in (Compile, run)).value
Written by Ahmad Saad Ragab
Related protips
Have a fresh tip? Share with Coderwall community!
Post
Post a tip
Best
#Sbt
Authors
ahmad_ragab
8.552K
agile_jordi
1.542K
Sponsored by #native_company# — Learn More
#native_title#
#native_desc#