/*
RDD.scala
*/
// =======================================================================
// Methods that should be implemented by subclasses of RDD
// =======================================================================
/**
* :: DeveloperApi ::
* Implemented by subclasses to compute a given partition.
*/
@DeveloperApi
def compute(split: Partition, context: TaskContext): Iterator[T]
/**
* Implemented by subclasses to return the set of partitions in this RDD. This method will only
* be called once, so it is safe to implement a time-consuming computation in it.
*
* The partitions in this array must satisfy the following property:
* `rdd.partitions.zipWithIndex.forall { case (partition, index) => partition.index == index }`
*/
protected def getPartitions: Array[Partition]
/**
* Implemented by subclasses to return how this RDD depends on parent RDDs. This method will only
* be called once, so it is safe to implement a time-consuming computation in it.
*/
protected def getDependencies: Seq[Dependency[_]] = deps
/**
* Optionally overridden by subclasses to specify placement preferences.
*/
protected def getPreferredLocations(split: Partition): Seq[String] = Nil
/** Optionally overridden by subclasses to specify how they are partitioned. */
@transient val partitioner: Option[Partitioner] = None
first 3 are compulsory
compute 같은거는 경우에 따라 parent로부터 받거나 아니면 새로 만들거나 할 듯.
last two are optional
그 외 , 필요에 따라 clearDependencies(), 등 과 같은 함수를 override하기도 함.
write examples (e.g. ShuffledRDD.scala)
Partition definition
trait partition 소개
trait Partition extends Serializable {
/**
* Get the partition's index within its parent RDD
*/
def index: Int
// A better default implementation of HashCode
override def hashCode(): Int = index
override def equals(other: Any): Boolean = super.equals(other)
}
normally, each RDD has its own Partition class (ex. ShuffledRDD + ShuffledRDDPartition )