Pipeline Specification (PPS)
Learn about the different attributes of a pipeline spec.
This document discusses each of the fields present in a pipeline specification.
Before You Start #
- Pachyderm's pipeline specifications can be written in JSON or YAML.
- Pachyderm uses its JSON parser if the first character is `{`.
- A pipeline specification file can contain multiple pipeline declarations at once.
Minimal Spec #
Generally speaking, the only attributes that are strictly required for all scenarios are pipeline.name and transform. Beyond those, other attributes are conditionally required based on your pipeline's use case. The following are a few examples of common use cases along with their minimally required attributes.
Use Case:
{
"pipeline": {
"name": "wordcount",
"project": {
"name": "projectName"
}
},
"transform": {
"image": "wordcount-image",
"cmd": ["/binary", "/pfs/data", "/pfs/out"]
},
"input": {
"cron": {
"name": string,
"spec": string,
"repo": string,
"start": time,
"overwrite": bool
}
}
}
{
"pipeline": {
"name": "wordcount",
"project": {
"name": "projectName"
}
},
"transform": {
"image": "wordcount-image",
"cmd": ["/binary", "/pfs/data", "/pfs/out"]
},
"input": {
"pfs": {
"repo": "data",
"glob": "/*"
}
},
"egress": {
"sqlDatabase": {
"url": string,
"fileFormat": {
"type": string,
"columns": [string]
},
"secret": {
"name": string,
"key": "PACHYDERM_SQL_PASSWORD"
}
}
}
}
{
"pipeline": {
"name": "wordcount",
"project": {
"name": "projectName"
}
},
"transform": {
"image": "wordcount-image",
"cmd": ["/binary", "/pfs/data", "/pfs/out"]
},
"input": {
"pfs": {
"repo": "data",
"glob": "/*"
}
},
"egress": {
"URL": "s3://bucket/dir"
}
}
{
"pipeline": {
"name": "wordcount",
"project": {
"name": "projectName"
}
},
"transform": {
"image": "wordcount-image",
"cmd": ["/binary", "/pfs/data", "/pfs/out"]
},
"input": {
"pfs": {
"repo": "data",
"glob": "/*"
}
}
}
{
"pipeline": {
"name": "wordcount",
"project": {
"name": "projectName"
}
},
"transform": {
"image": "wordcount-image",
"cmd": ["/binary", "/pfs/data", "/pfs/out"]
},
"input": {
"pfs": {
"repo": "data",
"glob": "/*"
}
},
"service": {
"internalPort": int,
"externalPort": int
}
}
{
"pipeline": {
"name": "wordcount",
"project": {
"name": "projectName"
}
},
"transform": {
"image": "wordcount-image",
"cmd": ["/binary", "/pfs/data", "/pfs/out"]
},
"spout": {
}
}
{
"pipeline": {
"name": "wordcount",
"project": {
"name": "projectName"
}
},
"transform": {
"image": "wordcount-image",
"cmd": ["/binary", "/pfs/data", "/pfs/out"]
},
"input": {
"pfs": {
"repo": "data",
"glob": "/*"
}
},
"s3Out": true,
}{
"pipeline": {
"name": string,
"project": {
"name": "projectName"
},
},
"description": string,
"metadata": {
"annotations": {
"annotation": string
},
"labels": {
"label": string
}
},
"tfJob": {
"tfJob": string,
},
"transform": {
"image": string,
"cmd": [ string ],
"errCmd": [ string ],
"env": {
string: string
},
"secrets": [ {
"name": string,
"mountPath": string
},
{
"name": string,
"envVar": string,
"key": string
} ],
"imagePullSecrets": [ string ],
"stdin": [ string ],
"errStdin": [ string ],
"acceptReturnCode": [ int ],
"debug": bool,
"user": string,
"workingDir": string,
"dockerfile": string,
"memoryVolume": bool,
},
"parallelismSpec": {
"constant": int
},
"egress": {
// Egress to an object store
"URL": "s3://bucket/dir"
// Egress to a database
"sqlDatabase": {
"url": string,
"fileFormat": {
"type": string,
"columns": [string]
},
"secret": {
"name": string,
"key": "PACHYDERM_SQL_PASSWORD"
}
}
},
"update": bool,
"outputBranch": string,
[
{
"workerId": string,
"jobId": string,
"datumStatus" : {
"started": timestamp,
"data": []
}
}
],
"s3Out": bool,
"resourceRequests": {
"cpu": number,
"memory": string,
"gpu": {
"type": string,
"number": int
},
"disk": string,
},
"resourceLimits": {
"cpu": number,
"memory": string,
"gpu": {
"type": string,
"number": int
},
"disk": string,
},
"sidecarResourceLimits": {
"cpu": number,
"memory": string,
"gpu": {
"type": string,
"number": int
},
"disk": string,
},
"input": {
<"pfs", "cross", "union", "join", "group" or "cron" see below>
},
"description": string,
"reprocess": bool,
"service": {
"internalPort": int,
"externalPort": int
},
"spout": {
// Optionally, you can combine a spout with a service:
"service": {
"internalPort": int,
"externalPort": int
}
},
"datumSetSpec": {
"number": int,
"sizeBytes": int,
"perWorker": int,
},
"datumTimeout": string,
"jobTimeout": string,
"salt": string,
"datumTries": int,
"schedulingSpec": {
"nodeSelector": {string: string},
"priorityClassName": string
},
"podSpec": string,
"podPatch": string,
"specCommit": {
"option": false,
"branch": {
"option": false,
"repo": {
"option": false,
"name": string,
"type": string,
"project":{
"option": false,
"name": string,
},
},
"name": string
},
"id": string,
},
"metadata": {
},
"reprocessSpec": string,
"autoscaling": bool
}
------------------------------------
"pfs" input
------------------------------------
"pfs": {
"name": string,
"repo": string,
"repoType":string,
"branch": string,
"commit":string,
"glob": string,
"joinOn":string,
"outerJoin": bool,
"groupBy": string,
"lazy" bool,
"emptyFiles": bool,
"s3": bool,
"trigger": {
"branch": string,
"all": bool,
"cronSpec": string,
},
}
------------------------------------
"cross" or "union" input
------------------------------------
"cross" or "union": [
{
"pfs": {
"name": string,
"repo": string,
"branch": string,
"glob": string,
"lazy" bool,
"emptyFiles": bool,
"s3": bool
}
},
{
"pfs": {
"name": string,
"repo": string,
"branch": string,
"glob": string,
"lazy" bool,
"emptyFiles": bool,
"s3": bool
}
}
...
]
------------------------------------
"join" input
------------------------------------
"join": [
{
"pfs": {
"name": string,
"repo": string,
"branch": string,
"glob": string,
"joinOn": string,
"outerJoin": bool,
"lazy": bool,
"emptyFiles": bool,
"s3": bool
}
},
{
"pfs": {
"name": string,
"repo": string,
"branch": string,
"glob": string,
"joinOn": string,
"outerJoin": bool,
"lazy": bool,
"emptyFiles": bool,
"s3": bool
}
}
]
------------------------------------
"group" input
------------------------------------
"group": [
{
"pfs": {
"name": string,
"repo": string,
"branch": string,
"glob": string,
"groupBy": string,
"lazy": bool,
"emptyFiles": bool,
"s3": bool
}
},
{
"pfs": {
"name": string,
"repo": string,
"branch": string,
"glob": string,
"groupBy": string,
"lazy": bool,
"emptyFiles": bool,
"s3": bool
}
}
]
------------------------------------
"cron" input
------------------------------------
"cron": {
"name": string,
"spec": string,
"repo": string,
"start": time,
"overwrite": bool
}
ℹ️ For a single-page view of all PPS options, go to the PPS series page.