from collections import defaultdict
import functools

import pandas as pd

import lsst.geom
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.afw.table as afwTable
from lsst.meas.base import SingleFrameMeasurementTask
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer
from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column

def flattenFilters(df, filterDict, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flattens a dataframe with multilevel column index.
    """
    newDf = pd.DataFrame()
    for filt, filtShort in filterDict.items():
        subdf = df[filt]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(filtShort, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
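
# A minimal usage sketch for ``flattenFilters`` (illustrative only; the filter and column
# names below are hypothetical): a two-level (filter, column) table is flattened into
# per-band columns such as ``gPsfFlux``/``rPsfFlux``, with the ``noDupCols`` kept once.
def _exampleFlattenFilters():
    columns = pd.MultiIndex.from_product(
        [['HSC-G', 'HSC-R'], ['coord_ra', 'coord_dec', 'PsfFlux']],
        names=('filter', 'column'))
    df = pd.DataFrame([[0.1, -0.2, 1.0, 0.1, -0.2, 2.0]], columns=columns)
    flat = flattenFilters(df, {'HSC-G': 'g', 'HSC-R': 'r'}, camelCase=True)
    # flat.columns: ['coord_ra', 'coord_dec', 'gPsfFlux', 'rPsfFlux']
    return flat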

class WriteObjectTableConfig(pexConfig.Config):
    priorityList = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Priority-ordered list of bands for the merge."
    )
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if len(self.priorityList) == 0:
            raise RuntimeError("No priority list provided")
79 """Write filter-merged source tables to parquet 81 _DefaultName =
"writeObjectTable" 82 ConfigClass = WriteObjectTableConfig
83 RunnerClass = MergeSourcesRunner
86 inputDatasets = (
'forced_src',
'meas',
'ref')
91 def __init__(self, butler=None, schema=None, **kwargs):
95 CmdLineTask.__init__(self, **kwargs)

    def runDataRef(self, patchRefList):
        """!
        @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
        subclasses that inherit from MergeSourcesTask.
        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)

    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References the first of self.inputDatasets, rather than
        self.inputDataset.
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])

    def readCatalog(self, patchRef):
        """Read input catalogs

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch

        Returns
        -------
        Tuple consisting of filter name and a dict of catalogs, keyed by
        dataset name.
        """
        filterName = patchRef.dataId["filter"]
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for filter %s: %s" %
                          (len(catalog), dataset, filterName, patchRef.dataId))
            catalogDict[dataset] = catalog
        return filterName, catalogDict

    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column
        patch : `str`
            patchId to use for the patchId column

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afwTable to pandas DataFrame, indexed by source id
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)

    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # The filter is not part of the saved dataset's data ID, so drop it from
        # the data ID reported in the log message.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))

    def writeMetadata(self, dataRefList):
        """No metadata to write, and not sure how to write it for a list of dataRefs.
        """
        pass

class WriteSourceTableConfig(pexConfig.Config):
    doApplyExternalPhotoCalib = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local photoCalib columns from the calexp.photoCalib? Should only set True if "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )
    doApplyExternalSkyWcs = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local WCS columns from the calexp.wcs? Should only set True if "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )
223 """Write source table to parquet 225 _DefaultName =
"writeSourceTable" 226 ConfigClass = WriteSourceTableConfig
229 src = dataRef.get(
'src')
230 if self.config.doApplyExternalPhotoCalib
or self.config.doApplyExternalSkyWcs:
233 ccdVisitId = dataRef.get(
'ccdExposureId')
234 result = self.
run(src, ccdVisitId=ccdVisitId)
235 dataRef.put(result.table,
'source')

    def run(self, catalog, ccdVisitId=None):
        """Convert `src` catalog to parquet

        Parameters
        ----------
        catalog : `afwTable.SourceCatalog`
            catalog to be converted
        ccdVisitId : `int`
            ccdVisitId to be added as a column

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            ``table``
                `ParquetTable` version of the input catalog
        """
        self.log.info("Generating parquet table from src catalog")
        df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
        df['ccdVisitId'] = ccdVisitId
        return pipeBase.Struct(table=ParquetTable(dataFrame=df))
260 """Add columns with local calibration evaluated at each centroid 262 for backwards compatibility with old repos. 263 This exists for the purpose of converting old src catalogs 264 (which don't have the expected local calib columns) to Source Tables. 268 catalog: `afwTable.SourceCatalog` 269 catalog to which calib columns will be added 270 dataRef: `lsst.daf.persistence.ButlerDataRef 271 for fetching the calibs from disk. 275 newCat: `afwTable.SourceCatalog` 276 Source Catalog with requested local calib columns 278 measureConfig = SingleFrameMeasurementTask.ConfigClass()
279 measureConfig.doReplaceWithNoise =
False 282 exposure = dataRef.get(
'calexp_sub',
285 aliasMap = catalog.schema.getAliasMap()
286 mapper = afwTable.SchemaMapper(catalog.schema)
287 mapper.addMinimalSchema(catalog.schema,
True)
288 schema = mapper.getOutputSchema()
290 exposureIdInfo = dataRef.get(
"expIdInfo")
291 measureConfig.plugins.names = []
292 if self.config.doApplyExternalSkyWcs:
293 plugin =
'base_LocalWcs' 295 raise RuntimeError(f
"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
297 measureConfig.plugins.names.add(plugin)
299 if self.config.doApplyExternalPhotoCalib:
300 plugin =
'base_LocalPhotoCalib' 302 raise RuntimeError(f
"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
304 measureConfig.plugins.names.add(plugin)
306 measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
307 schema.setAliasMap(aliasMap)
308 newCat = afwTable.SourceCatalog(schema)
309 newCat.extend(catalog, mapper=mapper)
310 measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
314 """No metadata to write. 319 def _makeArgumentParser(cls):
321 parser.add_id_argument(
"--id",
'src',
322 help=
"data ID, e.g. --id visit=12345 ccd=0")
327 """Calculate columns from ParquetTable 329 This object manages and organizes an arbitrary set of computations 330 on a catalog. The catalog is defined by a 331 `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a 332 `deepCoadd_obj` dataset, and the computations are defined by a collection 333 of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently, 334 a `CompositeFunctor`). 336 After the object is initialized, accessing the `.df` attribute (which 337 holds the `pandas.DataFrame` containing the results of the calculations) triggers 338 computation of said dataframe. 340 One of the conveniences of using this object is the ability to define a desired common 341 filter for all functors. This enables the same functor collection to be passed to 342 several different `PostprocessAnalysis` objects without having to change the original 343 functor collection, since the `filt` keyword argument of this object triggers an 344 overwrite of the `filt` property for all functors in the collection. 346 This object also allows a list of refFlags to be passed, and defines a set of default 347 refFlags that are always included even if not requested. 349 If a list of `ParquetTable` object is passed, rather than a single one, then the 350 calculations will be mapped over all the input catalogs. In principle, it should 351 be straightforward to parallelize this activity, but initial tests have failed 352 (see TODO in code comments). 356 parq : `lsst.pipe.tasks.ParquetTable` (or list of such) 357 Source catalog(s) for computation 359 functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor` 360 Computations to do (functors that act on `parq`). 361 If a dict, the output 362 DataFrame will have columns keyed accordingly. 363 If a list, the column keys will come from the 364 `.shortname` attribute of each functor. 366 filt : `str` (optional) 367 Filter in which to calculate. If provided, 368 this will overwrite any existing `.filt` attribute 369 of the provided functors. 371 flags : `list` (optional) 372 List of flags (per-band) to include in output table. 374 refFlags : `list` (optional) 375 List of refFlags (only reference band) to include in output table. 379 _defaultRefFlags = []
380 _defaultFuncs = ((
'coord_ra',
RAColumn()),

    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None

    @property
    def func(self):
        additionalFuncs = dict(self._defaultFuncs)
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt

        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items()
                if func.noDup or func.dataset == 'ref']

    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # Map over multiple parquet tables if a list/tuple was provided
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: parallel mapping over catalogs has failed in initial tests
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
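
# Illustrative sketch only: evaluate a small functor collection plus per-band and
# reference flags over a multilevel deepCoadd_obj parquet file.  The file path and the
# flag/column names are hypothetical; accessing ``.df`` triggers the lazy computation.
def _examplePostprocessAnalysis():
    parq = ParquetTable(filename='/path/to/deepCoadd_obj.parq')  # hypothetical path
    funcs = {'ra': RAColumn(), 'dec': DecColumn()}
    analysis = PostprocessAnalysis(parq, funcs, filt='HSC-G',
                                   flags=['base_PixelFlags_flag'],
                                   refFlags=['detect_isPrimary'])
    return analysis.df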

class TransformCatalogBaseConfig(pexConfig.Config):
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
    )
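
# Illustrative only: a minimal functor file of the shape described in the
# `TransformCatalogBaseTask` docstring below, written to a hypothetical path and loaded
# with `CompositeFunctor.from_file`, as the task does with ``config.functorFile``.
def _exampleFunctorFile(path='/tmp/exampleFunctors.yaml'):
    yamlText = (
        "funcs:\n"
        "    count:\n"
        "        functor: Column\n"
        "        args:\n"
        "            - base_InputCount_value\n"
        "refFlags:\n"
        "    - merge_measurement_i\n"
    )
    with open(path, 'w') as f:
        f.write(yamlText)
    return CompositeFunctor.from_file(path)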
451 """Base class for transforming/standardizing a catalog 453 by applying functors that convert units and apply calibrations. 454 The purpose of this task is to perform a set of computations on 455 an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the 456 results to a new dataset (which needs to be declared in an `outputDataset` 459 The calculations to be performed are defined in a YAML file that specifies 460 a set of functors to be computed, provided as 461 a `--functorFile` config parameter. An example of such a YAML file 486 - base_InputCount_value 489 functor: DeconvolvedMoments 494 - merge_measurement_i 495 - merge_measurement_r 496 - merge_measurement_z 497 - merge_measurement_y 498 - merge_measurement_g 499 - base_PixelFlags_flag_inexact_psfCenter 502 The names for each entry under "func" will become the names of columns in the 503 output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`. 504 Positional arguments to be passed to each functor are in the `args` list, 505 and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`, 506 `'dataset'`) are treated as keyword arguments to be passed to the functor initialization. 508 The "refFlags" entry is shortcut for a bunch of `Column` functors with the original column and 509 taken from the `'ref'` dataset. 511 The "flags" entry will be expanded out per band. 513 Note, if `'filter'` is provided as part of the `dataId` when running this task (even though 514 `deepCoadd_obj` does not use `'filter'`), then this will override the `filt` kwargs 515 provided in the YAML file, and the calculations will be done in that filter. 517 This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object 518 to organize and excecute the calculations. 522 def _DefaultName(self):
523 raise NotImplementedError(
'Subclass must define "_DefaultName" attribute')
527 raise NotImplementedError(
'Subclass must define "outputDataset" attribute')
531 raise NotImplementedError(
'Subclass must define "inputDataset" attribute')
535 raise NotImplementedError(
'Subclass must define "ConfigClass" attribute')

    def runDataRef(self, dataRef):
        parq = dataRef.get()
        funcs = self.getFunctors()
        df = self.run(parq, funcs=funcs, dataId=dataRef.dataId)
        self.write(df, dataRef)
        return df

    def run(self, parq, funcs=None, dataId=None):
        """Do postprocessing calculations

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns
        dataId : dict, optional
            Used to add a `patchId` column to the output dataframe.

        Returns
        -------
        df : `pandas.DataFrame`
        """
        self.log.info("Transforming/standardizing the source table dataId: %s", dataId)

        filt = dataId.get('filter', None)
        df = self.transform(filt, parq, funcs, dataId).df
        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df

    def getFunctors(self):
        """Build the functor collection from the YAML file named in the config."""
        funcs = CompositeFunctor.from_file(self.config.functorFile)
        funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        return funcs

    def getAnalysis(self, parq, funcs=None, filt=None):
        if funcs is None:
            funcs = self.getFunctors()
        analysis = PostprocessAnalysis(parq, funcs, filt=filt)
        return analysis

    def transform(self, filt, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, filt=filt)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )
599 """No metadata to write. 604 class TransformObjectCatalogConfig(TransformCatalogBaseConfig):

class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Write per-filter column names with camelCase, else underscore. "
             "For example: gPsfFlux instead of g_PsfFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        default=True,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )
633 """Compute Flatted Object Table as defined in the DPDD 635 Do the same set of postprocessing calculations on all bands 637 This is identical to `TransformCatalogBaseTask`, except for that it does the 638 specified functor calculations for all filters present in the 639 input `deepCoadd_obj` table. Any specific `"filt"` keywords specified 640 by the YAML file will be superceded. 642 _DefaultName =
"transformObjectCatalog" 643 ConfigClass = TransformObjectCatalogConfig
645 inputDataset =
'deepCoadd_obj' 646 outputDataset =
'objectTable' 649 def _makeArgumentParser(cls):
652 ContainerClass=CoaddDataIdContainer,
653 help=
"data ID, e.g. --id tract=12345 patch=1,2")

    def run(self, parq, funcs=None, dataId=None):
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()
        # Perform the transform for each filter that is actually present in the input
        for filt in parq.columnLevelNames['filter']:
            if filt not in self.config.filterMap:
                self.log.info("Ignoring %s data in the input", filt)
                continue
            self.log.info("Transforming the catalog of filter %s", filt)
            result = self.transform(filt, parq, funcs, dataId)
            dfDict[filt] = result.df
            analysisDict[filt] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Fill NaNs in columns of other wanted filters
        for filt in self.config.filterMap:
            if filt not in dfDict:
                self.log.info("Adding empty columns for filter %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with filter as the first level
        df = pd.concat(dfDict, axis=1, names=['filter', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, self.config.filterMap, noDupCols=noDupCols,
                                camelCase=self.config.camelCase)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))

        return df
696 """Make self.refList from self.idList 698 Generate a list of data references given tract and/or patch. 699 This was adapted from `TractQADataIdContainer`, which was 700 `TractDataIdContainer` modifie to not require "filter". 701 Only existing dataRefs are returned. 703 def getPatchRefList(tract):
704 return [namespace.butler.dataRef(datasetType=self.datasetType,
706 patch=
"%d,%d" % patch.getIndex())
for patch
in tract]
708 tractRefs = defaultdict(list)
709 for dataId
in self.idList:
712 if "tract" in dataId:
713 tractId = dataId[
"tract"]
714 if "patch" in dataId:
715 tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
717 patch=dataId[
'patch']))
719 tractRefs[tractId] += getPatchRefList(skymap[tractId])
721 tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
724 for tractRefList
in tractRefs.values():
725 existingRefs = [ref
for ref
in tractRefList
if ref.datasetExists()]
726 outputRefList.append(existingRefs)

class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
740 """Write patch-merged source tables to a tract-level parquet file 742 _DefaultName =
"consolidateObjectTable" 743 ConfigClass = ConsolidateObjectTableConfig
745 inputDataset =
'objectTable' 746 outputDataset =
'objectTable_tract' 749 def _makeArgumentParser(cls):
753 help=
"data ID, e.g. --id tract=12345",
754 ContainerClass=TractObjectDataIdContainer)
758 df = pd.concat([patchRef.get().toDataFrame()
for patchRef
in patchRefList])
762 """No metadata to write. 767 class TransformSourceTableConfig(TransformCatalogBaseConfig):
772 """Transform/standardize a source catalog 774 _DefaultName =
"transformSourceTable" 775 ConfigClass = TransformSourceTableConfig
777 inputDataset =
'source' 778 outputDataset =
'sourceTable' 781 """No metadata to write. 786 def _makeArgumentParser(cls):
788 parser.add_id_argument(
"--id", datasetType=cls.
inputDataset,
790 help=
"data ID, e.g. --id visit=12345 ccd=0")
795 """DataIdContainer that groups sensor-level id's by visit 799 """Make self.refList from self.idList 801 Generate a list of data references grouped by visit. 805 namespace : `argparse.Namespace` 806 Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments 808 def ccdDataRefList(visitId):
809 """Get all possible ccds for a given visit""" 810 ccds = namespace.butler.queryMetadata(
'src', [
'ccd'], dataId={
'visit': visitId})
811 return [namespace.butler.dataRef(datasetType=self.datasetType,
813 ccd=ccd)
for ccd
in ccds]
815 visitRefs = defaultdict(list)
816 for dataId
in self.idList:
817 if "visit" in dataId:
818 visitId = dataId[
"visit"]
820 visitRefs[visitId].append(namespace.butler.dataRef(datasetType=self.datasetType,
821 visit=visitId, ccd=dataId[
'ccd']))
823 visitRefs[visitId] += ccdDataRefList(visitId)
825 for refList
in visitRefs.values():
826 existingRefs = [ref
for ref
in refList
if ref.datasetExists()]
827 outputRefList.append(existingRefs)
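
# Illustrative only (hypothetical data IDs): the defaultdict pattern used above to group
# sensor-level data IDs under their parent visit.
def _exampleGroupByVisit():
    dataIds = [{'visit': 100, 'ccd': 1}, {'visit': 100, 'ccd': 2}, {'visit': 101, 'ccd': 1}]
    byVisit = defaultdict(list)
    for dataId in dataIds:
        byVisit[dataId['visit']].append(dataId)
    return dict(byVisit)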

class ConsolidateSourceTableConfig(pexConfig.Config):
    pass


class ConsolidateSourceTableTask(CmdLineTask):
    """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`.
    """
    _DefaultName = 'consolidateSourceTable'
    ConfigClass = ConsolidateSourceTableConfig

    inputDataset = 'sourceTable'
    outputDataset = 'sourceTable_visit'

    def runDataRef(self, dataRefList):
        self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
        df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
        dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to write.
        """
        pass