lsst.pipe.tasks  8.5-hsc+2
postprocess.py
Go to the documentation of this file.
1 # This file is part of pipe_tasks
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 
22 import functools
23 import pandas as pd
24 from collections import defaultdict
25 
26 import lsst.geom
27 import lsst.pex.config as pexConfig
28 import lsst.pipe.base as pipeBase
29 import lsst.afw.table as afwTable
30 from lsst.meas.base import SingleFrameMeasurementTask
31 from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
32 from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer
33 
34 from .parquetTable import ParquetTable
35 from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
36 from .functors import CompositeFunctor, RAColumn, DecColumn, Column
37 
38 
39 def flattenFilters(df, filterDict, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
40  """Flattens a dataframe with multilevel column index
41  """
42  newDf = pd.DataFrame()
43  for filt, filtShort in filterDict.items():
44  subdf = df[filt]
45  columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
46  newColumns = {c: columnFormat.format(filtShort, c)
47  for c in subdf.columns if c not in noDupCols}
48  cols = list(newColumns.keys())
49  newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)
50 
51  newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
52  return newDf
53 
54 
55 class WriteObjectTableConfig(pexConfig.Config):
56  priorityList = pexConfig.ListField(
57  dtype=str,
58  default=[],
59  doc="Priority-ordered list of bands for the merge."
60  )
61  engine = pexConfig.Field(
62  dtype=str,
63  default="pyarrow",
64  doc="Parquet engine for writing (pyarrow or fastparquet)"
65  )
66  coaddName = pexConfig.Field(
67  dtype=str,
68  default="deep",
69  doc="Name of coadd"
70  )
71 
72  def validate(self):
73  pexConfig.Config.validate(self)
74  if len(self.priorityList) == 0:
75  raise RuntimeError("No priority list provided")
76 
77 
78 class WriteObjectTableTask(CmdLineTask):
79  """Write filter-merged source tables to parquet
80  """
81  _DefaultName = "writeObjectTable"
82  ConfigClass = WriteObjectTableConfig
83  RunnerClass = MergeSourcesRunner
84 
85  # Names of table datasets to be merged
86  inputDatasets = ('forced_src', 'meas', 'ref')
87 
88  # Tag of output dataset written by `MergeSourcesTask.write`
89  outputDataset = 'obj'
90 
91  def __init__(self, butler=None, schema=None, **kwargs):
92  # It is a shame that this class can't use the default init for CmdLineTask
93  # But to do so would require its own special task runner, which is many
94  # more lines of specialization, so this is how it is for now
95  CmdLineTask.__init__(self, **kwargs)
96 
97  def runDataRef(self, patchRefList):
98  """!
99  @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
100  subclasses that inherit from MergeSourcesTask.
101  @param[in] patchRefList list of data references for each filter
102  """
103  catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
104  dataId = patchRefList[0].dataId
105  mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
106  self.write(patchRefList[0], mergedCatalog)
107 
108  @classmethod
109  def _makeArgumentParser(cls):
110  """Create a suitable ArgumentParser.
111 
112  We will use the ArgumentParser to get a list of data
113  references for patches; the RunnerClass will sort them into lists
114  of data references for the same patch.
115 
116  References first of self.inputDatasets, rather than
117  self.inputDataset
118  """
120 
121  def readCatalog(self, patchRef):
122  """Read input catalogs
123 
124  Read all the input datasets given by the 'inputDatasets'
125  attribute.
126 
127  Parameters
128  ----------
129  patchRef : `lsst.daf.persistence.ButlerDataRef`
130  Data reference for patch
131 
132  Returns
133  -------
134  Tuple consisting of filter name and a dict of catalogs, keyed by
135  dataset name
136  """
137  filterName = patchRef.dataId["filter"]
138  catalogDict = {}
139  for dataset in self.inputDatasets:
140  catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
141  self.log.info("Read %d sources from %s for filter %s: %s" %
142  (len(catalog), dataset, filterName, patchRef.dataId))
143  catalogDict[dataset] = catalog
144  return filterName, catalogDict
145 
146  def run(self, catalogs, tract, patch):
147  """Merge multiple catalogs.
148 
149  Parameters
150  ----------
151  catalogs : `dict`
152  Mapping from filter names to dict of catalogs.
153  tract : int
154  tractId to use for the tractId column
155  patch : str
156  patchId to use for the patchId column
157 
158  Returns
159  -------
160  catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
161  Merged dataframe, with each column prefixed by
162  `filter_tag(filt)`, wrapped in the parquet writer shim class.
163  """
164 
165  dfs = []
166  for filt, tableDict in catalogs.items():
167  for dataset, table in tableDict.items():
168  # Convert afwTable to pandas DataFrame
169  df = table.asAstropy().to_pandas().set_index('id', drop=True)
170 
171  # Sort columns by name, to ensure matching schema among patches
172  df = df.reindex(sorted(df.columns), axis=1)
173  df['tractId'] = tract
174  df['patchId'] = patch
175 
176  # Make columns a 3-level MultiIndex
177  df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
178  names=('dataset', 'filter', 'column'))
179  dfs.append(df)
180 
181  catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
182  return ParquetTable(dataFrame=catalog)
183 
184  def write(self, patchRef, catalog):
185  """Write the output.
186 
187  Parameters
188  ----------
189  catalog : `ParquetTable`
190  Catalog to write
191  patchRef : `lsst.daf.persistence.ButlerDataRef`
192  Data reference for patch
193  """
194  patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
195  # since the filter isn't actually part of the data ID for the dataset we're saving,
196  # it's confusing to see it in the log message, even if the butler simply ignores it.
197  mergeDataId = patchRef.dataId.copy()
198  del mergeDataId["filter"]
199  self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
200 
201  def writeMetadata(self, dataRefList):
202  """No metadata to write, and not sure how to write it for a list of dataRefs.
203  """
204  pass
205 
206 
207 class WriteSourceTableConfig(pexConfig.Config):
208  doApplyExternalPhotoCalib = pexConfig.Field(
209  dtype=bool,
210  default=False,
211  doc=("Add local photoCalib columns from the calexp.photoCalib? Should only set True if "
212  "generating Source Tables from older src tables which do not already have local calib columns")
213  )
214  doApplyExternalSkyWcs = pexConfig.Field(
215  dtype=bool,
216  default=False,
217  doc=("Add local WCS columns from the calexp.wcs? Should only set True if "
218  "generating Source Tables from older src tables which do not already have local calib columns")
219  )
220 
221 
222 class WriteSourceTableTask(CmdLineTask):
223  """Write source table to parquet
224  """
225  _DefaultName = "writeSourceTable"
226  ConfigClass = WriteSourceTableConfig
227 
228  def runDataRef(self, dataRef):
229  src = dataRef.get('src')
230  if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
231  src = self.addCalibColumns(src, dataRef)
232 
233  ccdVisitId = dataRef.get('ccdExposureId')
234  result = self.run(src, ccdVisitId=ccdVisitId)
235  dataRef.put(result.table, 'source')
236  return result
237 
238  def run(self, catalog, ccdVisitId=None):
239  """Convert `src` catalog to parquet
240 
241  Parameters
242  ----------
243  catalog: `afwTable.SourceCatalog`
244  catalog to be converted
245  ccdVisitId: `int`
246  ccdVisitId to be added as a column
247 
248  Returns
249  -------
250  result : `lsst.pipe.base.Struct`
251  ``table``
252  `ParquetTable` version of the input catalog
253  """
254  self.log.info("Generating parquet table from src catalog")
255  df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
256  df['ccdVisitId'] = ccdVisitId
257  return pipeBase.Struct(table=ParquetTable(dataFrame=df))
258 
259  def addCalibColumns(self, catalog, dataRef):
260  """Add columns with local calibration evaluated at each centroid
261 
262  for backwards compatibility with old repos.
263  This exists for the purpose of converting old src catalogs
264  (which don't have the expected local calib columns) to Source Tables.
265 
266  Parameters
267  ----------
268  catalog: `afwTable.SourceCatalog`
269  catalog to which calib columns will be added
270  dataRef: `lsst.daf.persistence.ButlerDataRef
271  for fetching the calibs from disk.
272 
273  Returns
274  -------
275  newCat: `afwTable.SourceCatalog`
276  Source Catalog with requested local calib columns
277  """
278  measureConfig = SingleFrameMeasurementTask.ConfigClass()
279  measureConfig.doReplaceWithNoise = False
280 
281  # Just need the WCS or the PhotoCalib attached to an exposue
282  exposure = dataRef.get('calexp_sub',
284 
285  aliasMap = catalog.schema.getAliasMap()
286  mapper = afwTable.SchemaMapper(catalog.schema)
287  mapper.addMinimalSchema(catalog.schema, True)
288  schema = mapper.getOutputSchema()
289 
290  exposureIdInfo = dataRef.get("expIdInfo")
291  measureConfig.plugins.names = []
292  if self.config.doApplyExternalSkyWcs:
293  plugin = 'base_LocalWcs'
294  if plugin in schema:
295  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
296  else:
297  measureConfig.plugins.names.add(plugin)
298 
299  if self.config.doApplyExternalPhotoCalib:
300  plugin = 'base_LocalPhotoCalib'
301  if plugin in schema:
302  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
303  else:
304  measureConfig.plugins.names.add(plugin)
305 
306  measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
307  schema.setAliasMap(aliasMap)
308  newCat = afwTable.SourceCatalog(schema)
309  newCat.extend(catalog, mapper=mapper)
310  measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
311  return newCat
312 
313  def writeMetadata(self, dataRef):
314  """No metadata to write.
315  """
316  pass
317 
318  @classmethod
319  def _makeArgumentParser(cls):
320  parser = ArgumentParser(name=cls._DefaultName)
321  parser.add_id_argument("--id", 'src',
322  help="data ID, e.g. --id visit=12345 ccd=0")
323  return parser
324 
325 
326 class PostprocessAnalysis(object):
327  """Calculate columns from ParquetTable
328 
329  This object manages and organizes an arbitrary set of computations
330  on a catalog. The catalog is defined by a
331  `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
332  `deepCoadd_obj` dataset, and the computations are defined by a collection
333  of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
334  a `CompositeFunctor`).
335 
336  After the object is initialized, accessing the `.df` attribute (which
337  holds the `pandas.DataFrame` containing the results of the calculations) triggers
338  computation of said dataframe.
339 
340  One of the conveniences of using this object is the ability to define a desired common
341  filter for all functors. This enables the same functor collection to be passed to
342  several different `PostprocessAnalysis` objects without having to change the original
343  functor collection, since the `filt` keyword argument of this object triggers an
344  overwrite of the `filt` property for all functors in the collection.
345 
346  This object also allows a list of refFlags to be passed, and defines a set of default
347  refFlags that are always included even if not requested.
348 
349  If a list of `ParquetTable` object is passed, rather than a single one, then the
350  calculations will be mapped over all the input catalogs. In principle, it should
351  be straightforward to parallelize this activity, but initial tests have failed
352  (see TODO in code comments).
353 
354  Parameters
355  ----------
356  parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
357  Source catalog(s) for computation
358 
359  functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
360  Computations to do (functors that act on `parq`).
361  If a dict, the output
362  DataFrame will have columns keyed accordingly.
363  If a list, the column keys will come from the
364  `.shortname` attribute of each functor.
365 
366  filt : `str` (optional)
367  Filter in which to calculate. If provided,
368  this will overwrite any existing `.filt` attribute
369  of the provided functors.
370 
371  flags : `list` (optional)
372  List of flags (per-band) to include in output table.
373 
374  refFlags : `list` (optional)
375  List of refFlags (only reference band) to include in output table.
376 
377 
378  """
379  _defaultRefFlags = []
380  _defaultFuncs = (('coord_ra', RAColumn()),
381  ('coord_dec', DecColumn()))
382 
383  def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
384  self.parq = parq
385  self.functors = functors
386 
387  self.filt = filt
388  self.flags = list(flags) if flags is not None else []
389  self.refFlags = list(self._defaultRefFlags)
390  if refFlags is not None:
391  self.refFlags += list(refFlags)
392 
393  self._df = None
394 
395  @property
396  def defaultFuncs(self):
397  funcs = dict(self._defaultFuncs)
398  return funcs
399 
400  @property
401  def func(self):
402  additionalFuncs = self.defaultFuncs
403  additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
404  additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})
405 
406  if isinstance(self.functors, CompositeFunctor):
407  func = self.functors
408  else:
409  func = CompositeFunctor(self.functors)
410 
411  func.funcDict.update(additionalFuncs)
412  func.filt = self.filt
413 
414  return func
415 
416  @property
417  def noDupCols(self):
418  return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']
419 
420  @property
421  def df(self):
422  if self._df is None:
423  self.compute()
424  return self._df
425 
426  def compute(self, dropna=False, pool=None):
427  # map over multiple parquet tables
428  if type(self.parq) in (list, tuple):
429  if pool is None:
430  dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
431  else:
432  # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
433  dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
434  self._df = pd.concat(dflist)
435  else:
436  self._df = self.func(self.parq, dropna=dropna)
437 
438  return self._df
439 
440 
441 class TransformCatalogBaseConfig(pexConfig.Config):
442  functorFile = pexConfig.Field(
443  dtype=str,
444  doc='Path to YAML file specifying functors to be computed',
445  default=None,
446  optional=True
447  )
448 
449 
450 class TransformCatalogBaseTask(CmdLineTask):
451  """Base class for transforming/standardizing a catalog
452 
453  by applying functors that convert units and apply calibrations.
454  The purpose of this task is to perform a set of computations on
455  an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
456  results to a new dataset (which needs to be declared in an `outputDataset`
457  attribute).
458 
459  The calculations to be performed are defined in a YAML file that specifies
460  a set of functors to be computed, provided as
461  a `--functorFile` config parameter. An example of such a YAML file
462  is the following:
463 
464  funcs:
465  psfMag:
466  functor: Mag
467  args:
468  - base_PsfFlux
469  filt: HSC-G
470  dataset: meas
471  cmodel_magDiff:
472  functor: MagDiff
473  args:
474  - modelfit_CModel
475  - base_PsfFlux
476  filt: HSC-G
477  gauss_magDiff:
478  functor: MagDiff
479  args:
480  - base_GaussianFlux
481  - base_PsfFlux
482  filt: HSC-G
483  count:
484  functor: Column
485  args:
486  - base_InputCount_value
487  filt: HSC-G
488  deconvolved_moments:
489  functor: DeconvolvedMoments
490  filt: HSC-G
491  dataset: forced_src
492  refFlags:
493  - calib_psfUsed
494  - merge_measurement_i
495  - merge_measurement_r
496  - merge_measurement_z
497  - merge_measurement_y
498  - merge_measurement_g
499  - base_PixelFlags_flag_inexact_psfCenter
500  - detect_isPrimary
501 
502  The names for each entry under "func" will become the names of columns in the
503  output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
504  Positional arguments to be passed to each functor are in the `args` list,
505  and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
506  `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.
507 
508  The "refFlags" entry is shortcut for a bunch of `Column` functors with the original column and
509  taken from the `'ref'` dataset.
510 
511  The "flags" entry will be expanded out per band.
512 
513  Note, if `'filter'` is provided as part of the `dataId` when running this task (even though
514  `deepCoadd_obj` does not use `'filter'`), then this will override the `filt` kwargs
515  provided in the YAML file, and the calculations will be done in that filter.
516 
517  This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
518  to organize and excecute the calculations.
519 
520  """
521  @property
522  def _DefaultName(self):
523  raise NotImplementedError('Subclass must define "_DefaultName" attribute')
524 
525  @property
526  def outputDataset(self):
527  raise NotImplementedError('Subclass must define "outputDataset" attribute')
528 
529  @property
530  def inputDataset(self):
531  raise NotImplementedError('Subclass must define "inputDataset" attribute')
532 
533  @property
534  def ConfigClass(self):
535  raise NotImplementedError('Subclass must define "ConfigClass" attribute')
536 
537  def runDataRef(self, dataRef):
538  parq = dataRef.get()
539  funcs = self.getFunctors()
540  df = self.run(parq, funcs=funcs, dataId=dataRef.dataId)
541  self.write(df, dataRef)
542  return df
543 
544  def run(self, parq, funcs=None, dataId=None):
545  """Do postprocessing calculations
546 
547  Takes a `ParquetTable` object and dataId,
548  returns a dataframe with results of postprocessing calculations.
549 
550  Parameters
551  ----------
552  parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
553  ParquetTable from which calculations are done.
554  funcs : `lsst.pipe.tasks.functors.Functors`
555  Functors to apply to the table's columns
556  dataId : dict, optional
557  Used to add a `patchId` column to the output dataframe.
558 
559  Returns
560  ------
561  `pandas.DataFrame`
562 
563  """
564  self.log.info("Transforming/standardizing the source table dataId: %s", dataId)
565 
566  filt = dataId.get('filter', None)
567  df = self.transform(filt, parq, funcs, dataId).df
568  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
569  return df
570 
571  def getFunctors(self):
572  funcs = CompositeFunctor.from_file(self.config.functorFile)
573  funcs.update(dict(PostprocessAnalysis._defaultFuncs))
574  return funcs
575 
576  def getAnalysis(self, parq, funcs=None, filt=None):
577  # Avoids disk access if funcs is passed
578  if funcs is None:
579  funcs = self.getFunctors()
580  analysis = PostprocessAnalysis(parq, funcs, filt=filt)
581  return analysis
582 
583  def transform(self, filt, parq, funcs, dataId):
584  analysis = self.getAnalysis(parq, funcs=funcs, filt=filt)
585  df = analysis.df
586  if dataId is not None:
587  for key, value in dataId.items():
588  df[key] = value
589 
590  return pipeBase.Struct(
591  df=df,
592  analysis=analysis
593  )
594 
595  def write(self, df, parqRef):
596  parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)
597 
598  def writeMetadata(self, dataRef):
599  """No metadata to write.
600  """
601  pass
602 
603 
604 class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
605  coaddName = pexConfig.Field(
606  dtype=str,
607  default="deep",
608  doc="Name of coadd"
609  )
610  filterMap = pexConfig.DictField(
611  keytype=str,
612  itemtype=str,
613  default={},
614  doc=("Dictionary mapping full filter name to short one for column name munging."
615  "These filters determine the output columns no matter what filters the "
616  "input data actually contain.")
617  )
618  camelCase = pexConfig.Field(
619  dtype=bool,
620  default=True,
621  doc=("Write per-filter columns names with camelCase, else underscore "
622  "For example: gPsfFlux instead of g_PsfFlux.")
623  )
624  multilevelOutput = pexConfig.Field(
625  dtype=bool,
626  default=False,
627  doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
628  "and name-munged (False).")
629  )
630 
631 
633  """Compute Flatted Object Table as defined in the DPDD
634 
635  Do the same set of postprocessing calculations on all bands
636 
637  This is identical to `TransformCatalogBaseTask`, except for that it does the
638  specified functor calculations for all filters present in the
639  input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
640  by the YAML file will be superceded.
641  """
642  _DefaultName = "transformObjectCatalog"
643  ConfigClass = TransformObjectCatalogConfig
644 
645  inputDataset = 'deepCoadd_obj'
646  outputDataset = 'objectTable'
647 
648  @classmethod
649  def _makeArgumentParser(cls):
650  parser = ArgumentParser(name=cls._DefaultName)
651  parser.add_id_argument("--id", cls.inputDataset,
652  ContainerClass=CoaddDataIdContainer,
653  help="data ID, e.g. --id tract=12345 patch=1,2")
654  return parser
655 
656  def run(self, parq, funcs=None, dataId=None):
657  dfDict = {}
658  analysisDict = {}
659  templateDf = pd.DataFrame()
660  # Perform transform for data of filters that exist in parq and are
661  # specified in config.filterMap
662  for filt in parq.columnLevelNames['filter']:
663  if filt not in self.config.filterMap:
664  self.log.info("Ignoring %s data in the input", filt)
665  continue
666  self.log.info("Transforming the catalog of filter %s", filt)
667  result = self.transform(filt, parq, funcs, dataId)
668  dfDict[filt] = result.df
669  analysisDict[filt] = result.analysis
670  if templateDf.empty:
671  templateDf = result.df
672 
673  # Fill NaNs in columns of other wanted filters
674  for filt in self.config.filterMap:
675  if filt not in dfDict:
676  self.log.info("Adding empty columns for filter %s", filt)
677  dfDict[filt] = pd.DataFrame().reindex_like(templateDf)
678 
679  # This makes a multilevel column index, with filter as first level
680  df = pd.concat(dfDict, axis=1, names=['filter', 'column'])
681 
682  if not self.config.multilevelOutput:
683  noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
684  if dataId is not None:
685  noDupCols += list(dataId.keys())
686  df = flattenFilters(df, self.config.filterMap, noDupCols=noDupCols,
687  camelCase=self.config.camelCase)
688 
689  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
690  return df
691 
692 
694 
695  def makeDataRefList(self, namespace):
696  """Make self.refList from self.idList
697 
698  Generate a list of data references given tract and/or patch.
699  This was adapted from `TractQADataIdContainer`, which was
700  `TractDataIdContainer` modifie to not require "filter".
701  Only existing dataRefs are returned.
702  """
703  def getPatchRefList(tract):
704  return [namespace.butler.dataRef(datasetType=self.datasetType,
705  tract=tract.getId(),
706  patch="%d,%d" % patch.getIndex()) for patch in tract]
707 
708  tractRefs = defaultdict(list) # Data references for each tract
709  for dataId in self.idList:
710  skymap = self.getSkymap(namespace)
711 
712  if "tract" in dataId:
713  tractId = dataId["tract"]
714  if "patch" in dataId:
715  tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
716  tract=tractId,
717  patch=dataId['patch']))
718  else:
719  tractRefs[tractId] += getPatchRefList(skymap[tractId])
720  else:
721  tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
722  for tract in skymap)
723  outputRefList = []
724  for tractRefList in tractRefs.values():
725  existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
726  outputRefList.append(existingRefs)
727 
728  self.refList = outputRefList
729 
730 
731 class ConsolidateObjectTableConfig(pexConfig.Config):
732  coaddName = pexConfig.Field(
733  dtype=str,
734  default="deep",
735  doc="Name of coadd"
736  )
737 
738 
739 class ConsolidateObjectTableTask(CmdLineTask):
740  """Write patch-merged source tables to a tract-level parquet file
741  """
742  _DefaultName = "consolidateObjectTable"
743  ConfigClass = ConsolidateObjectTableConfig
744 
745  inputDataset = 'objectTable'
746  outputDataset = 'objectTable_tract'
747 
748  @classmethod
749  def _makeArgumentParser(cls):
750  parser = ArgumentParser(name=cls._DefaultName)
751 
752  parser.add_id_argument("--id", cls.inputDataset,
753  help="data ID, e.g. --id tract=12345",
754  ContainerClass=TractObjectDataIdContainer)
755  return parser
756 
757  def runDataRef(self, patchRefList):
758  df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
759  patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
760 
761  def writeMetadata(self, dataRef):
762  """No metadata to write.
763  """
764  pass
765 
766 
767 class TransformSourceTableConfig(TransformCatalogBaseConfig):
768  pass
769 
770 
772  """Transform/standardize a source catalog
773  """
774  _DefaultName = "transformSourceTable"
775  ConfigClass = TransformSourceTableConfig
776 
777  inputDataset = 'source'
778  outputDataset = 'sourceTable'
779 
780  def writeMetadata(self, dataRef):
781  """No metadata to write.
782  """
783  pass
784 
785  @classmethod
786  def _makeArgumentParser(cls):
787  parser = ArgumentParser(name=cls._DefaultName)
788  parser.add_id_argument("--id", datasetType=cls.inputDataset,
789  level="sensor",
790  help="data ID, e.g. --id visit=12345 ccd=0")
791  return parser
792 
793 
794 class VisitDataIdContainer(DataIdContainer):
795  """DataIdContainer that groups sensor-level id's by visit
796  """
797 
798  def makeDataRefList(self, namespace):
799  """Make self.refList from self.idList
800 
801  Generate a list of data references grouped by visit.
802 
803  Parameters
804  ----------
805  namespace : `argparse.Namespace`
806  Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
807  """
808  def ccdDataRefList(visitId):
809  """Get all possible ccds for a given visit"""
810  ccds = namespace.butler.queryMetadata('src', ['ccd'], dataId={'visit': visitId})
811  return [namespace.butler.dataRef(datasetType=self.datasetType,
812  visit=visitId,
813  ccd=ccd) for ccd in ccds]
814  # Group by visits
815  visitRefs = defaultdict(list)
816  for dataId in self.idList:
817  if "visit" in dataId:
818  visitId = dataId["visit"]
819  if "ccd" in dataId:
820  visitRefs[visitId].append(namespace.butler.dataRef(datasetType=self.datasetType,
821  visit=visitId, ccd=dataId['ccd']))
822  else:
823  visitRefs[visitId] += ccdDataRefList(visitId)
824  outputRefList = []
825  for refList in visitRefs.values():
826  existingRefs = [ref for ref in refList if ref.datasetExists()]
827  outputRefList.append(existingRefs)
828 
829  self.refList = outputRefList
830 
831 
832 class ConsolidateSourceTableConfig(pexConfig.Config):
833  pass
834 
835 
836 class ConsolidateSourceTableTask(CmdLineTask):
837  """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`
838  """
839  _DefaultName = 'consolidateSourceTable'
840  ConfigClass = ConsolidateSourceTableConfig
841 
842  inputDataset = 'sourceTable'
843  outputDataset = 'sourceTable_visit'
844 
845  def runDataRef(self, dataRefList):
846  self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
847  df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
848  dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
849 
850  @classmethod
851  def _makeArgumentParser(cls):
852  parser = ArgumentParser(name=cls._DefaultName)
853 
854  parser.add_id_argument("--id", cls.inputDataset,
855  help="data ID, e.g. --id visit=12345",
856  ContainerClass=VisitDataIdContainer)
857  return parser
858 
859  def writeMetadata(self, dataRef):
860  """No metadata to write.
861  """
862  pass
863 
864  def writeConfig(self, butler, clobber=False, doBackup=True):
865  """No config to write.
866  """
867  pass
def flattenFilters(df, filterDict, noDupCols=['coord_ra', coord_dec, camelCase=False)
Definition: postprocess.py:39
def makeMergeArgumentParser(name, dataset)
Create a suitable ArgumentParser.
def run(self, parq, funcs=None, dataId=None)
Definition: postprocess.py:544
def __init__(self, butler=None, schema=None, kwargs)
Definition: postprocess.py:91
def compute(self, dropna=False, pool=None)
Definition: postprocess.py:426
def run(self, catalogs, tract, patch)
Definition: postprocess.py:146
def runDataRef(self, patchRefList)
Merge coadd sources from multiple bands.
Definition: postprocess.py:97
def writeConfig(self, butler, clobber=False, doBackup=True)
Definition: postprocess.py:864
def run(self, parq, funcs=None, dataId=None)
Definition: postprocess.py:656
def __init__(self, parq, functors, filt=None, flags=None, refFlags=None)
Definition: postprocess.py:383
def run(self, catalog, ccdVisitId=None)
Definition: postprocess.py:238
def transform(self, filt, parq, funcs, dataId)
Definition: postprocess.py:583
def getAnalysis(self, parq, funcs=None, filt=None)
Definition: postprocess.py:576
def addCalibColumns(self, catalog, dataRef)
Definition: postprocess.py:259