Program Listing for File SequenceParser.h

Return to documentation for file (src/smartpeak/include/SmartPeak/io/SequenceParser.h)

// --------------------------------------------------------------------------
//   SmartPeak -- Fast and Accurate CE-, GC- and LC-MS(/MS) Data Processing
// --------------------------------------------------------------------------
// Copyright The SmartPeak Team -- Novo Nordisk Foundation
// Center for Biosustainability, Technical University of Denmark 2018-2021.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: Douglas McCloskey $
// $Authors: Douglas McCloskey, Pasquale Domenico Colaianni $
// --------------------------------------------------------------------------

#pragma once

#include <SmartPeak/core/FeatureMetadata.h>
#include <SmartPeak/core/SampleType.h>
#include <SmartPeak/core/SequenceHandler.h>
#include <SmartPeak/core/Utilities.h>
#include <plog/Log.h>
#include <unsupported/Eigen/CXX11/Tensor>

namespace SmartPeak
{
  class SequenceParser
  {
public:
    SequenceParser()                                 = delete;
    ~SequenceParser()                                = delete;
    SequenceParser(const SequenceParser&)            = delete;
    SequenceParser& operator=(const SequenceParser&) = delete;
    SequenceParser(SequenceParser&&)                 = delete;
    SequenceParser& operator=(SequenceParser&&)      = delete;

    static constexpr char const * const s_PeptideRef {"PeptideRef"};
    static constexpr char const * const s_native_id {"native_id"};

    static void readSequenceFile(
      SequenceHandler& sequenceHandler,
      const std::string& pathname,
      const std::string& delimiter
    );

    static void makeSequenceFileAnalyst(
      SequenceHandler& sequenceHandler,
      std::vector<std::vector<std::string>>& rows_out,
      std::vector<std::string>& headers_out
    );

    static void writeSequenceFileAnalyst(
      SequenceHandler& sequenceHandler,
      const std::string& filename,
      const std::string& delimiter = "\t"
    );

    static void makeSequenceFileMasshunter(
      SequenceHandler& sequenceHandler,
      std::vector<std::vector<std::string>>& rows_out,
      std::vector<std::string>& headers_out
    );

    static void writeSequenceFileMasshunter(
      SequenceHandler& sequenceHandler,
      const std::string& filename,
      const std::string& delimiter = "\t"
    );

    static void makeSequenceFileXcalibur(
      SequenceHandler& sequenceHandler,
      std::vector<std::vector<std::string>>& rows_out,
      std::vector<std::string>& headers_out
    );

    static void writeSequenceFileXcalibur(
      SequenceHandler& sequenceHandler,
      const std::string& filename,
      const std::string& delimiter = "\t"
    );

    template<typename T>
    static bool validateAndConvert(
      const std::string& s,
      T& output
    )
    {
      if (Utilities::trimString(s).empty()) {
        return false;
      }

      if (std::is_same<T, int>::value) {
        output = std::stoi(s);
      } else if (std::is_same<T, float>::value) {
        output = std::stof(s);
      } else {
        LOGE << "Case not handled";
        return false;
      }

      return true;
    }

    /*
    @brief make a table (row major) of string representations of
      all meta_data for all sample_types in the feature history.

    NOTE: Internally, to_string() rounds at 1e-6. Therefore, some precision might be lost.
    */
    static void makeDataTableFromMetaValue(
      const SequenceHandler& sequenceHandler,
      std::vector<std::vector<std::string>>& rows_out,
      std::vector<std::string>& headers_out,
      const std::vector<std::string>& meta_data,
      const std::set<SampleType>& sample_types,
      const std::set<std::string>& sample_names,
      const std::set<std::string>& component_group_names,
      const std::set<std::string>& component_names
    );

    static bool writeDataTableFromMetaValue(
      const SequenceHandler& sequenceHandler,
      const std::string& filename,
      const std::vector<FeatureMetadata>& meta_data,
      const std::set<SampleType>& sample_types
    );

    struct Row
    {
      Row() = default;
      ~Row() = default;
      Row(const Row&) = default;
      Row& operator=(const Row&) = default;
      Row(Row&&) = default;
      Row& operator=(Row&&) = default;

      Row(const std::string& cgn, const std::string& cn, const std::string& mvn) :
        component_group_name(cgn),
        component_name(cn),
        meta_value_name(mvn) {}

      std::string component_group_name;
      std::string component_name;
      std::string meta_value_name;
    };

    struct Row_less
    {
      bool operator()(const Row& lhs, const Row& rhs) const
      {
        return lhs.component_group_name + lhs.component_name + lhs.meta_value_name <
          rhs.component_group_name + rhs.component_name + rhs.meta_value_name;
      }
    };

    static void makeDataMatrixFromMetaValue(
      const SequenceHandler& sequenceHandler,
      Eigen::Tensor<float,2>& data_out,
      Eigen::Tensor<std::string, 1>& columns_out,
      Eigen::Tensor<std::string, 2>& rows_out,
      const std::vector<std::string>& meta_data,
      const std::set<SampleType>& sample_types,
      const std::set<std::string>& sample_names,
      const std::set<std::string>& component_group_names,
      const std::set<std::string>& component_names
    );

    // NOTE: Internally, to_string() rounds at 1e-6. Therefore, some precision might be lost.
    static bool writeDataMatrixFromMetaValue(
      const SequenceHandler& sequenceHandler,
      const std::string& filename,
      const std::vector<FeatureMetadata>& meta_data,
      const std::set<SampleType>& sample_types
    );
  };
}