Freecontact/src/ write bioxsd.cpp

From Rost Lab Open
/*  FreeContact - program to predict protein residue contacts from a sufficiently large multiple alignment
*   Copyright (C) 2012-2013  Laszlo Kajan <lkajan@rostlab.org> Rost Lab, Technical University of Munich, Germany
*   
*   This program is free software: you can redistribute it and/or modify
*   it under the terms of the GNU General Public License as published by
*   the Free Software Foundation, either version 3 of the License, or
*   (at your option) any later version.
*   
*   This program is distributed in the hope that it will be useful,
*   but WITHOUT ANY WARRANTY; without even the implied warranty of
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*   GNU General Public License for more details.
*   
*   You should have received a copy of the GNU General Public License
*   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
#include "fc.h"
#include "freecontact_input_xsd.hxx"
#include "freecontact_output_xsd.hxx"
#include "freecontact.h"

using namespace std;
namespace bo = boost;
namespace fc = freecontact;
namespace fix = freecontact_input_xsd;
namespace fox = freecontact_output_xsd;

auto_ptr<BioXSD::Score> _get_bxScore(const fc::contact_t& __ct, const std::string& __scid);

// forward declaration in freecontact.cpp
std::string         _xml_flatali(std::istream& __is, bool dbg)
{
    string ret;
    try
    {
        auto_ptr<fix::FcAlignment> xml( fix::alignment(__is) );
        ret = xml->flat(); // lkajan: this may be empty, as <alignment> is a choice of <flat>,...
    }
    catch (const xml_schema::exception& e)
    {
        cerr << e << "\n"; throw;
    }
    //
    return ret;
}

// forward declaration in freecontact.cpp
void                _write_bioxsd(const fc::ali_t& __ali, const resn_mapper_t &__resn, const map<string, vector<fc::contact_t> >& __res, bool dbg)
{
    // XML generation - deepest first
    auto_ptr<BioXSD::GeneralSequenceSegment> seqSeg_p(new BioXSD::SequenceSegment());
    seqSeg_p->min( __resn.qs ); seqSeg_p->max( __resn.qs+__resn.q.length()-1 );

    auto_ptr<BioXSD::SequencePosition> seqPos_p(new BioXSD::SequencePosition());
    seqPos_p->segment().push_back( seqSeg_p  );

    auto_ptr<BioXSD::SequenceReference> seqRef_p(new BioXSD::SequenceReference());
    seqRef_p->subsequencePosition( seqPos_p );

    string ucq(__resn.q); int (*tou_p)(int) = toupper; transform( __resn.q.begin(), __resn.q.end(), ucq.begin(), tou_p );
    auto_ptr<BioXSD::BiosequenceRecord> seqRec_p( new BioXSD::GeneralAminoacidSequenceRecord( BioXSD::GeneralAminoacidSequence( ucq )));
    seqRec_p->reference( seqRef_p );
    //seqRec_p->name("query"); // lkajan: name is not mandatory

    auto_ptr<BioXSD::referenceSequence> refSeq_p(new BioXSD::referenceSequence());
    refSeq_p->sequenceRecord( seqRec_p );

    BioXSD::FeatureRecord featRec( refSeq_p );


    auto_ptr<BioXSD::SemanticConcept> metCatCon_p(new BioXSD::SemanticConcept());
    metCatCon_p->ontologyName("EDAM");
    metCatCon_p->accession("operation_0272");
    metCatCon_p->conceptUri("http://edamontology.org/operation_0272");
    metCatCon_p->term("Residue interaction prediction");

    auto_ptr<BioXSD::EntryReference> metCit_p(new BioXSD::EntryReference());
    metCit_p->date(xml_schema::date(2013, 3, 1));

    auto_ptr<BioXSD::method1> method_p(new BioXSD::method1("freecontact", "M#fc"));
    method_p->categoryConcept().push_back( metCatCon_p );
    method_p->citation().push_back( metCit_p );

    auto_ptr<BioXSD::scoreType> sc_l1_p(new BioXSD::scoreType("S#l1"));
    {
        auto_ptr<BioXSD::EntryReference> metCit_p(new BioXSD::EntryReference());
        metCit_p->dbName("PubMed");
        metCit_p->accession("22101153");
        metCit_p->entryUri("http://www.ncbi.nlm.nih.gov/pubmed/22101153");
        metCit_p->date(xml_schema::date(2011, 11, 17));

        auto_ptr<BioXSD::Method> method_p(new BioXSD::Method("l1-norm"));
        method_p->citation().push_back( metCit_p );

        sc_l1_p->method().push_back( method_p );
    }

    auto_ptr<BioXSD::scoreType> sc_MI_p(new BioXSD::scoreType("S#MI"));
    {
        auto_ptr<BioXSD::EntryReference> metCit_p(new BioXSD::EntryReference());
        metCit_p->dbName("PubMed");
        metCit_p->accession("22163331");
        metCit_p->entryUri("http://www.ncbi.nlm.nih.gov/pubmed/22163331");
        metCit_p->date(xml_schema::date(2011, 12, 7));

        auto_ptr<BioXSD::Method> method_p(new BioXSD::Method("Mutual information"));
        method_p->citation().push_back( metCit_p );

        sc_MI_p->method().push_back( method_p );
    }

    auto_ptr<BioXSD::scoreType> sc_fr_p(new BioXSD::scoreType("S#fr"));
    {
        auto_ptr<BioXSD::EntryReference> metCit_p(new BioXSD::EntryReference());
        metCit_p->dbName("PubMed");
        metCit_p->accession("23410359");
        metCit_p->entryUri("http://www.ncbi.nlm.nih.gov/pubmed/23410359");
        metCit_p->date(xml_schema::date(2013, 1, 11));

        auto_ptr<BioXSD::Method> method_p(new BioXSD::Method("Frobenius norm"));
        method_p->citation().push_back( metCit_p );

        sc_fr_p->method().push_back( method_p );
    }

    auto_ptr<BioXSD::SemanticConcept> featCon_p(new BioXSD::SemanticConcept());
    // TODO: find seq ontology term for contact
    //featCon_p->ontologyName("EDAM");
    //featCon_p->accession("data_1547");
    //featCon_p->conceptUri("http://edamontology.org/data_1547");
    featCon_p->term("Proximity of residues (Cb-Cb<8A), 'contact'");

    auto_ptr<BioXSD::FeatureType> feat_p(new BioXSD::FeatureType());
    feat_p->name("Proximity of residues");
    feat_p->equalConcept().push_back( featCon_p );

    auto_ptr<BioXSD::condensedReferences1> cref_p(new BioXSD::condensedReferences1());
    cref_p->methodIdRef("M#fc");

    auto_ptr<BioXSD::annotation1> ann_p(new BioXSD::annotation1(feat_p));
    ann_p->condensedReferences( cref_p );

    // occurrences
    size_t l1norm_i = 0, MI_i = 0, fro_i = 0;
    for(size_t i = 0; i < __ali.alilen; ++i)
        for(size_t j = i+1; j < __ali.alilen; ++j)
        {
            // create only those positions that we have results for
            bool has_l1norm = false, has_MI = false, has_fro = false;
            if( ( has_l1norm = __res.at("l1norm")[l1norm_i].i == i && __res.at("l1norm")[l1norm_i].j == j ) |
                    ( has_MI = __res.at("MI")[MI_i].i == i && __res.at("MI")[MI_i].j == j ) |
                    ( has_fro = __res.at("fro")[fro_i].i == i && __res.at("fro")[fro_i].j == j ))
            {
                auto_ptr<BioXSD::GeneralSequencePoint> pt1_p(new BioXSD::SequencePoint());
                pt1_p->pos( __resn(i) );
                auto_ptr<BioXSD::GeneralSequencePoint> pt2_p(new BioXSD::SequencePoint());
                pt2_p->pos( __resn(j) );

                auto_ptr<BioXSD::GeneralSequencePosition> pos_p(new BioXSD::SequencePosition());
                pos_p->point().push_back( pt1_p );
                pos_p->point().push_back( pt2_p );

                auto_ptr<BioXSD::occurrence> occ_p(new BioXSD::occurrence());
                occ_p->position( pos_p );

                // lkajan: option: put the scores in <evidence><predicted><score/>+</pr></ev>
                if(has_l1norm) occ_p->score().push_back( _get_bxScore( __res.at("l1norm")[l1norm_i++], "S#l1" ));
                if(has_MI) occ_p->score().push_back( _get_bxScore( __res.at("MI")[MI_i++], "S#MI" ));
                if(has_fro) occ_p->score().push_back( _get_bxScore( __res.at("fro")[fro_i++], "S#fr" ));

                ann_p->occurrence().push_back( occ_p );
            }
        }

    auto_ptr<BioXSD::blockWithOccurrenceReferences> occRef_p(new BioXSD::blockWithOccurrenceReferences()); 
    occRef_p->method().push_back( method_p );
    occRef_p->scoreType().push_back( sc_l1_p );
    occRef_p->scoreType().push_back( sc_MI_p );
    occRef_p->scoreType().push_back( sc_fr_p );
    occRef_p->annotation().push_back( ann_p );

    featRec.blockWithOccurrenceReferences().push_back( occRef_p );

    // serialize
    xml_schema::namespace_infomap map;
    map["fox"].name = "http://rostlab.org/freecontact/output";
    map["fox"].schema = "http://rostlab.org/~lkajan/freecontact_output.xsd";
    map["bx"].name = "http://bioxsd.org/BioXSD-1.1";
    // lkajan: do not give this location, the information is redundant, it is in freecontact.xsd
    //map["bx"].schema = "http://rostlab.org/~lkajan/BioXSD-1.1-lkajan1.xsd";

    fox::contactMap( cout, featRec, map );
    //fox::contactMap( cout, featRec, map, "UTF-8", xml_schema::flags::dont_pretty_print );
}


auto_ptr<BioXSD::Score> _get_bxScore( const fc::contact_t& __ct, const std::string& __scid )
{
    auto_ptr<BioXSD::Score> MI_score_p(new BioXSD::Score( bo::str(bo::format("%g") % __ct.score) ));
    MI_score_p->scoreTypeIdRef(__scid);
    return MI_score_p;
}


// vim:et:ts=4:ai: