votca 2024.2-dev
Loading...
Searching...
No Matches
pdbreader.cc
Go to the documentation of this file.
1/*
2 * Copyright 2009-2020 The VOTCA Development Team (http://www.votca.org)
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
18// Standard includes
19#include <sstream>
20#include <unordered_map>
21#include <vector>
22
23// Third party includes
24#include <boost/algorithm/string.hpp>
25#include <boost/lexical_cast.hpp>
26
27// VOTCA includes
28#include <votca/tools/getline.h>
29
30// Local private VOTCA includes
31#include "pdbreader.h"
32
33namespace votca {
34namespace csg {
35using namespace boost;
36using namespace std;
37
38bool PDBReader::ReadTopology(string file, Topology &top) {
39 topology_ = true;
40 top.Cleanup();
41
42 fl_.open(file);
43 if (!fl_.is_open()) {
44 throw std::ios_base::failure("Error on open topology file: " + file);
45 }
46
47 NextFrame(top);
48 fl_.close();
49
50 return true;
51}
52
53bool PDBReader::Open(const string &file) {
54
55 fl_.open(file);
56 if (!fl_.is_open()) {
57 throw std::ios_base::failure("Error on open trajectory file: " + file);
58 }
59
60 return true;
61}
62
63void PDBReader::Close() { fl_.close(); }
64
66 topology_ = false;
67 return NextFrame(top);
68}
69
71
72 cout << "The VOTCA pdb reader attempts to follow the official format: "
73 << endl;
74 cout << "http://www.wwpdb.org/documentation/file-format" << endl;
75 cout << endl;
76 cout << "NOTE the element symbol is important for associating a mass with "
77 << endl;
78 cout << "each atom, if no element symbol is specified the atom name will be"
79 << endl;
80 cout << "used and assumed to be an element symbol." << endl;
81 cout << "The pdb file format does not support pseudo atoms such as a CH3 "
82 << endl;
83 cout << "and thus VOTCA cannot read such atoms in when using a .pdb file."
84 << endl;
85 cout << "If you need to use pseudo atoms consider using a different file "
86 << endl;
87 cout << "format such as the lammps data format." << endl;
88
89 string line;
90 tools::Elements elements;
91 // Two column vector for storing all bonds
92 // 1 - id of first atom
93 // 2 - id of second atom
94 vector<vector<Index>> bond_pairs;
95 // Store pointers to every bead
96 // WARNING we are assuming in the bead_Eigen::Vector3d that the indices of the
97 // beads
98 // correspond to the order in which they are read in. As in the first
99 // bead read in will be at index 0, etc...
100 vector<Bead *> bead_vec;
102 // Read in information from .pdb file
104 Index bead_count = 0;
105 while (tools::getline(fl_, line)) {
106 if (tools::wildcmp("CRYST1*", line)) {
107 string a, b, c, alpha, beta, gamma;
108 try {
109 // 1 - 6 Record name "CRYST1"
110 a = string(line, (7 - 1), 9);
111 // 7 - 15 Real(9.3) a (Angstroms)
112 b = string(line, (16 - 1), 9);
113 // 16 - 24 Real(9.3) b (Angstroms)
114 c = string(line, (25 - 1), 9);
115 // 25 - 33 Real(9.3) c (Angstroms)
116 alpha = string(line, (34 - 1), 7);
117 // 34 - 40 Real(7.2) alpha (degrees)
118 beta = string(line, (41 - 1), 7);
119 // 41 - 47 Real(7.2) beta (degrees)
120 gamma = string(line, (48 - 1), 7);
121 // 48 - 54 Real(7.2) gamma (degrees)
122 // 56 - 66 LString Space group
123 // 67 - 70 Integer Z value
124 } catch (std::out_of_range &) {
125 throw std::runtime_error("Misformated pdb file in CRYST1 line");
126 }
127 boost::algorithm::trim(a);
128 boost::algorithm::trim(b);
129 boost::algorithm::trim(c);
130 boost::algorithm::trim(alpha);
131 boost::algorithm::trim(beta);
132 boost::algorithm::trim(gamma);
133 if ((!tools::wildcmp("90*", alpha)) || (!tools::wildcmp("90*", beta)) ||
134 (!tools::wildcmp("90*", gamma))) {
135 throw std::runtime_error(
136 "Non cubical box in pdb file not implemented, yet!");
137 }
138 double aa = stod(a) / 10.0;
139 double bb = stod(b) / 10.0;
140 double cc = stod(c) / 10.0;
141 Eigen::Matrix3d box = Eigen::Matrix3d::Zero();
142 box.diagonal() = Eigen::Vector3d(aa, bb, cc);
143 top.setBox(box);
144 }
145 // Only read the CONECT keyword if the topology is set too true
146 if (topology_ && tools::wildcmp("CONECT*", line)) {
147 vector<string> bonded_atms;
148 string atm1;
149 try {
150 // If the CONECT keyword is found then there must be at least
151 // two atom identifiers, more than that is optional.
152
153 stringstream ss;
154 ss << string(line.substr(6));
155 // 1 - 6 Record name "CONECT"
156 // 11 - 7 Real(5) atm1 (ID)
157 ss >> atm1;
158 string temp_atm;
159 ss >> temp_atm;
160 bonded_atms.push_back(temp_atm);
161 ss >> temp_atm;
162 // Here we have taken a less rigorous approach to the .pdb files
163 // we do not care at this point how large the ids of the atoms are
164 // they can be greater than 99,999 with this approach.
165 while (ss) {
166 bonded_atms.push_back(temp_atm);
167 ss >> temp_atm;
168 }
169 } catch (std::out_of_range &) {
170 throw std::runtime_error("Misformated pdb file in CONECT line\n" +
171 line);
172 }
173
174 vector<Index> row(2);
175 boost::algorithm::trim(atm1);
176 Index at1 = boost::lexical_cast<Index>(atm1);
177 row.at(0) = at1;
178
179 for (auto &bonded_atm : bonded_atms) {
180 Index at2 = boost::lexical_cast<Index>(bonded_atm);
181 row.at(1) = at2;
182 // Because every bond will be counted twice in a .pdb file
183 // we will only add bonds where the id (atm1) is less than the
184 // bonded_atm
185 if (at1 < at2) {
186 bond_pairs.push_back(row);
187 }
188 }
189 }
190
191 if (tools::wildcmp("ATOM*", line) || tools::wildcmp("HETATM*", line)) {
192
193 // according to PDB format
194 string x, y, z, resNum, resName, atName;
195 string charge, elem_sym;
196 // string atNum;
197 try {
198 /* Some pdb don't include all this, read only what we really need*/
199 /* leave this here in case we need more later*/
200
201 // str , "ATOM", "HETATM"
202 // string recType (line,( 1-1),6);
203 // Index , Atom serial number
204 // atNum = string(line,( 7-1),6);
205 // str , Atom name
206 atName = string(line, (13 - 1), 4);
207 // char , Alternate location indicator
208 // string atAltLoc (line,(17-1),1);
209 // str , Residue name
210 resName = string(line, (18 - 1), 3);
211 // char , Chain identifier
212 // string chainID (line,(22-1),1);
213 // Index , Residue sequence number
214 resNum = string(line, (23 - 1), 4);
215 // char , Code for insertion of res
216 // string atICode (line,(27-1),1);
217 // float 8.3 , x
218 x = string(line, (31 - 1), 8);
219 // float 8.3 , y
220 y = string(line, (39 - 1), 8);
221 // float 8.3 , z
222 z = string(line, (47 - 1), 8);
223 // float 6.2 , Occupancy
224 // string atOccup (line,(55-1),6);
225 // float 6.2 , Temperature factor
226 // string atTFactor (line,(61-1),6);
227 // str , Segment identifier
228 // string segID (line,(73-1),4);
229 // str , Element symbol
230 elem_sym = string(line, (77 - 1), 2);
231 // str , Charge on the atom
232 charge = string(line, (79 - 1), 2);
233 } catch (std::out_of_range &) {
234 string err_msg = "Misformated pdb file in atom line # " +
235 boost::lexical_cast<string>(bead_count) +
236 "\n the correct pdb file format requires 80 "
237 "characters in width (spaces matter). Furthermore, " +
238 "\n to read the topology in from a .pdb file the "
239 "following attributes must be " +
240 "\n specified: "
241 " " +
242 "\n Atom Name, Residue Name, Residue Number, x, y, z, "
243 "element symbol"
244 "charge (optional) \n";
245 throw std::runtime_error(err_msg);
246 }
247 boost::algorithm::trim(atName);
248 boost::algorithm::trim(resName);
249 boost::algorithm::trim(resNum);
250 boost::algorithm::trim(x);
251 boost::algorithm::trim(y);
252 boost::algorithm::trim(z);
253 boost::algorithm::trim(elem_sym);
254 boost::algorithm::trim(charge);
255
256 bead_count++;
257
258 Bead *b;
259 // Only read the CONECT keyword if the topology is set too true
260 if (topology_) {
261 Index resnr;
262 try {
263 resnr = boost::lexical_cast<Index>(resNum);
264 } catch (bad_lexical_cast &) {
265 throw std::runtime_error(
266 "Cannot convert resNum='" + resNum +
267 "' to int, that usallly means: misformated pdb file");
268 }
269
270 if (resName == "") {
271 cout << "WARNING no resname specified, assigning name to: UNK"
272 << endl;
273 resName = "UNK";
274 }
275
276 if (resnr < 1) {
277 throw std::runtime_error("Misformated pdb file, resnr has to be > 0");
278 }
279 // TODO: fix the case that resnr is not in ascending order
280 if (resnr > top.ResidueCount()) {
281 while ((resnr - 1) > top.ResidueCount()) { // pdb resnr should start
282 // with 1 but accept
283 // sloppy files
284
285 // create dummy residue, hopefully it will never show
286
287 top.CreateResidue(resName);
288 cout << "Warning: residue numbers not continuous, create dummy "
289 "residue with residue number "
290 << top.ResidueCount() << endl;
291 }
292 top.CreateResidue(resName);
293 }
294 // This is not correct, but still better than no type at all!
295 if (!top.BeadTypeExist(atName)) {
296 top.RegisterBeadType(atName);
297 }
298
299 // Determine if the charge has been provided in the .pdb file or if we
300 // will be assuming it is 0
301 double ch = 0;
302 if (charge != "") {
303 ch = stod(charge);
304 } else {
305 cout << "WARNING no charge was specified for " << endl;
306 cout << line << endl;
307 cout << "Assuming a charge of 0" << endl;
308 }
309
310 if (elem_sym == "") {
311 cout << "WARNING no element was specified, assuming atom name is "
312 << endl;
313 cout << "an element symbol: " << atName << endl;
314 if (elements.isElement(atName)) {
315 elem_sym = atName;
316 } else {
317 throw std::runtime_error(
318 "Atom name is not an element symbol, so substitution fails, "
319 "the element is needed in order to resolve the mass.");
320 }
321 }
322
323 // CreateBead takes 6 parameters in the following order
324 // 1 - symmetry of the bead
325 // 2 - name of the bead (string)
326 // 3 - bead type (BeadType *)
327 // 4 - residue number (Index)
328 // 5 - mass (double)
329 // 6 - charge (double)
330 //
331 // res -1 as internal number starts with 0
332 b = top.CreateBead(Bead::spherical, atName, atName, resnr - 1,
333 elements.getMass(elem_sym), ch);
334 } else {
335 b = top.getBead(bead_count - 1);
336 }
337 // convert to nm from A
338 b->setPos(
339 Eigen::Vector3d(stod(x) / 10.0, stod(y) / 10.0, stod(z) / 10.0));
340
341 bead_vec.push_back(b);
342 }
343
344 if ((line == "ENDMDL") || (line == "END") || (fl_.eof())) {
345 break;
346 }
347 }
348
349 if (!topology_ && (bead_count > 0) && bead_count != top.BeadCount()) {
350 throw std::runtime_error(
351 "number of beads in topology and trajectory differ");
352 }
353
355 // Sort data and determine atom structure, connect with top (molecules, bonds)
357
358 // Extra processing is done if the file is a topology file, in which case the
359 // atoms must be sorted into molecules and the bond interactions recorded
360 if (topology_) {
361 // Now we need to add the bond pairs
362 // WARNING We are assuming the atom ids are contiguous with no gaps
363
364 // First Index - is the index of the atom
365 // Second Index - is the index of the molecule
366 map<Index, Index> atm_molecule;
367
368 // First Index - is the index of the molecule
369 // list<Index> - is a list of the atoms in the molecule
370 unordered_map<Index, list<Index>> molecule_atms;
371
372 // Keep track of the number of molecules we have created through an index
373 Index mol_index = 0;
374
375 // Cycle through all bonds
376 for (auto &bond_pair : bond_pairs) {
377
378 Index atm_id1 = bond_pair.at(0);
379 Index atm_id2 = bond_pair.at(1);
380 // Check to see if either atm referred to in the bond is already
381 // attached to a molecule
382 auto mol_iter1 = atm_molecule.find(atm_id1);
383 auto mol_iter2 = atm_molecule.find(atm_id2);
384
385 // This means neither atom is attached to a molecule
386 if (mol_iter1 == atm_molecule.end() && mol_iter2 == atm_molecule.end()) {
387 // We are going to create a new row for a new molecule
388 list<Index> atms_in_mol;
389 atms_in_mol.push_back(atm_id1);
390 atms_in_mol.push_back(atm_id2);
391 molecule_atms[mol_index] = atms_in_mol;
392 // Associate atm1 and atm2 with the molecule index
393 atm_molecule[atm_id1] = mol_index;
394 atm_molecule[atm_id2] = mol_index;
395 // Increment the molecule index
396 mol_index++;
397
398 // This means only atm2 is attached to a molecule
399 } else if (mol_iter1 == atm_molecule.end()) {
400 // Add atm1 to the molecule that contains atm2
401 molecule_atms[mol_iter2->second].push_back(atm_id1);
402 // Associate atm1 with the molecule it is now part of
403 atm_molecule[atm_id1] = mol_iter2->second;
404
405 // This means only atm1 is attached to a molecule
406 } else if (mol_iter2 == atm_molecule.end()) {
407 // Add atm2 to the molecule that contains atm1
408 molecule_atms[mol_iter1->second].push_back(atm_id2);
409 // Associate atm1 with the molecule it is now part of
410 atm_molecule[atm_id2] = mol_iter1->second;
411
412 } else if (mol_iter1 != mol_iter2) {
413 // This means both atm1 and atm2 are attached to a molecule
414 // But if they are already attached to the same molecule there is
415 // nothing else to be done.
416 Index chosen_mol;
417 Index obsolete_mol;
418 // We will merge the atms to the molecule with the smallest index
419 if (mol_iter1->second < mol_iter2->second) {
420 chosen_mol = mol_iter1->second;
421 obsolete_mol = mol_iter2->second;
422 } else {
423 chosen_mol = mol_iter2->second;
424 obsolete_mol = mol_iter1->second;
425 }
426
427 // Now we will proceed to cycle through the atms that were in the now
428 // obsolete molecule and make sure they are pointing to the new molecule
429 for (Index &atm_temp : molecule_atms[obsolete_mol]) {
430
431 atm_molecule[atm_temp] = chosen_mol;
432 }
433
434 // Splicing will remove atoms from the now obsolete molecule and place
435 // them on the chosen molecule.
436 molecule_atms[chosen_mol].splice(molecule_atms[chosen_mol].end(),
437 molecule_atms[obsolete_mol]);
438
439 // Finally we will clear out the record of the obsolete molecule
440 molecule_atms.erase(obsolete_mol);
441 }
442 }
443#ifndef NDEBUG
444 cerr << "Consistency check for pdbreader" << endl;
445 Index i = 0;
446 for (auto lis = molecule_atms.begin(); lis != molecule_atms.end(); lis++) {
447 cerr << "Molecule " << i << endl;
448 cerr << "Atoms: ";
449 for (auto atm_ind = lis->second.begin(); atm_ind != lis->second.end();
450 atm_ind++) {
451 cerr << *atm_ind << " ";
452 }
453 cerr << endl;
454 i++;
455 }
456 cerr << endl;
457#endif
458 // Now that we know which interactions belong to which molecules we can:
459 // 1 Add the molecules
460 // 2 Add the bond interactions
461
462 // Molecule map
463 // First Index - is the index of the molecule
464 // Molecule* - is a pointer to the Molecule object
465 map<Index, Molecule *> mol_map;
466
467 // Used to reindex the molecules so that they start at 0 and progress
468 // with out gaps in their ids.
469 // First Index - is the index of the old molecule
470 // Second Index - is the new index
471 map<Index, Index> mol_reInd_map;
472
473 Index ind = 0;
474 for (auto mol = molecule_atms.begin(); mol != molecule_atms.end(); mol++) {
475
476 string mol_name = "PDB Molecule " + boost::lexical_cast<string>(ind);
477
478 Molecule *mi = top.CreateMolecule(mol_name);
479 mol_map[mol->first] = mi;
480 mol_reInd_map[mol->first] = ind;
481
482 // Add all the atoms to the appropriate molecule object
483 list<Index> atm_list = molecule_atms[mol->first];
484 for (Index &atm_temp : atm_list) {
485
486 string residuename = "DUM";
487 mi->AddBead(bead_vec.at(atm_temp - 1), residuename);
488 }
489 ind++;
490 }
491
492 Index bond_indx = 0;
493 // Cyle through the bonds and add them to the appropriate molecule
494 for (auto &bond_pair : bond_pairs) {
495
496 Index atm_id1 = bond_pair.at(0);
497 Index atm_id2 = bond_pair.at(1);
498 // Should be able to just look at one of the atoms the bond is attached
499 // too because the other will also be attached to the same molecule.
500 Index mol_ind = atm_molecule[atm_id1];
501 Molecule *mi = mol_map[mol_ind];
502 // Grab the id of the bead associated with the atom
503 // It may be the case that the atom id's and bead id's are different
504 Index bead_id1 = bead_vec.at(atm_id1 - 1)->getId();
505 Index bead_id2 = bead_vec.at(atm_id2 - 1)->getId();
506 Interaction *ic = new IBond(bead_id1, bead_id2);
507 ic->setGroup("BONDS");
508 ic->setIndex(bond_indx);
509 bond_indx++;
510 ic->setMolecule(mol_reInd_map[mol_ind]);
511 top.AddBondedInteraction(ic);
512 mi->AddInteraction(ic);
513 }
514
515 // Finally we want to build an exclusion matrix
516 top.RebuildExclusions();
517 }
518
519 return !fl_.eof();
520}
521} // namespace csg
522} // namespace votca
virtual void setPos(const Eigen::Vector3d &bead_position)
Definition basebead.h:161
information about a bead
Definition bead.h:50
bond interaction
base class for all interactions
Definition interaction.h:40
void setGroup(const std::string &group)
Definition interaction.h:49
void setIndex(const Index &index)
Definition interaction.h:67
void setMolecule(const Index &mol)
Definition interaction.h:76
information about molecules
Definition molecule.h:45
void AddBead(Bead *bead, const std::string &name)
Add a bead to the molecule.
Definition molecule.cc:29
void AddInteraction(Interaction *ic)
Add an interaction to the molecule.
Definition molecule.h:77
bool Open(const std::string &file) override
open a trajectory file
Definition pdbreader.cc:53
void Close() override
Definition pdbreader.cc:63
std::ifstream fl_
Definition pdbreader.h:63
bool NextFrame(Topology &top) override
read in the next frame
Definition pdbreader.cc:70
bool FirstFrame(Topology &top) override
read in the first frame
Definition pdbreader.cc:65
bool ReadTopology(std::string file, Topology &top) override
open a topology file
Definition pdbreader.cc:38
topology of the whole system
Definition topology.h:81
Residue & CreateResidue(std::string name)
Create a new resiude.
Definition topology.h:462
void setBox(const Eigen::Matrix3d &box, BoundaryCondition::eBoxtype boxtype=BoundaryCondition::typeAuto)
Definition topology.h:272
void Cleanup()
Cleans up all the stored data.
Definition topology.cc:49
Index ResidueCount() const
Definition topology.h:156
bool BeadTypeExist(std::string type) const
Determine if a bead type exists.
Definition topology.cc:210
Bead * CreateBead(Bead::Symmetry symmetry, std::string name, std::string type, Index resnr, double m, double q)
Creates a new Bead.
Definition topology.h:441
void AddBondedInteraction(Interaction *ic)
Definition topology.cc:188
Index BeadCount() const
Definition topology.h:150
void RegisterBeadType(std::string type)
Register the bead type with the topology object.
Definition topology.cc:214
Molecule * CreateMolecule(std::string name)
Creates a new molecule.
Definition topology.h:449
Bead * getBead(const Index i)
Returns a pointer to the bead with index i.
Definition topology.h:227
information about an element
Definition elements.h:42
double getMass(std::string name)
Returns the mass of each atom in a.u.
Definition elements.cc:62
bool isElement(std::string name)
Determine if the name is a recognized element symbol or name.
Definition elements.cc:32
STL namespace.
int wildcmp(const char *wild, const char *string)
Definition tokenizer.cc:28
std::istream & getline(std::istream &is, std::string &str)
Wrapper for a getline function.
Definition getline.h:35
base class for all analysis tools
Definition basebead.h:33
Eigen::Index Index
Definition types.h:26