Skip to content

parsing

Functions:

pmd_structure_from_rdmol

pmd_structure_from_rdmol(rd_mol: Mol)

Generate a parmed structure from an RDKit Mol.

The atom types and charges are extracted from the properties.

:param rd_mol: :return:

Source code in ties/parsing.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def pmd_structure_from_rdmol(rd_mol: rdkit.Chem.Mol):
    """
    Generate a parmed structure from an RDKit Mol.

    The atom types and charges are extracted from the properties.

    :param rd_mol:
    :return:
    """

    # use directly PDB since that is what parmed is always using
    fh = StringIO(rdkit.Chem.MolToPDBBlock(rd_mol))
    parmed_structure = PDBFile.parse(fh, skip_bonds=True)

    # verify that they are the same structures
    for rd_a, pmd_a in zip(rd_mol.GetAtoms(), parmed_structure.atoms):
        assert rd_a.GetAtomicNum() == pmd_a.atomic_number

    # extract the charges and the atom types
    pq_prop_openff = "atom.dprop.PartialCharge"
    if rd_mol.HasProp(pq_prop_openff):
        pqs = list(map(float, rd_mol.GetProp(pq_prop_openff).split()))
        assert len(pqs) == rd_mol.GetNumAtoms()
        for atom, pq in zip(parmed_structure.atoms, pqs):
            atom.charge = pq
    else:
        warnings.warn(
            f"Missing partial charges property ({pq_prop_openff}) from the RDKit Mol"
        )

    # fast fail - avoid using wrong files
    # fixme - temporary, remove after a few weeks
    if rd_mol.HasProp("BCCAtomTypes"):
        raise Exception(
            "Please update your SDF. The property BCCAtomTypes is not allowed anymore. "
            "Use the standardised atom.dprop.GAFFAtomType"
        )

    at_prop = "atom.dprop.GAFFAtomType"
    if rd_mol.HasProp("%s" % at_prop):
        ats = list(rd_mol.GetProp(at_prop).split())
        assert len(ats) == rd_mol.GetNumAtoms()
        for atom, at in zip(parmed_structure.atoms, ats):
            atom.type = at
    else:
        warnings.warn(f"Missing atom types property ({at_prop}) in the RDKit molecule")

    # check if every bond from the RDKit molecule is in the parmed structure
    for rd_bond in rd_mol.GetBonds():
        found = False
        for pmd_bond in parmed_structure.bonds:
            if (
                rd_bond.GetBeginAtomIdx() == pmd_bond.atom1.idx
                and rd_bond.GetEndAtomIdx() == pmd_bond.atom2.idx
            ):
                found = True

        if not found:
            raise Exception(
                "Missing a bond in parmed structure after a conversion from an rdkit molecule"
            )

    # check the opposite, are there any extra bonds?
    for pmd_bond in parmed_structure.bonds:
        found = False
        for rd_bond in rd_mol.GetBonds():
            if (
                rd_bond.GetBeginAtomIdx() == pmd_bond.atom1.idx
                and rd_bond.GetEndAtomIdx() == pmd_bond.atom2.idx
            ):
                found = True
                break

        if not found:
            raise Exception(
                "The new parmed structure contains a bond that could not be found in the original rdkit molecule"
            )

    return parmed_structure

get_atoms_bonds_and_parmed_structure

get_atoms_bonds_and_parmed_structure(filename)

:param filename: :return: (atoms, bonds, parmed_structure)

Source code in ties/parsing.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def get_atoms_bonds_and_parmed_structure(filename):
    """

    :param filename:
    :return: (atoms, bonds, parmed_structure)
    """

    # load if .sdf
    if Path(filename).suffix.lower() == ".sdf":
        warnings.warn(
            f"Reading .sdf ({filename}) via RDKit - using only the first conformer. "
        )
        mol = rdkit.Chem.SDMolSupplier(filename, removeHs=False)[0]
        parmed_structure = pmd_structure_from_rdmol(mol)
    else:
        parmed_structure: parmed.Structure = parmed.load_file(
            str(filename), structure=True
        )
        correct_mol2_gaff_atoms(parmed_structure)

    atoms, bonds = get_atoms_bonds_from_pmd_structure(parmed_structure)

    return atoms, bonds, parmed_structure

correct_mol2_gaff_atoms

correct_mol2_gaff_atoms(parmed_structure: Structure)

The mol2 gaff atom types are not understood by Parmed.

e.g. "cl" are not read as chlorine, but as carbon

This functions corrects them by modifying the input structure.

Source code in ties/parsing.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def correct_mol2_gaff_atoms(parmed_structure: parmed.Structure):
    """
    The mol2 gaff atom types are not understood by Parmed.

    e.g. "cl" are not read as chlorine, but as carbon

    This functions corrects them by modifying the input structure.
    """

    if not is_gaff_atoms(parmed_structure):
        return

    for atom in parmed_structure.atoms:
        element = periodic_table.GetAtomicNumber(gaff_element_map[atom.type.upper()])
        if element != atom.element:
            # correct the atomic number
            atom.element = element
            # check that it updated the element name
            assert atom.element_name == gaff_element_map[atom.type.upper()]

get_atoms_bonds_from_pmd_structure

get_atoms_bonds_from_pmd_structure(parmed_structure: Structure)

convert the Parmed atoms into Atom objects.

Source code in ties/parsing.py
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def get_atoms_bonds_from_pmd_structure(parmed_structure: parmed.Structure):
    """
    # convert the Parmed atoms into Atom objects.
    """
    atoms = []
    for parmed_atom in parmed_structure.atoms:
        atom_type = parmed_atom.type

        if not atom_type:
            # extract the element symbol from the atomic number
            atom_type = periodic_table.GetElementSymbol(parmed_atom.atomic_number)

        # atom type might be empty if
        if not atom_type:
            # use the atom name as the atom type, e.g. C7
            atom_type = parmed_atom.name

        try:
            atom = Atom(
                name=parmed_atom.name,
                atom_type=atom_type,
                charge=parmed_atom.charge,
            )
        except AttributeError:
            # most likely the charges were missing, manually set the charges to 0
            atom = Atom(
                name=parmed_atom.name,
                atom_type=atom_type,
                charge=0.0,
            )
            logger.warning(
                "One of the input files is missing charges. Setting the charge to 0"
            )
        atom.id = parmed_atom.idx
        atom.position = [parmed_atom.xx, parmed_atom.xy, parmed_atom.xz]
        atom.resname = parmed_atom.residue.name
        atoms.append(atom)

    bonds = [(b.atom1.idx, b.atom2.idx, b.order) for b in parmed_structure.bonds]

    for from_idx, to_idx, bond_type in bonds:
        atoms[from_idx].bind_to(atoms[to_idx], bond_type)

    # verify: the ids and the order in the list is the same
    for i, atom in enumerate(atoms):
        assert i == atom.id

    return atoms, bonds

do_atom_names_have_simple_format

do_atom_names_have_simple_format(names)

Check if the atom name is followed by a number, e.g. "C15" Note that the full atom name cannot be more than 4 characters. This is because the PDB format does not allow for more characters which can lead to inconsistencies.

:param names: a list of atom names :type names: list[str] :return True if they all follow the correct format.

Source code in ties/parsing.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def do_atom_names_have_simple_format(names):
    """
    Check if the atom name is followed by a number, e.g. "C15"
    Note that the full atom name cannot be more than 4 characters.
    This is because the PDB format does not allow for more
    characters which can lead to inconsistencies.

    :param names: a list of atom names
    :type names: list[str]
    :return True if they all follow the correct format.
    """
    for name in names:
        # cannot exceed 4 characters
        if len(name) > 4:
            return False

        # count letters before any digit
        letter_count = 0
        for letter in name:
            if not letter.isalpha():
                break

            letter_count += 1

        # at least one character
        if letter_count == 0:
            return False

        # extrac the number suffix
        atom_number = name[letter_count:]
        try:
            int(atom_number)
        except Exception:
            return False

    return True