# # title: pdb_extract.py # summary: extract a fragment PDB structure given a torsion file # author: Nicholas Fitzkee (nickfitzkee at jhu dot edu) # date: May 21, 2004 # # description: # Search results from the coil library are returned as tarred groups # of torsion angle (tor) files. Because of their size, the structure # of each fragment is not downloaded by default. If users wish to # posess a PDB file for each fragment in the PDB, they must download # the files themselves (using wget and the file list), or they must # generate them from their own local copies of the PDB. This file # facilitates the second approach. # # Given a torsions file and the original PDB file from which it was # derived, this program will construct a minimal PDB file that describes # the structure of the residues in the tor file. It generates no new # data: it merely serves as a filter for existing pdb files # # User Options ############################################################### # PDB_DIR is a string describing the local location of the PDB. This # directory should be flat: by default, all files are expected to reside # in this directory. You can get around this limitation by changing the # GET_PDB_FILE function. PDB_DIR = '/databases/pdb' # GET_PDB_FILE is a function that, given a four-letter PDB ID, and a # one character chain identifier, will return the file name underneath # PDB_DIR that contains the structural data. Two examples are given: # this example (the default) uses the standard PDB naming convention, # only with gzipped files instead of compressed files. GET_PDB_FILE = lambda id, chain: '/pdb%s.ent.gz' % id.lower() # This example could be used for a heirarchy structure, like the PDB # 'divided' directory. It uses the more conventional 'def' syntax # for defining a function #def GET_PDB_FILE (id, chain): # id = id.lower() # return '/%s/pdb%s.ent.Z' % (id[1:3], id) # UNCOMPRESS is the full path to the uncompress binary, if necessary # (used for .Z files on UNIX systems) Mac OSX users should change # this line to /gunzip. It seems as though the OS X version # of uncompress likes to remove files decompressed to the console even # though it's not supposed to. UNCOMPRESS = '/usr/bin/uncompress' # COMPRESS is the full path to the compress binary, if necessary # (used for .Z files on UNIX systems) COMPRESS = '/usr/bin/compress' # End User Options / Begin Main Function ##################################### CMP_NONE = 0 CMP_GZIP = 1 CMP_COMPRESS = 2 import sys, getopt, os def main(argc, argv): short_opts = 'hl:o:zZ' long_opts = ['help', 'list=', 'outdir=', 'use-compress', 'use-gzip'] files = [] outfile = None outdir = lambda x: os.path.split(x)[0] compress = CMP_NONE try: opts, args = getopt.getopt(argv[1:], short_opts, long_opts) except getopt.error: usage() for o, v in opts: if o in ('-l', '--list'): files = files + parse_list_file(v) elif o in ('-o', '--outdir'): dir = v # buggy python: don't use lambda x: v safe_mkdirs(v) outdir = lambda x: dir elif o in ('-z', '--use-gzip'): compress = CMP_GZIP elif o in ('-Z', '--use-compress'): compress = CMP_COMPRESS elif o in ('-h', '--help'): help() else: usage() if not len(files): try: assert(len(args)) files = args except: usage() pdb_extract(files, outdir, compress) # Documentation Section ###################################################### def help(): print """pdb_extract.py - extract pdb fragments given a tor file Given a .tor (or .tor.gz) file, along with the original PDB file used to create the .tor file, this utility constructs the fragment structure described by the tor file, allowing the user to generate structures rather than downloading them from the Coil Library server. usage: python2.2 pdb_extract.py [options] [] ... pdb_extract.py takes the following options: -l | --list Rather than specifying a single file at the command line, process a list of files stored in . contains a list of pathnames and files (blank lines and lines starting with '#' are ignored). -o | --outdir Specify the output directory for a list of files given above. By default, the files are generated in the same location that the tor files are read from. -z | --use-gzip Use gzip compression (.gz) when writing output files. -Z | --use-compress Use compress compression (.Z) when writing output files. If no list file is given, specifies a single file that should be converted. """ sys.exit() def usage(): print "usage: pdb_extract.py [-h] [options] [] ..." sys.exit() # core function ############################################################### def pdb_extract(files, outdir, compress): """pdb_extract(files, outdir, compress) Given a list of torsion files (eg /some/path/1hel_.7.102_.tor.gz), this function determines the appropriate PDB structure file name and extracts the relevant coordinates from that file. It then creates a reduced PDB structure file using the output directory generating function outdir (which takes the input file path and determines the output file path). The file will be compressed according to the enumeration options in compress. """ for path in files: print path dir, file = os.path.split(path) pdb, chain, flen, start, ic = parse_tor_fname(file) pdbfn = '%s/%s' % (PDB_DIR, GET_PDB_FILE(pdb, chain)) chain, seq, res_idx = parse_pcl_tor_file(path) output = [] output_sequences(output, chain, seq) natm, chain, res, idx = output_atoms(output, chain, res_idx, pdbfn) output_termination(output, natm, chain, res, idx) if chain == ' ': chain = '_' tgt_path = outdir(path) if not tgt_path: tgt_path = '.' pdb_out = '%s/%s%s.%i.%i%s.pdb' % (outdir(path), pdb, chain, flen, start, ic) if compress == CMP_GZIP: pdb_out = pdb_out + '.gz' if compress == CMP_COMPRESS: pdb_out = pdb_out + '.Z' flexible_write(pdb_out, output) # utility functions (related to core functionality) ########################### def parse_tor_fname(fn): """parse_tor_fname(fn) Given a torsion file name, return a tuple containing the PDB, chain, fragment index (int) and starting residue (int). """ pdb = fn[0:4] chain = fn[4] toks = fn[6:].split('.') flen = int(toks[0]) ridx = int(toks[1][:-1]) icod = toks[1][-1] return pdb, chain, flen, ridx, icod def parse_pcl_tor_file(tor): """parse_for_file(tor) - parse a torsion file Given the path to a torsion file, this function opens it, and returns the following information as a tuple: the chain ID, a sequence of residues described in the tor file, and a range of residue ID's from the torsion file. """ lines = flexible_read(tor) chain = lines[1][0:2].strip() if not len(chain): chain = ' ' nres = int(lines[1][2:]) res_seq = [] res_idx = {} for n in xrange(nres): ridx = int(lines[2+n][0:8]) rcod = lines[2+n][9] res_idx[(ridx, rcod)] = 1 res_seq.append(lines[2+n][11:14].strip()) return chain, res_seq, res_idx def output_sequences(data, chain, seq): """output_sequences(data, chain, seq) - write SEQRES records to 'file' Given a list of lines in data, this function appends SEQRES records to data such that, when written out using flexible_write, the file will contain the appropriate SEQRES entries. chain identifies the chain corresponding to this entry, and SEQ is the sequence of amino acids. """ nr = len(seq) sn = 1 elem = 0 line = '' fmt = 'SEQRES %2i %1s %4i ' for i in xrange(nr): if elem == 0: line = line + (fmt % (sn, chain, nr)) sn = sn + 1 line = line + ('%3s ' % seq[i]) elem = elem + 1 if elem == 13: line = line + ('\n') data.append(line) line = '' elem = 0 if elem != 0: line = line + ('\n') data.append(line) def output_atoms(data, chain, res_idx, pdbfn): """output_atoms(data, chain, res_idx, pdbfn) - write ATOM records to data Given a list of lines data, a chain ID, residue numbers, and a PDB file name, this function filters the coordinates within the PDB file such that only the residue numbers listed in nums for the given chain are written into data. Returns the number of atoms written. """ lines = flexible_read(pdbfn) natm = 1 fmt = 'ATOM %5i %4s%1s%3s %1s%4s%1s %8.3f%8.3f%8.3f%6.2f%6.2f\n' models = 0 for line in lines: if len(line) >= 6 and line[0:6].upper() == 'MODEL ': if models: break models = 1 continue if len(line) <= 6 or line[0:6].upper() != 'ATOM ': continue toks = parse_pdb_line(line) anum, anam, al, res, chn, idx, ic, x, y, z, occ, b = toks if chn <> chain: continue if not res_idx.has_key((idx, ic)): continue data.append(fmt % (natm, anam, al, res, chain, idx, ic, x, y, z, occ, b)) lastres = res lastidx = idx natm = natm + 1 return natm, chain, lastres, lastidx def output_termination(data, natm, chain, res, idx): """output_termination(data, chain, res, idx) - write TER record to data""" data.append('TER %5i %3s %1s%4i\n' % (natm, res, chain, idx)) data.append('END\n') def parse_pdb_line(line): return (int(line[6:11]), line[12:16], line[16], line[17:21].strip(), line[21], int(line[22:26]), line[26], float(line[30:38]), float(line[38:46]), float(line[46:54]), float(line[54:60]), float(line[60:66])) # helper functions ############################################################ def flexible_read(filename): """flexible_read(file) - open and load the file flexible_read opens a file, loads it into a list of lines, and closes the file. It does some smart probing to see what type of file it is, and can handle .Z and .gz files. Its return value is the list of lines. """ result = None if filename[-3:] == '.gz': import gzip f = gzip.open(filename) result = f.readlines() f.close() elif filename[-2:] == '.Z': import tempfile tfile = tempfile.mktemp() # mkstemp in python2.3 is better os.system("%s -c %s > %s" % (UNCOMPRESS, filename, tfile)) f = open(tfile) result = f.readlines() f.close() os.unlink(tfile) else: f = open(filename) result = f.readlines() f.close() return result def flexible_write(filename, data): """flexible_write(file, data) - write to This function creates a new file containing the given data. It behaves similarly to flexible_read, such that flexible_write('foo', flexible_read('bar')) copies from bar to foo. """ if filename[-3:] == '.gz': import gzip f = gzip.open(filename, 'w') map(f.write, data) f.close() elif filename[-2:] == '.Z': import tempfile tfile = tempfile.mktemp() f = open(tfile, 'w') map(f.write, data) f.close() os.system('%s -c %s > %s' % (COMPRESS, tfile, filename)) os.unlink(tfile) else: f = open(filename, 'w') map(f.write, data) f.close() def safe_mkdirs(dir): """recursive directory make function that doesn't die if dir exists""" if not os.path.exists(dir): os.makedirs(dir, mode=0755) def parse_list_file(filename): """parse a list of filenames""" f = open(filename) result = [] while 1: ln = f.readline() if not len(ln): break ln = ln.strip() if not len(ln) or ln[0] == '#': continue result.append(ln) f.close() return result # Main Function Wrapper ####################################################### try: import psyco except: pass else: try: psyco.full() print 'Using psyco.full() to optimize functions...' except: psyco.bind(pdb_extract) psyco.bind(parse_tor_fname) psyco.bind(parse_pcl_tor_file) psyco.bind(output_sequences) psyco.bind(output_atoms) psyco.bind(output_termination) psyco.bind(parse_pdb_line) psyco.bind(flexible_read) psyco.bind(flexible_write) print 'Using psyco.bind() to optimize functions...' if __name__ == '__main__': main(len(sys.argv), sys.argv)