generate.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, Aug 20, 17:04

generate.py
View Options

	#!/usr/bin/env python
	"""
	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing author: Oliver Henrich (University of Strathclyde, Glasgow)
	------------------------------------------------------------------------- */
	"""


	"""
	Import basic modules
	"""
	import sys, os, timeit

	from timeit import default_timer as timer
	start_time = timer()
	"""
	Try to import numpy; if failed, import a local version mynumpy
	which needs to be provided
	"""
	try:
	import numpy as np
	except:
	print >> sys.stderr, "numpy not found. Exiting."
	sys.exit(1)

	"""
	Check that the required arguments (box offset and size in simulation units
	and the sequence file were provided
	"""
	try:
	box_offset = float(sys.argv[1])
	box_length = float(sys.argv[2])
	infile = sys.argv[3]
	except:
	print >> sys.stderr, "Usage: %s <%s> <%s> <%s>" % (sys.argv[0], \
	"box offset", "box length", "file with sequences")
	sys.exit(1)
	box = np.array ([box_length, box_length, box_length])

	"""
	Try to open the file and fail gracefully if file cannot be opened
	"""
	try:
	inp = open (infile, 'r')
	inp.close()
	except:
	print >> sys.stderr, "Could not open file '%s' for reading. \
	Aborting." % infile
	sys.exit(2)

	# return parts of a string
	def partition(s, d):
	if d in s:
	sp = s.split(d, 1)
	return sp[0], d, sp[1]
	else:
	return s, "", ""

	"""
	Define the model constants
	"""
	# set model constants
	PI = np.pi
	POS_BASE = 0.4
	POS_BACK = -0.4
	EXCL_RC1 = 0.711879214356
	EXCL_RC2 = 0.335388426126
	EXCL_RC3 = 0.52329943261

	"""
	Define auxillary variables for the construction of a helix
	"""
	# center of the double strand
	CM_CENTER_DS = POS_BASE + 0.2

	# ideal distance between base sites of two nucleotides
	# which are to be base paired in a duplex
	BASE_BASE = 0.3897628551303122

	# cutoff distance for overlap check
	RC2 = 16

	# squares of the excluded volume distances for overlap check
	RC2_BACK = EXCL_RC1**2
	RC2_BASE = EXCL_RC2**2
	RC2_BACK_BASE = EXCL_RC3**2

	# enumeration to translate from letters to numbers and vice versa
	number_to_base = {1 : 'A', 2 : 'C', 3 : 'G', 4 : 'T'}
	base_to_number = {'A' : 1, 'a' : 1, 'C' : 2, 'c' : 2,
	'G' : 3, 'g' : 3, 'T' : 4, 't' : 4}

	# auxillary arrays
	positions = []
	a1s = []
	a3s = []
	quaternions = []

	newpositions = []
	newa1s = []
	newa3s = []

	basetype = []
	strandnum = []

	bonds = []

	"""
	Convert local body frame to quaternion DOF
	"""
	def exyz_to_quat (mya1, mya3):

	mya2 = np.cross(mya3, mya1)
	myquat = [1,0,0,0]

	q0sq = 0.25 * (mya1[0] + mya2[1] + mya3[2] + 1.0)
	q1sq = q0sq - 0.5 * (mya2[1] + mya3[2])
	q2sq = q0sq - 0.5 * (mya1[0] + mya3[2])
	q3sq = q0sq - 0.5 * (mya1[0] + mya2[1])

	# some component must be greater than 1/4 since they sum to 1
	# compute other components from it

	if q0sq >= 0.25:
	myquat[0] = np.sqrt(q0sq)
	myquat[1] = (mya2[2] - mya3[1]) / (4.0*myquat[0])
	myquat[2] = (mya3[0] - mya1[2]) / (4.0*myquat[0])
	myquat[3] = (mya1[1] - mya2[0]) / (4.0*myquat[0])
	elif q1sq >= 0.25:
	myquat[1] = np.sqrt(q1sq)
	myquat[0] = (mya2[2] - mya3[1]) / (4.0*myquat[1])
	myquat[2] = (mya2[0] + mya1[1]) / (4.0*myquat[1])
	myquat[3] = (mya1[2] + mya3[0]) / (4.0*myquat[1])
	elif q2sq >= 0.25:
	myquat[2] = np.sqrt(q2sq)
	myquat[0] = (mya3[0] - mya1[2]) / (4.0*myquat[2])
	myquat[1] = (mya2[0] + mya1[1]) / (4.0*myquat[2])
	myquat[3] = (mya3[1] + mya2[2]) / (4.0*myquat[2])
	elif q3sq >= 0.25:
	myquat[3] = np.sqrt(q3sq)
	myquat[0] = (mya1[1] - mya2[0]) / (4.0*myquat[3])
	myquat[1] = (mya3[0] + mya1[2]) / (4.0*myquat[3])
	myquat[2] = (mya3[1] + mya2[2]) / (4.0*myquat[3])

	norm = 1.0/np.sqrt(myquat[0]myquat[0] + myquat[1]myquat[1] + \
	myquat[2]myquat[2] + myquat[3]myquat[3])
	myquat[0] *= norm
	myquat[1] *= norm
	myquat[2] *= norm
	myquat[3] *= norm

	return np.array([myquat[0],myquat[1],myquat[2],myquat[3]])

	"""
	Adds a strand to the system by appending it to the array of previous strands
	"""
	def add_strands (mynewpositions, mynewa1s, mynewa3s):
	overlap = False

	# This is a simple check for each of the particles where for previously
	# placed particles i we check whether it overlaps with any of the
	# newly created particles j

	print >> sys.stdout, "## Checking for overlaps"

	for i in xrange(len(positions)):

	p = positions[i]
	pa1 = a1s[i]

	for j in xrange (len(mynewpositions)):

	q = mynewpositions[j]
	qa1 = mynewa1s[j]

	# skip particles that are anyway too far away
	dr = p - q
	dr -= box * np.rint (dr / box)
	if np.dot(dr, dr) > RC2:
	continue

	# base site and backbone site of the two particles
	p_pos_back = p + pa1 * POS_BACK
	p_pos_base = p + pa1 * POS_BASE
	q_pos_back = q + qa1 * POS_BACK
	q_pos_base = q + qa1 * POS_BASE

	# check for no overlap between the two backbone sites
	dr = p_pos_back - q_pos_back
	dr -= box * np.rint (dr / box)
	if np.dot(dr, dr) < RC2_BACK:
	overlap = True

	# check for no overlap between the two base sites
	dr = p_pos_base - q_pos_base
	dr -= box * np.rint (dr / box)
	if np.dot(dr, dr) < RC2_BASE:
	overlap = True

	# check for no overlap between backbone site of particle p
	# with base site of particle q
	dr = p_pos_back - q_pos_base
	dr -= box * np.rint (dr / box)
	if np.dot(dr, dr) < RC2_BACK_BASE:
	overlap = True

	# check for no overlap between base site of particle p and
	# backbone site of particle q
	dr = p_pos_base - q_pos_back
	dr -= box * np.rint (dr / box)
	if np.dot(dr, dr) < RC2_BACK_BASE:
	overlap = True

	# exit if there is an overlap
	if overlap:
	return False

	# append to the existing list if no overlap is found
	if not overlap:

	for p in mynewpositions:
	positions.append(p)
	for p in mynewa1s:
	a1s.append (p)
	for p in mynewa3s:
	a3s.append (p)
	# calculate quaternion from local body frame and append
	for ia in xrange(len(mynewpositions)):
	mynewquaternions = exyz_to_quat(mynewa1s[ia],mynewa3s[ia])
	quaternions.append(mynewquaternions)

	return True


	"""
	Returns the rotation matrix defined by an axis and angle
	"""
	def get_rotation_matrix(axis, anglest):
	# The argument anglest can be either an angle in radiants
	# (accepted types are float, int or np.float64 or np.float64)
	# or a tuple [angle, units] where angle is a number and
	# units is a string. It tells the routine whether to use degrees,
	# radiants (the default) or base pairs turns.
	if not isinstance (anglest, (np.float64, np.float32, float, int)):
	if len(anglest) > 1:
	if anglest[1] in ["degrees", "deg", "o"]:
	#angle = np.deg2rad (anglest[0])
	angle = (np.pi / 180.) * (anglest[0])
	elif anglest[1] in ["bp"]:
	angle = int(anglest[0]) * (np.pi / 180.) * (35.9)
	else:
	angle = float(anglest[0])
	else:
	angle = float(anglest[0])
	else:
	angle = float(anglest) # in degrees (?)

	axis = np.array(axis)
	axis /= np.sqrt(np.dot(axis, axis))

	ct = np.cos(angle)
	st = np.sin(angle)
	olc = 1. - ct
	x, y, z = axis

	return np.array([[olcxx+ct, olcxy-stz, olcxz+sty],
	[olcxy+stz, olcyy+ct, olcyz-stx],
	[olcxz-sty, olcyz+stx, olczz+ct]])

	"""
	Generates the position and orientation vectors of a
	(single or double) strand from a sequence string
	"""
	def generate_strand(bp, sequence=None, start_pos=np.array([0, 0, 0]), \
	dir=np.array([0, 0, 1]), perp=False, double=True, rot=0.):
	# generate empty arrays
	mynewpositions, mynewa1s, mynewa3s = [], [], []

	# cast the provided start_pos array into a numpy array
	start_pos = np.array(start_pos, dtype=float)

	# overall direction of the helix
	dir = np.array(dir, dtype=float)
	if sequence == None:
	sequence = np.random.randint(1, 5, bp)

	# the elseif here is most likely redundant
	elif len(sequence) != bp:
	n = bp - len(sequence)
	sequence += np.random.randint(1, 5, n)
	print >> sys.stderr, "sequence is too short, adding %d random bases" % n

	# normalize direction
	dir_norm = np.sqrt(np.dot(dir,dir))
	if dir_norm < 1e-10:
	print >> sys.stderr, "direction must be a valid vector, \
	defaulting to (0, 0, 1)"
	dir = np.array([0, 0, 1])
	else: dir /= dir_norm

	# find a vector orthogonal to dir to act as helix direction,
	# if not provided switch off random orientation
	if perp is None or perp is False:
	v1 = np.random.random_sample(3)
	v1 -= dir * (np.dot(dir, v1))
	v1 /= np.sqrt(sum(v1*v1))
	else:
	v1 = perp;

	# generate rotational matrix representing the overall rotation of the helix
	R0 = get_rotation_matrix(dir, rot)

	# rotation matrix corresponding to one step along the helix
	R = get_rotation_matrix(dir, [1, "bp"])

	# set the vector a1 (backbone to base) to v1
	a1 = v1

	# apply the global rotation to a1
	a1 = np.dot(R0, a1)

	# set the position of the fist backbone site to start_pos
	rb = np.array(start_pos)

	# set a3 to the direction of the helix
	a3 = dir
	for i in range(bp):
	# work out the position of the centre of mass of the nucleotide
	rcdm = rb - CM_CENTER_DS * a1

	# append to newpositions
	mynewpositions.append(rcdm)
	mynewa1s.append(a1)
	mynewa3s.append(a3)

	# if we are not at the end of the helix, we work out a1 and rb for the
	# next nucleotide along the helix
	if i != bp - 1:
	a1 = np.dot(R, a1)
	rb += a3 * BASE_BASE

	# if we are working on a double strand, we do a cycle similar
	# to the previous one but backwards
	if double == True:
	a1 = -a1
	a3 = -dir
	R = R.transpose()
	for i in range(bp):
	rcdm = rb - CM_CENTER_DS * a1
	mynewpositions.append (rcdm)
	mynewa1s.append (a1)
	mynewa3s.append (a3)
	a1 = np.dot(R, a1)
	rb += a3 * BASE_BASE

	assert (len (mynewpositions) > 0)

	return [mynewpositions, mynewa1s, mynewa3s]


	"""
	Main function for this script.
	Reads a text file with the following format:
	- Each line contains the sequence for a single strand (A,C,G,T)
	- Lines beginning with the keyword 'DOUBLE' produce double-stranded DNA

	Ex: Two ssDNA (single stranded DNA)
	ATATATA
	GCGCGCG

	Ex: Two strands, one double stranded, the other single stranded.
	DOUBLE AGGGCT
	CCTGTA

	"""

	def read_strands(filename):
	try:
	infile = open (filename)
	except:
	print >> sys.stderr, "Could not open file '%s'. Aborting." % filename
	sys.exit(2)

	# This block works out the number of nucleotides and strands by reading
	# the number of non-empty lines in the input file and the number of letters,
	# taking the possible DOUBLE keyword into account.
	nstrands, nnucl, nbonds = 0, 0, 0
	lines = infile.readlines()
	for line in lines:
	line = line.upper().strip()
	if len(line) == 0:
	continue
	if line[:6] == 'DOUBLE':
	line = line.split()[1]
	length = len(line)
	print >> sys.stdout, "## Found duplex of %i base pairs" % length
	nnucl += 2*length
	nstrands += 2
	nbonds += (2*length-2)
	else:
	line = line.split()[0]
	length = len(line)
	print >> sys.stdout, \
	"## Found single strand of %i bases" % length
	nnucl += length
	nstrands += 1
	nbonds += length-1
	# rewind the sequence input file
	infile.seek(0)

	print >> sys.stdout, "## nstrands, nnucl = ", nstrands, nnucl

	# generate the data file in LAMMPS format
	try:
	out = open ("data.oxdna", "w")
	except:
	print >> sys.stderr, "Could not open data file for writing. Aborting."
	sys.exit(2)

	lines = infile.readlines()
	nlines = len(lines)
	i = 1
	myns = 0
	noffset = 1

	for line in lines:
	line = line.upper().strip()

	# skip empty lines
	if len(line) == 0:
	i += 1
	continue

	# block for duplexes: last argument of the generate function
	# is set to 'True'
	if line[:6] == 'DOUBLE':
	line = line.split()[1]
	length = len(line)
	seq = [(base_to_number[x]) for x in line]

	myns += 1
	for b in xrange(length):
	basetype.append(seq[b])
	strandnum.append(myns)

	for b in xrange(length-1):
	bondpair = [noffset + b, noffset + b + 1]
	bonds.append(bondpair)
	noffset += length

	# create the sequence of the second strand as made of
	# complementary bases
	seq2 = [5-s for s in seq]
	seq2.reverse()

	myns += 1
	for b in xrange(length):
	basetype.append(seq2[b])
	strandnum.append(myns)

	for b in xrange(length-1):
	bondpair = [noffset + b, noffset + b + 1]
	bonds.append(bondpair)
	noffset += length

	print >> sys.stdout, "## Created duplex of %i bases" % (2*length)

	# generate random position of the first nucleotide
	cdm = box_offset + np.random.random_sample(3) * box

	# generate the random direction of the helix
	axis = np.random.random_sample(3)
	axis /= np.sqrt(np.dot(axis, axis))

	# use the generate function defined above to create
	# the position and orientation vector of the strand
	newpositions, newa1s, newa3s = generate_strand(len(line), \
	sequence=seq, dir=axis, start_pos=cdm, double=True)

	# generate a new position for the strand until it does not overlap
	# with anything already present
	start = timer()
	while not add_strands(newpositions, newa1s, newa3s):
	cdm = box_offset + np.random.random_sample(3) * box
	axis = np.random.random_sample(3)
	axis /= np.sqrt(np.dot(axis, axis))
	newpositions, newa1s, newa3s = generate_strand(len(line), \
	sequence=seq, dir=axis, start_pos=cdm, double=True)
	print >> sys.stdout, "## Trying %i" % i
	end = timer()
	print >> sys.stdout, "## Added duplex of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
	(2*length, i, nlines, end-start, len(positions), nnucl)

	# block for single strands: last argument of the generate function
	# is set to 'False'
	else:
	length = len(line)
	seq = [(base_to_number[x]) for x in line]

	myns += 1
	for b in xrange(length):
	basetype.append(seq[b])
	strandnum.append(myns)

	for b in xrange(length-1):
	bondpair = [noffset + b, noffset + b + 1]
	bonds.append(bondpair)
	noffset += length

	# generate random position of the first nucleotide
	cdm = box_offset + np.random.random_sample(3) * box

	# generate the random direction of the helix
	axis = np.random.random_sample(3)
	axis /= np.sqrt(np.dot(axis, axis))

	print >> sys.stdout, \
	"## Created single strand of %i bases" % length

	newpositions, newa1s, newa3s = generate_strand(length, \
	sequence=seq, dir=axis, start_pos=cdm, double=False)
	start = timer()
	while not add_strands(newpositions, newa1s, newa3s):
	cdm = box_offset + np.random.random_sample(3) * box
	axis = np.random.random_sample(3)
	axis /= np.sqrt(np.dot(axis, axis))
	newpositions, newa1s, newa3s = generate_strand(length, \
	sequence=seq, dir=axis, start_pos=cdm, double=False)
	print >> sys.stdout, "## Trying %i" % (i)
	end = timer()
	print >> sys.stdout, "## Added single strand of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
	(length, i, nlines, end-start,len(positions), nnucl)

	i += 1

	# sanity check
	if not len(positions) == nnucl:
	print len(positions), nnucl
	raise AssertionError

	out.write('# LAMMPS data file\n')
	out.write('%d atoms\n' % nnucl)
	out.write('%d ellipsoids\n' % nnucl)
	out.write('%d bonds\n' % nbonds)
	out.write('\n')
	out.write('4 atom types\n')
	out.write('1 bond types\n')
	out.write('\n')
	out.write('# System size\n')
	out.write('%f %f xlo xhi\n' % (box_offset,box_offset+box_length))
	out.write('%f %f ylo yhi\n' % (box_offset,box_offset+box_length))
	out.write('%f %f zlo zhi\n' % (box_offset,box_offset+box_length))

	out.write('\n')
	out.write('Masses\n')
	out.write('\n')
	out.write('1 3.1575\n')
	out.write('2 3.1575\n')
	out.write('3 3.1575\n')
	out.write('4 3.1575\n')

	# for each nucleotide print a line under the headers
	# Atoms, Velocities, Ellipsoids and Bonds
	out.write('\n')
	out.write(\
	'# Atom-ID, type, position, molecule-ID, ellipsoid flag, density\n')
	out.write('Atoms\n')
	out.write('\n')

	for i in xrange(nnucl):
	out.write('%d %d %22.15le %22.15le %22.15le %d 1 1\n' \
	% (i+1, basetype[i], \
	positions[i][0], positions[i][1], positions[i][2], \
	strandnum[i]))

	out.write('\n')
	out.write('# Atom-ID, translational, rotational velocity\n')
	out.write('Velocities\n')
	out.write('\n')

	for i in xrange(nnucl):
	out.write("%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n" \
	% (i+1,0.0,0.0,0.0,0.0,0.0,0.0))

	out.write('\n')
	out.write('# Atom-ID, shape, quaternion\n')
	out.write('Ellipsoids\n')
	out.write('\n')

	for i in xrange(nnucl):
	out.write(\
	"%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n" \
	% (i+1,1.1739845031423408,1.1739845031423408,1.1739845031423408, \
	quaternions[i][0],quaternions[i][1], quaternions[i][2],quaternions[i][3]))

	out.write('\n')
	out.write('# Bond topology\n')
	out.write('Bonds\n')
	out.write('\n')

	for i in xrange(nbonds):
	out.write("%d %d %d %d\n" % (i+1,1,bonds[i][0],bonds[i][1]))

	out.close()

	print >> sys.stdout, "## Wrote data to 'data.oxdna'"
	print >> sys.stdout, "## DONE"

	# call the above main() function, which executes the program
	read_strands (infile)

	end_time=timer()
	runtime = end_time-start_time
	hours = runtime/3600
	minutes = (runtime-np.rint(hours)*3600)/60
	seconds = (runtime-np.rint(hours)3600-np.rint(minutes)60)%60
	print >> sys.stdout, "## Total runtime %ih:%im:%.2fs" % (hours,minutes,seconds)

generate.pyNo OneTemporaryActions

File Metadata

generate.pyView Options

Event Timeline

generate.py
No OneTemporary
Actions

generate.py
View Options