diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..c9c2a5e --- /dev/null +++ b/404.html @@ -0,0 +1,460 @@ + + + +
+ + + + + + + + + + + + + + +Learning objectives
+Teaching goals are:
+Lesson plan:
+Sometimes programs also need input, for example +the name of the input file and the name of a results file:
+ +Sometimes you need to get 'stuff to work', for example, +code written by someone else. +In this session, we practice this, +going through the problems and errors that occur +when you 'just want to run the stupid code'.
+Learning objectives
+Read the official Python documentation about sys.argv
here.
Create a script with the following text:
+ +Run the Python script as shown below. What does it print?
+python3 read_argv.py
Run the Python script as shown below. What does it print?
+python3 read_argv.py hello world
Run the Python script as shown below. What does it print?
+python3 read_argv.py "hello world"
Run the Python script as shown below. What does it print?
+python3 read_argv.py 'hello world'
Learning objectives
+sys.argv
is a listsys.argv
is a list. Here we use this list
Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Modify the script in such a way that the script will only show the first command-line +argument.
+For example, running the script like this:
+ +it should show hello
.
Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Now, we make the script produce different output.
+If the script is run like this:
+ +it should show Hello human!
.
If the script is run like this:
+ +it should show Sad to see you go human!
.
Write the script to do that.
+In this exercise you will write a program that calculates the distance +between two sequences, e.g:
+ +A simple program (without functions or modules) is sufficient. +# define sequences
seq1 = "ACGT"
seq2 = "AGGT"

# initiate counter
distance_score = 0

# walk through both sequences in lockstep, one pair of letters at a time
# (the loop header must end with a colon)
for a, b in zip(seq1, seq2):

    # if they don't match, add a distance point
    if a != b:
        distance_score += 1

# print result to the terminal
print("Distance between A and B: ", distance_score)
+
Calculate the distance between the following sequences and print out + the result. Since the following sequences are already aligned, we + can calculate the distance between them. Change your program so that it can + read two aligned sequences from the command line. Test your program with the + following sequences.
+ +import sys
+
# read sequences from command line arguments
seq1 = sys.argv[1]
seq2 = sys.argv[2]

# initiate counter
distance_score = 0

# walk through both sequences in lockstep, one pair of letters at a time
# (the loop header must end with a colon)
for a, b in zip(seq1, seq2):

    # if they don't match, add a distance point
    if a != b:
        distance_score += 1

# print result to the terminal
print(f"Distance between A and B: {distance_score}")
+
Extend the program so that the distance between two sequences is only + calculated when both sequences have the same length. Test your program + with the input sequences:
+ +Note: Here you can use either the assert
keyword or an if
+clause. You could also raise a custom exception using try-except
.
import sys
+
# read sequences from command line arguments
seq1 = sys.argv[1]
seq2 = sys.argv[2]

# stop right away when the sequences cannot be compared letter by letter
assert len(seq1) == len(seq2), "Sequences must be of same length"

# initiate counter
distance_score = 0

# walk through both sequences in lockstep, one pair of letters at a time
# (the loop header must end with a colon)
for a, b in zip(seq1, seq2):

    # if they don't match, add a distance point
    if a != b:
        distance_score += 1

# print result to the terminal
print(f"Distance between A and B: {distance_score}")
+
Extend the program so that the second sequence is reversed and assigned to a +third sequence. Please read the first and second sequence from the command +line. Calculate the distances between the first and the second and between the +first and the third sequence.
+Compare the distance between the first and the second and the first and the +third sequence and print the alignment with the smaller distance. If the +distances are equal, then print the alignment of the first and second sequence.
+Test your program with the following sequences:
+ +import sys
+
# get sequences from the command line arguments
seq1 = sys.argv[1]
seq2 = sys.argv[2]

# reverse seq2 with a slice that steps backwards through the string
seq2_rev = seq2[::-1]

# only run the calculation if the sequences have the same length
if len(seq1) == len(seq2):

    # initiate counters
    dist_1_2 = 0
    dist_1_2rev = 0

    # compare each letter of seq1 with the letter at the same position
    # in seq2 and in the reversed seq2
    for base1, base2, base2_rev in zip(seq1, seq2, seq2_rev):

        # if they don't match, add a distance point
        if base1 != base2:
            dist_1_2 += 1

        # if they don't match, add a distance point
        if base1 != base2_rev:
            dist_1_2rev += 1

    # if the distance seq1seq2 is less or eq to distance seq1seq2_rev
    # (a tie therefore reports the original orientation)
    if dist_1_2 <= dist_1_2rev:
        # print the seq1seq2 sequences and distance score
        print("Sequence seq1: ", seq1)
        print("Sequence seq2: ", seq2)
        print("Distance between seq1 and seq2: ", dist_1_2)

    # else, if the seq1seq2_rev distance is less than seq1seq2
    else:
        # print the seq1seq2_rev sequences and distance score
        print("Sequence seq1: ", seq1)
        print("Sequence seq2_rev: ", seq2_rev)
        print("Distance between seq1 and seq2_rev: ", dist_1_2rev)

# tell the user the lengths differ
else:
    print("Sequences seq1 and seq2 are of different length.")
+
Open an editor and save your new program. In this program we will create + a few functions.
+Define two functions similarity
and distance
:
Note: Purines are A and G, pyrimidines are C and T.
# the two chemical families of DNA bases used by the scoring functions
pur = ["A", "G"]
pyr = ["C", "T"]


def similarity(base1, base2):
    """Score how alike two single bases are.

    Returns 1 for identical bases, 0.5 for two different bases of the same
    kind (both in pur or both in pyr) and 0 in every other case.
    """
    if base1 == base2:
        return 1

    both_purines = base1 in pur and base2 in pur
    both_pyrimidines = base1 in pyr and base2 in pyr
    if both_purines or both_pyrimidines:
        return 0.5

    return 0


def distance(base1, base2):
    """Score how different two single bases are.

    Returns 0 for identical bases, 0.5 for two different bases of the same
    kind (both in pur or both in pyr) and 1 in every other case.
    """
    if base1 == base2:
        return 0

    both_purines = base1 in pur and base2 in pur
    both_pyrimidines = base1 in pyr and base2 in pyr
    if both_purines or both_pyrimidines:
        return 0.5

    return 1
+
sequence_similarity
and sequence_distance
, which
+ calculates the similarity and distance of two whole sequences.# define the similarity function for whole sequences
def sequence_similarity(seq1, seq2):
    """Add up the per-base similarity of two sequences.

    Every position of seq1 is compared with the same position of seq2
    using similarity(); the running total starts at 0.0 and is returned.
    """
    total = 0.0
    for position, base in enumerate(seq1):
        total = total + similarity(base, seq2[position])
    return total
+
+
+# define the distance function for whole sequences
def sequence_distance(seq1, seq2):
    """Add up the per-base distance of two sequences.

    Every position of seq1 is compared with the same position of seq2
    using distance(); the running total starts at 0.0 and is returned.
    """
    total = 0.0
    for position, base in enumerate(seq1):
        total = total + distance(base, seq2[position])
    return total
+
import sys
+
+### Paste here the code for the functions you wrote in 1.2 and 1.3 ###
+
# the two sequences are the first and second command line arguments
seq1, seq2 = sys.argv[1], sys.argv[2]

# compute and report the similarity, then the distance
similarity_score = sequence_similarity(seq1, seq2)
print("Similarity: ", similarity_score)
distance_score = sequence_distance(seq1, seq2)
print("Distance: ", distance_score)
+
In this exercise we will write three different programs.
+Write a new Python file (module) called sequence_tools.py
which
+ contains both of the functions similarity
and distance
as defined
+ previously.
#########################
+### sequence_tools.py ###
+#########################
+
# define which bases are purines and pyrimidines
pur = ["A", "G"]
pyr = ["C", "T"]


# define the similarity function for two single bases
def similarity(base1, base2):
    """Return 1 for equal bases, 0.5 for bases of the same kind, else 0."""
    # if they match, return 1
    if base1 == base2:
        return 1
    # else, if they don't match but are of the same kind
    elif (base1 in pur and base2 in pur) or (base1 in pyr and base2 in pyr):
        return 0.5
    # if they neither match nor are of the same kind, return 0
    else:
        return 0


# define the distance function for two single bases
def distance(base1, base2):
    """Return 0 for equal bases, 0.5 for bases of the same kind, else 1."""
    # if they match, return 0
    if base1 == base2:
        return 0
    # else, if they don't match but are of the same kind
    elif (base1 in pur and base2 in pur) or (base1 in pyr and base2 in pyr):
        return 0.5
    # if they neither match nor are of the same kind, return 1
    else:
        return 1


# define the similarity function for whole sequences
def sequence_similarity(seq1, seq2):
    """Return the summed per-base similarity of seq1 and seq2 as a float.

    seq2 is indexed at every position of seq1, so a shorter seq2 raises
    IndexError.
    """
    # initiate counter
    similarity_score = 0.0
    # go through all bases in seq1
    for i, base in enumerate(seq1):
        # calculate their similarity and add to the score
        similarity_score = similarity_score + similarity(base, seq2[i])
    # return the final score
    return similarity_score


# define the distance function for whole sequences
def sequence_distance(seq1, seq2):
    """Return the summed per-base distance of seq1 and seq2 as a float.

    seq2 is indexed at every position of seq1, so a shorter seq2 raises
    IndexError.
    """
    # initiate counter
    distance_score = 0.0
    # go through all bases in seq1
    for i, base in enumerate(seq1):
        # calculate the distance and add to the score
        distance_score = distance_score + distance(base, seq2[i])
    # return the final score
    return distance_score
+
Write another Python file that calculates for each combination of two
+ sequences stored in list seq_list
the similarity and distance using the
+ module defined previously.
+
from itertools import combinations

# import only the names that are used, so the namespace stays uncluttered
from sequence_tools import sequence_distance, sequence_similarity

# define sequences
seq_list = ["ATCCGGT", "GCGTTAC", "CTACTGC", "TTGCAGT", "AGTCACC"]

# visit every pair of sequences exactly once, in list order
for first, second in combinations(seq_list, 2):

    # calculate the similarity and distance
    similarity_score = sequence_similarity(first, second)
    distance_score = sequence_distance(first, second)

    # print the result for this comparison
    print(first, second, " Similarity: ", similarity_score, " Distance: ", distance_score)
+
ATCCGGT GCGTTAC Similarity: 2.5 Distance: 4.5
+ATCCGGT CTACTGC Similarity: 3.5 Distance: 3.5
+ATCCGGT TTGCAGT Similarity: 4.5 Distance: 2.5
+ATCCGGT AGTCACC Similarity: 3.5 Distance: 3.5
+GCGTTAC CTACTGC Similarity: 4.0 Distance: 3.0
+GCGTTAC TTGCAGT Similarity: 3.0 Distance: 4.0
+GCGTTAC AGTCACC Similarity: 2.0 Distance: 5.0
+CTACTGC TTGCAGT Similarity: 4.5 Distance: 2.5
+CTACTGC AGTCACC Similarity: 2.0 Distance: 5.0
+TTGCAGT AGTCACC Similarity: 2.5 Distance: 4.5
+
** Extend your program. Determine the combination of sequences with the
+ highest similarity of all sequences stored in the list seq_list. Write these two
+ sequences and the alignment into a new file, called similar_sequences.txt
.**
For example for two given sequences: “ATC” and “ACC” The alignment would be: +
+And this alignment should be written to a new output file. Hint: A +line-break in Python can be made by adding ’\n’ to the end of the line. +from sequence_tools import *
+
# define sequences
seq_list = ["ATCCGGT", "GCGTTAC", "CTACTGC", "TTGCAGT", "AGTCACC"]

# define variables
similarity_highscore = 0
best_seq1 = ""
best_seq2 = ""

# loop over each sequence in seq_list
for i in range(len(seq_list)):

    # compare the sequence to all remaining sequences in seq_list
    for j in range(i + 1, len(seq_list)):

        # calculate the similarity
        similarity_score = sequence_similarity(seq_list[i], seq_list[j])

        # check if it's a new similarity highscore
        if similarity_score > similarity_highscore:

            # if it is, save this as the new highscore
            similarity_highscore = similarity_score
            best_seq1 = seq_list[i]
            best_seq2 = seq_list[j]

# create an empty string to add the alignment to
alignment_matches = ""

# go through each letter of the best aligned pair
for i in range(len(best_seq1)):

    # find places where they match
    if best_seq1[i] == best_seq2[i]:
        alignment_matches = alignment_matches + "|"
    # and places they don't match
    else:
        alignment_matches = alignment_matches + " "

# write the sequences and the match symbols to file; the with statement
# closes the file even if one of the writes fails
with open("similar_sequences.txt", "w") as outfile:
    outfile.write(best_seq1 + "\n")
    outfile.write(alignment_matches + "\n")
    outfile.write(best_seq2 + "\n")
+
Learning objectives
+Teaching goals are:
+Lesson plan:
+Most programmers need to work on data and produce some result. +In Python, we -of course- can read from files and write to files. +Here we do just that.
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Then do:
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Consider searching for 'Python reverse order', +as you will need to do this in the exercise.
+Then do:
+Learning objectives
+Now we try to make this script work!
+Read:
+ +Now, try to get the script to work.
+When running the code, there should be a visible pop-up window. If not, +re-read the UPPMAX documentation page 'Login to Rackham' +and enable X-forwarding.
+The exercise is about practicing to search for an UPPMAX module, +hence giving away the answer makes this session useless.
+However, if you feel stuck too much, +you can watch a video in which this exercise is done +here
+Learning objectives
+Teaching goals are:
+Lesson plan:
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Then do exercise +- 3.8.1 +- 3.8.5
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Then do exercise:
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Then do exercise:
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Then do exercise:
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Then do exercise:
+Learning objectives
+Teaching goals are:
+Lesson plan:
+Sometimes one wants to make a plot, such as a scatter plot or bar chart. +Matplotlib is a popular Python package to make plots. +Here we experience how easy/hard it is to use Matplotlib on Rackham.
+Learning objectives
+Read matplotlib's 'Pyplot tutorial' here.
+Copy-paste the first script on that page to a script and get it to work. +Do you expect this to be easy?
+ + + + + + + + + + + + + +Learning objectives
+Teaching goals are:
+Lesson plan:
+Sometimes you need to get 'stuff to work', for example, +code written by someone else. +In this session, we practice this, +going through the problems and errors that occur +when you 'just want to run the stupid code'.
+Learning objectives
+Read the following sections of How to Think Like a Computer Scientist: Learning with Python 3:
+Copy-paste the first script in that section to a script.
+Run the script and verify it will not run. Does the error message make sense?
+Learning objectives
+Now we try to make this script work!
+Read:
+ +Now, try to get the script to work.
+When running the code, there should be a visible pop-up window. If not, +re-read the UPPMAX documentation page 'Login to Rackham' +and enable X-forwarding.
+The exercise is about practicing to search for an UPPMAX module, +hence giving away the answer makes this session useless.
+However, if you feel stuck too much, +you can watch a video in which this exercise is done +here
+We are now going to introduce functions
+def
keywordSimilarity of Sequence
+Let us write a function that calculates some measure of similarity between +two strings, e.g. DNA sequences such as +
+where the element-wise distance function is given by \(d(a, b) = 0\) if \(a = b\) and \(d(a, b) = 1\) otherwise, +and we want to calculate the sum \(\sum_{i=1}^{N} d(A_i, B_i)\) over sequences \(A\) and \(B\) of same length +\(N\).
+such that
+ +Yielding a total distance of 4. Let's write a Python function that solves +this problem
def sequence_distance(A, B):
    """Count the positions at which A and B hold different characters."""
    # Running total of mismatching positions
    mismatches = 0

    # Visit every index of A and look up the same index in B
    for i in range(len(A)):
        if A[i] == B[i]:
            continue
        mismatches += 1

    return mismatches


A = "GATCGTTCG"
B = "CATGGTTGA"
sequence_distance(A, B)
+
Can we improve the function?
+We require that the sequences are of equal length. We could easily
+verify this by using an assert
statement
Remember that using range(len(seq))
is not considered idiomatic Python?
+The way to solve this when having to access elements from two iterables
+is using the zip
function. Refactoring we get
By convention a Python function either mutates its input arguments or returns something +- not both. It is also possible that a function does neither of the +aforementioned. Let's see an example using a list, which is a mutable object
+.py
)Let's look at an example by creating a simple math module. Let's structure the
+code in a new directory called mathlib
. First create a new directory and cd
+into it
Next start by creating a module called math_funcs.py
def add(a, b):
    """Return a plus b."""
    return a + b


def sub(a, b):
    """Return a minus b."""
    return a - b


def mul(a, b):
    """Return a times b."""
    return a * b
Next we will create a new module that calls the functions we defined in
+math_funcs.py
and performs some calculations - let's call it calculate.py
import math_funcs as mf
+
# operands shared by all three demonstrations
a, b = 3, 1

# every function is reached through the mf namespace with dot notation
print(f"{a} + {b} = {mf.add(a, b)}")
print(f"{a} - {b} = {mf.sub(a, b)}")
print(f"{a} * {b} = {mf.mul(a, b)}")
+
Imports and namespaces
+Notice that the import
statement makes the functions of math_funcs.py
+accessible from calculate.py
. In this case we attached math_funcs
to the
+namespace mf
for short. To access the functions we need to use the dot
+notation. An alternative could have been to
We could also have skipped the aliasing
+ +A big no no is using wildcard imports from module import *
. This will
+clutter the namespace!
Takeaways
+You already know what a command-line interface (CLI) program is!
+ +Where mkdir
is the command and dir_name
is the input argument. What if we
+want to design such a program in Python? Let's continue working on our
+"Similarity of Sequence" example. The goal when we are done will be to
++Read text files containing DNA sequences, passed as arguments to a program +called
+read_seq.py
Let's ignore the reading of text files for now. First create a new directory
+called dna_lib
and cd
into it
For the purpose of practicing working with modules let's separate our program
+into separate modules. First create a module called dna_metrics.py
def sequence_distance(A, B):
    """Return the number of positions at which A and B differ.

    Raises AssertionError when A and B do not have the same length.
    """
    # Assert equal length
    assert len(A) == len(B), "Must be of equal length"
    # Store current distance
    distance = 0

    # Loop over the elements of A and B in lockstep
    # (the loop header must end with a colon)
    for a, b in zip(A, B):
        if a != b:
            distance += 1

    return distance
+
Then create read_seq.py
import dna_metrics as dm
+import sys
+
# The two sequences are the first and second command line arguments
A, B = sys.argv[1], sys.argv[2]

# Report the distance computed by the dna_metrics module
print(f"Distance between A and B is {dm.sequence_distance(A,B)}")
+
!!! note "if __name__ == "__main__"
" convention
+ By convention we usually wrap the code that "runs" our program in an if
+ statement for executable programs. This is to ensure that the program is not
+ executed if imported by another module but only when called directly. The
+ code would then look like this
+
import dna_metrics as dm
+import sys
+
def main():
    """Print the distance between the two sequences given as arguments."""
    # Read command line args
    A = sys.argv[1]
    B = sys.argv[2]

    print(f"Distance between A and B is {dm.sequence_distance(A,B)}")


# Only run when this file is the entry point, not when it is imported
if __name__ == "__main__":
    main()
+
__name__ == "__main__"
is only satisfied for the entry
+ point module.
+We are now going to add some IO functionality. Rather than passing the
+sequences directly as arguments we are going to use the builtin open
function
+to read files and instead of printing the results to the standard output we
+will dump the results into a text file. First let's create a new directory
+data
that will contain seq1.txt
and seq2.txt
Let's explore the open
command from ipython
using seq1.txt
[ins] In [1]: inputfile = open("seq1.txt", "r")
+
+[ins] In [2]: inputfile
+Out[2]: <_io.TextIOWrapper name='seq1.txt' mode='r' encoding='UTF-8'>
+
+[ins] In [3]: A = inputfile.readline()
+
+[ins] In [4]: A
+Out[4]: 'GATCGTTCG\n'
+
+[ins] In [5]: A.strip()
+Out[5]: 'GATCGTTCG'
+
Adapting our program accordingly using the keyword with
to define a context
+in which the file is open.
import dna_metrics as dm
+import sys
+
def _first_line(path):
    """Return the first line of the file at path, stripped of whitespace."""
    # The with statement keeps the file open only inside this context
    with open(path, "r") as f:
        return f.readline().strip()


if __name__ == "__main__":
    # Read the sequences from the files named by the command line args
    A = _first_line(sys.argv[1])
    B = _first_line(sys.argv[2])

    print(f"Distance between A and B is {dm.sequence_distance(A,B)}")
+
Now run the script
+ +As a final step we will save the results into an output file
+import dna_metrics as dm
+import sys
+
def _first_line(path):
    """Return the first line of the file at path, stripped of whitespace."""
    with open(path, "r") as f:
        return f.readline().strip()


if __name__ == "__main__":
    # The first two command line args name the input files
    A = _first_line(sys.argv[1])
    B = _first_line(sys.argv[2])

    # The third command line arg names the file that receives the result
    with open(sys.argv[3], "w") as f:
        f.write(f"Distance between A and B is {dm.sequence_distance(A,B)}")
+
Let's run the script a final time
+ +++ + + + + + + + + + + + + +Classes provide a means of bundling data and functionality together. Creating +a new class creates a new type of object, allowing new instances of that type +to be made. Each class instance can have attributes attached to it for +maintaining its state. Class instances can also have methods (defined by its +class) for modifying its state. ref
+
This module introduces the fundamentals of Python programming language.
+Content
+Time | +Topic | +
---|---|
13:00-14:00 | +UPPMAX modules and Python packages | +
14:00-14:15 | +Break | +
14:15-15:00 | +File IO, command line arguments | +
15:00-15:15 | +Break | +
15:15-16:00 | +Graphics, functions | +
Link to HackMd: https://hackmd.io/@dianai/uppmax-intro/
+{"use strict";/*!
+ * escape-html
+ * Copyright(c) 2012-2013 TJ Holowaychuk
+ * Copyright(c) 2015 Andreas Lubbe
+ * Copyright(c) 2015 Tiancheng "Timothy" Gu
+ * MIT Licensed
+ */var Wa=/["'&<>]/;Vn.exports=Ua;function Ua(e){var t=""+e,r=Wa.exec(t);if(!r)return t;var o,n="",i=0,s=0;for(i=r.index;i