# 1.1 Write a regular expression that finds html tags in a file and prints
#     them.

#!/usr/bin/python
import re
import sys


def _read_lines(path):
    """Return the lines of the text file *path*, newlines included."""
    # "with" closes the file even when reading fails.
    with open(path, "r") as handle:
        return handle.readlines()


# compiling the regular expression:
# the non-greedy ".+?" makes every tag end at its own ">".
HTML_TAG = re.compile(r"<.+?>")


def find_tags(line):
    """Return every HTML tag found in *line*, in order of appearance."""
    # findall reports all tags on the line; search stops after the first.
    return HTML_TAG.findall(line)


if __name__ == "__main__":
    # searching the file content line by line:
    for line in _read_lines("file.html"):
        for tag in find_tags(line):
            # line still carries its own newline, so none is added here.
            sys.stdout.write("%s : %s" % (tag, line))

# ---
# 2.1 Continue with the previous exercise but print the type of every html
#     tag your script finds, such as html, body, title, a, br.

#!/usr/bin/python

# compiling the regular expression:
# group 1 captures the tag name only; the optional "/" of a closing tag is
# skipped and any attributes up to ">" are matched without being captured.
HTML_TAG_TYPE = re.compile(r"</?\s*([A-Za-z][A-Za-z0-9]*)[^>]*>")


def tag_types(line):
    """Return the type (html, body, a, br, ...) of every tag in *line*."""
    # With exactly one group, findall returns the group text of each match.
    return HTML_TAG_TYPE.findall(line)


if __name__ == "__main__":
    # searching the file content line by line:
    for line in _read_lines("file.html"):
        for tag_type in tag_types(line):
            sys.stdout.write("%s : %s" % (tag_type, line))

# ---
# 2.2 Optional: Print all lines in the alice.txt file so that the first and
#     the last character in each line are switched.

#!/usr/bin/python

# compiling the regular expression:
# "." never matches the newline, so group 3 is the last visible character.
FIRST_LAST = re.compile(r"(.)(.*)(.)")


def swap_first_last(line):
    """Return *line* with its first and last character switched.

    The trailing newline, if any, stays in place. Lines with fewer than two
    characters do not match the pattern and are returned unchanged, so no
    line is dropped from the output.
    """
    return FIRST_LAST.sub(r"\3\2\1", line)


if __name__ == "__main__":
    for line in _read_lines("alice.txt"):
        sys.stdout.write(swap_first_last(line))

# ---
# 2.3 Print all lines in the alice.txt file that contain two double
#     characters.

#!/usr/bin/python

# compiling the regular expression:
# "(.)\1" is one doubled character, "(.)\3" a second one later in the line.
TWO_DOUBLES = re.compile(r"(.)\1(.*)(.)\3")


def has_two_doubles(line):
    """Return True if *line* contains two doubled characters."""
    return TWO_DOUBLES.search(line) is not None


if __name__ == "__main__":
    for line in _read_lines("alice.txt"):
        if has_two_doubles(line):
            # print the whole line, not only the span between the doubles
            sys.stdout.write(line)

# ---
# 3.1 Replace all upper case A by lower case a.

# compiling the regular expression:
UPPER_A = re.compile(r"A")


def replace_upper_a(line):
    """Return *line* with every upper case "A" replaced by "a"."""
    return UPPER_A.sub("a", line)


if __name__ == "__main__":
    # This fragment continues from the previous exercise, so it works on the
    # lines of alice.txt.
    # searching the file content line by line:
    for line in _read_lines("alice.txt"):
        sys.stdout.write(replace_upper_a(line))

# ---
# 3.2 Delete all words with more than 3 characters.
import re
import sys


def _read_lines(path):
    """Return the lines of the text file *path*, newlines included."""
    # "with" closes the file even when reading fails.
    with open(path, "r") as handle:
        return handle.readlines()


# compiling the regular expression:
# a word of four or more word characters, i.e. more than 3 characters.
LONG_WORD = re.compile(r"\b\w\w\w\w+\b")


def delete_long_words(line):
    """Return *line* with every word longer than 3 characters removed."""
    return LONG_WORD.sub("", line)


if __name__ == "__main__":
    # NOTE(review): the original fragment used a `text` list read by an
    # earlier script; alice.txt is assumed here -- confirm.
    # searching the file content line by line:
    for line in _read_lines("alice.txt"):
        sys.stdout.write(delete_long_words(line))

# ---
# 3.3 Print two blank space characters after the "." at the end of a
#     sentence.

# compiling the regular expression:
# blanks already following the "." are consumed, so the result is exactly
# two blanks rather than two additional ones.
SENTENCE_DOT = re.compile(r"\.[ \t]*")


def two_spaces_after_dot(line):
    """Return *line* with exactly two blanks after every "."."""
    return SENTENCE_DOT.sub(".  ", line)


if __name__ == "__main__":
    # NOTE(review): input file assumed to be alice.txt -- confirm.
    # searching the file content line by line:
    for line in _read_lines("alice.txt"):
        sys.stdout.write(two_spaces_after_dot(line))

# ---
# (Optional: Don't do this if the "." is the last character in a line.)

# compiling the regular expression:
# the lookahead requires more text on the same line without consuming it, so
# the character after the "." is kept and a "." that ends the line is left
# alone.
INNER_SENTENCE_DOT = re.compile(r"\.[ \t]*(?=\S)")


def two_spaces_after_inner_dot(line):
    """Return *line* with two blanks after each "." that does not end it."""
    return INNER_SENTENCE_DOT.sub(".  ", line)


if __name__ == "__main__":
    # NOTE(review): input file assumed to be alice.txt -- confirm.
    # searching the file content line by line:
    for line in _read_lines("alice.txt"):
        sys.stdout.write(two_spaces_after_inner_dot(line))

# ---
# 3.4 Replace single quotes (' or `) by double quotes.

# compiling the regular expression:
SINGLE_QUOTE = re.compile(r"['`]")


def double_the_quotes(line):
    """Return *line* with every ' and ` replaced by a double quote."""
    return SINGLE_QUOTE.sub("\"", line)


if __name__ == "__main__":
    # NOTE(review): input file assumed to be alice.txt -- confirm.
    # searching the file content line by line:
    for line in _read_lines("alice.txt"):
        sys.stdout.write(double_the_quotes(line))

# ---
# 3.5 Modify your program from exercise 1.1, so that it deletes all HTML
#     markup.

#!/usr/bin/python

# compiling the regular expression:
# the non-greedy ".+?" makes every tag end at its own ">".
MARKUP = re.compile(r"<.+?>")


def strip_markup(line):
    """Return *line* with every HTML tag removed."""
    return MARKUP.sub("", line)


if __name__ == "__main__":
    # searching the file content line by line:
    for line in _read_lines("file.html"):
        sys.stdout.write(strip_markup(line))

# ---
# 4.1 Modify the example so that it splits the file into words instead of
#     sentence parts.

#!/usr/bin/python

# newline characters together with the white space around them
LINE_BREAK = re.compile(r"\s*\n\s*")
# "\s+" rather than "\s*": a pattern that can match the empty string makes
# Python 3.7 and later split between every single character.
WHITESPACE = re.compile(r"\s+")


def split_words(lines):
    """Return the words contained in *lines* as one flat list.

    *lines* is a list of strings as returned by readlines().
    """
    # join all of the lines together using " " as glue
    bigstring = " ".join(lines)
    # delete newline characters and white space from the end of each line
    bigstring = LINE_BREAK.sub(" ", bigstring)
    # leading/trailing white space yields empty pieces; leave them out
    return [word for word in WHITESPACE.split(bigstring) if word]


if __name__ == "__main__":
    for word in split_words(_read_lines("alice.txt")):
        sys.stdout.write(word + "\n")

# 4.2 Write a script that takes an HTML source file as input and prints it
#     so that a newline follows only "closing tags", i.e. tags that are of
#     the form </...>.
#!/usr/bin/python
import re
import sys

# newline characters together with the white space around them
NEWLINE_RUN = re.compile(r"\s*\n\s*")
# A closing tag "</...>" plus any white space after it. The tag is captured
# so it can be written back; the white space is dropped so the next output
# line does not start with a blank.
CLOSING_TAG = re.compile(r"(</[^>]+>)\s*")


def break_after_closing_tags(lines):
    """Return the HTML in *lines* with a newline after closing tags only.

    *lines* is a list of strings as returned by readlines(). All original
    line breaks are replaced by single blanks first; afterwards a newline is
    inserted behind every closing tag.
    """
    # join all of the lines together using " " as glue
    bigstring = " ".join(lines)
    # delete newline characters and white space from the end of each line
    bigstring = NEWLINE_RUN.sub(" ", bigstring)
    # sub keeps the tags in the text; split would have thrown them away
    return CLOSING_TAG.sub(r"\1\n", bigstring)


if __name__ == "__main__":
    # open a file; "with" closes it even when reading fails
    with open("file.html", "r") as handle:
        text = handle.readlines()
    sys.stdout.write(break_after_closing_tags(text))