1.1 Write a regular expression that finds html tags in a file and 
prints them. 

#!/usr/bin/python
import re

# open a file
file = open("file.html","r")
text = file.readlines()
file.close()

# searching the file content line by line:
keyword = re.compile(r"<.+?>")

for line in text:
    result = keyword.search (line)
    if result:
       print result.group(), ":", line,

---

2.1 Continue with the previous exercise but print the type of every 
html tag your script finds, such as html, body, title, a, br. 

#!/usr/bin/python
import re

# open a file
file = open("file.html","r")
text = file.readlines()
file.close()

# searching the file content line by line:
keyword = re.compile(r"<(.+?)>")

for line in text:
    result = keyword.search (line)
    if result:
       print result.group(1), ":", line,


---
2.2 Optional: Print all lines in the alice.txt file so that the first and
the last character in each line are switched.

#!/usr/bin/python
import re

# open a file
file = open("alice.txt","r")
text = file.readlines()
file.close()

# compiling the regular expression:
keyword = re.compile(r"(.)(.*)(.)")

for line in text:
    result = keyword.search (line)
    if result:
        print result.group(3) + result.group(2) + result.group(1)

---

2.3 Print all lines in the alice.txt file that contain two double
characters.

#!/usr/bin/python
import re

# open a file
file = open("alice.txt","r")
text = file.readlines()
file.close()

# compiling the regular expression:
keyword = re.compile(r"(.)\1(.*)(.)\3")

for line in text:
    result = keyword.search (line)
    if result:
        print result.group()


---

3.1 Replace all upper case A by lower case a.

# compiling the regular expression:
keyword = re.compile(r"A")

# searching the file content line by line:
for line in text:
    print keyword.sub ("a",line),

---

3.2 Delete all words with more than 3 characters.

# compiling the regular expression:
keyword = re.compile(r"\b\w\w\w\w+\b")

# searching the file content line by line:
for line in text:
    print keyword.sub ("",line),

---

3.3 Print two blank space characters after the "." at the end of a sentence.

# compiling the regular expression:
keyword = re.compile(r"\.")

# searching the file content line by line:
for line in text:
    print keyword.sub (". ",line),

---
(Optional: Don't do this if the "." is the last character in a line.)

# compiling the regular expression:
keyword = re.compile(r"\.[^\n]")

# searching the file content line by line:
for line in text:
    print keyword.sub (".  ",line),

---

3.4 Replace single quotes (' or `) by double quotes. 

# compiling the regular expression:
keyword = re.compile(r"['`]")

# searching the file content line by line:
for line in text:
    print keyword.sub ("\"",line),

---
3.5 Modify your program from exercise 1.1, so that it deletes all
HTML markup.

#!/usr/bin/python
import re

# open a file
file = open("file.html","r")
text = file.readlines()
file.close()

# searching the file content line by line:
keyword = re.compile(r"<.+?>")

# searching the file content line by line:
for line in text:
    print keyword.sub ("",line),


---

4.1 Modify the example so that it splits the file into words instead of
sentence parts.

#!/usr/bin/python
import re

# open a file
file = open("alice.txt","r")
text = file.readlines()
file.close()

# join all of the lines together using " " as glue
bigstring = " ".join(text)

# delete newline characters and white space from the end of each line
keyword = re.compile(r"\s*\n\s*")
bigstring = keyword.sub (" ",bigstring)

keyword = re.compile(r"\s*")
text = keyword.split (bigstring)

for line in text:
    print line

4.2 Write a script that takes an HTML source file as input and prints 
it so that a newline follows only "closing tags", i.e. tags that are of 
the form </...>. 

#!/usr/bin/python
import re

# open a file
file = open("file.html","r")
text = file.readlines()
file.close()

# join all of the lines together using " " as glue
bigstring = " ".join(text)

# delete newline characters and white space from the end of each line
keyword = re.compile(r"\s*\n\s*")
bigstring = keyword.sub (" ",bigstring)

# split bigstring where "." or "," occurs
keyword = re.compile(r"</.*?>")
text = keyword.split (bigstring)

for line in text:
    print line