I recently wanted to check for broken links in an HTML file and did not want to buy a commercial program, so I wrote a Python program that lists the broken links in an HTML file. Here is the code:
#!/usr/bin/env python
import os, sys
def usage():
    """Print a short usage message for this script to standard output.

    The script name shown in the message is taken from ``sys.argv[0]``.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("usage: %s <html file>" % sys.argv[0])
    print("checks the html file for broken links")
def fileExists(file):
    """Return True if *file* can be stat'ed, False otherwise.

    Args:
        file: Path to test.

    Returns:
        True when ``os.stat`` succeeds for the path, False when it raises
        ``OSError`` (for example because the path does not exist).
    """
    try:
        # Only the side effect matters: stat raises OSError for a bad path.
        os.stat(file)
    except OSError:
        return False
    return True
def extractLink(line, tag):
    """Return the attribute value that follows the first *tag* in *line*.

    Args:
        line: One line of HTML text.
        tag: Attribute prefix including the equals sign, e.g. ``"href="``.

    Returns:
        The text between the quotes that follow *tag*. Double-quoted and
        single-quoted values are both supported; an unquoted value runs up
        to the next whitespace character or ``>``. If the closing quote is
        missing, the rest of the line (without trailing whitespace) is
        returned. Returns ``""`` when *tag* does not occur in *line*.
    """
    tag_pos = line.find(tag)
    if tag_pos == -1:
        return ""

    index = tag_pos + len(tag)
    # Slice instead of indexing so a tag at the very end of the line yields
    # "" rather than raising IndexError.
    quote = line[index:index + 1]

    if quote in ("\"", "'"):
        index += 1
        # Search from `index` itself so an empty value (href="") is found.
        end = line.find(quote, index)
        if end == -1:
            return line[index:].rstrip()
        return line[index:end]

    # Unquoted value: it ends at the first whitespace character or '>'.
    end = index
    while end < len(line) and not line[end].isspace() and line[end] != ">":
        end += 1
    return line[index:end]
def getDirectory(file):
    """Return the directory part of the path *file*.

    Args:
        file: A file path.

    Returns:
        Everything before the final path separator, without a trailing
        separator (``"/a/b.html"`` -> ``"/a"``). A path without any
        separator yields ``""``; a file directly under the root yields the
        root itself (``"/b.html"`` -> ``"/"``).
    """
    # os.path.dirname replaces the hand-written scan for the last "/" and
    # keeps the root directory for top-level files instead of returning "".
    return os.path.dirname(file)
######################
# the main program starts here #
######################
def main():
    """Report links in the given HTML file that do not exist on disk.

    Reads the HTML file named by the first command-line argument, collects
    the value of the first ``href=`` attribute on each line, resolves each
    value relative to the directory of the HTML file and prints those for
    which ``fileExists`` returns False.

    Exits with status 1 after printing the usage text when no file name is
    given.
    """
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    html_file = sys.argv[1]
    # The context manager closes the handle even if reading fails.
    with open(html_file, "r") as handle:
        text = handle.readlines()

    linklist = []
    tag = "href="
    # extract the links from the text
    for line in text:
        if tag in line:
            link = extractLink(line, tag)
            # A quote character left in the value means the attribute could
            # not be parsed cleanly, so such links are skipped.
            if "\"" not in link and "'" not in link:
                linklist.append(link)

    # Links are resolved relative to the directory containing the HTML file.
    if html_file.startswith("/"):
        directory = os.path.abspath(getDirectory(html_file))
    else:
        directory = os.path.abspath(
            getDirectory(os.getcwd() + "/" + html_file))
    if not directory.endswith("/"):
        directory = directory + "/"

    # NOTE(review): every link is treated as a local path, so external URLs
    # (http://..., mailto:...) and "#anchor" links are reported as missing.
    print("-" * 30)
    print("missing file(s): ")
    print("-" * 30)
    for link in linklist:
        if not fileExists(directory + link):
            print(link)
    print("-" * 30)


if __name__ == "__main__":
    main()
There are many ways to improve this program; for example, it could report the line number on which each broken link occurs. Feel free to post any improvements or comments. I hope this is helpful.