While constructing UNC Class Finder, Tara and I were trying to write a script that would programmatically extract the course data from a PDF.

read more

The PDF looked like this:

And copying the text from the PDF would really fuck up the order of a lot of things. Tara determined that the order stayed more or less the same if we opened the PDF in Adobe Reader and copied the text from there, so every time we wanted to update the database we manually downloaded the PDF, used Adobe Reader to copy the text, ran it through our Java program, and then uploaded the resulting CSV to the UNC Class Finder database. It was a very manual and annoying process, but thankfully over time we figured out how to automate more and more of it.

Here's the code we had in the early stages:

	
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import java.util.Scanner;

/**
 * Reads a plain-text dump of course listings and prints the courses whose
 * gen-ed list mentions {@code WB}.
 *
 * <p>A course record starts on a line containing {@code Bldg:} and spans the
 * lines that follow it. Every line is tokenised on single spaces, so the
 * parser depends on the exact token positions described in {@link #main}.
 */
public class importAndClean {

	/** Text dump of the course listing, resolved against the working directory. */
	private static final String INPUT_FILE = "courses.txt";

	/** A line containing this marker starts a new course record. */
	private static final String RECORD_MARKER = "Bldg:";

	/** Only courses whose gen-ed list (in string form) contains this are printed. */
	private static final String GEN_ED_FILTER = "WB";

	/**
	 * Parses {@code courses.txt} and prints one line per matching course to
	 * standard output.
	 *
	 * <p>Record layout, as consumed by this parser:
	 * <ol>
	 * <li>marker line: subject, number, section, one skipped token, then the
	 * building (all remaining tokens). If the first token is the marker
	 * itself, number and section stay {@code null} and the building is empty;</li>
	 * <li>one line that is skipped unread;</li>
	 * <li>name line: first token skipped, the rest is the course name;</li>
	 * <li>time line: tokens 4, 6 and 8 are days, start time and end time;</li>
	 * <li>instructor line: token 6 is the instructor;</li>
	 * <li>attribute line: token 2 is a comma-separated gen-ed list. If this
	 * line contains {@code Special}, the list is read from the next line
	 * instead.</li>
	 * </ol>
	 *
	 * <p>A record that is missing any expected token or line is skipped as a
	 * whole. All fields of a record are stored together, so the per-field
	 * lists always have equal length and index {@code i} refers to the same
	 * course in each of them.
	 *
	 * @param args unused
	 * @throws FileNotFoundException if {@code courses.txt} cannot be opened
	 */
	public static void main(String[] args) throws FileNotFoundException {
		ArrayList<String> courseSubjects = new ArrayList<String>();
		ArrayList<String> courseNumbers = new ArrayList<String>();
		ArrayList<String> courseSections = new ArrayList<String>();
		ArrayList<String> courseBuildings = new ArrayList<String>();
		ArrayList<String> courseNames = new ArrayList<String>();
		ArrayList<String> allDays = new ArrayList<String>();
		ArrayList<String> startTimes = new ArrayList<String>();
		ArrayList<String> endTimes = new ArrayList<String>();
		ArrayList<String> instructors = new ArrayList<String>();
		ArrayList<ArrayList<String>> genEds = new ArrayList<ArrayList<String>>();

		Scanner allCourses = new Scanner(new File(INPUT_FILE));
		try {
			while (allCourses.hasNextLine()) {
				String firstLine = allCourses.nextLine();
				if (!firstLine.contains(RECORD_MARKER)) {
					continue;
				}

				try {
					// Line 1: subject, number, section, <skipped>, building...
					Scanner firstTokens = tokens(firstLine);
					String courseSubject = firstTokens.next();
					String courseNumber = null;
					String courseSection = null;
					// Stays empty when the line starts with the marker, so the
					// building list still gets an entry for this record.
					String courseBuilding = "";
					if (!courseSubject.equals(RECORD_MARKER)) {
						courseNumber = firstTokens.next();
						courseSection = firstTokens.next();
						firstTokens.next(); // presumably the "Bldg:" label itself
						courseBuilding = joinRemaining(firstTokens);
					}

					// Line 2 is skipped unread; line 3 carries the course name.
					allCourses.nextLine();
					Scanner nameTokens = tokens(allCourses.nextLine());
					nameTokens.next();
					String courseName = joinRemaining(nameTokens);

					// Line 4: days and the start/end times at fixed positions.
					Scanner timeTokens = tokens(allCourses.nextLine());
					skip(timeTokens, 3);
					String days = timeTokens.next();
					timeTokens.next();
					String startTime = timeTokens.next();
					timeTokens.next();
					String endTime = timeTokens.next();

					// Line 5: the instructor is the sixth token.
					Scanner instructorTokens = tokens(allCourses.nextLine());
					skip(instructorTokens, 5);
					String instructor = instructorTokens.next();

					// Line 6 (or 7 when line 6 mentions "Special"): gen-ed codes.
					String attributeLine = allCourses.nextLine();
					if (attributeLine.contains("Special")) {
						attributeLine = allCourses.nextLine();
					}
					ArrayList<String> classGenEds = parseGenEds(attributeLine);

					// Commit only after the whole record parsed, so that every
					// list receives exactly one entry per stored course.
					courseSubjects.add(courseSubject);
					courseNumbers.add(courseNumber);
					courseSections.add(courseSection);
					courseBuildings.add(courseBuilding);
					courseNames.add(courseName);
					allDays.add(days);
					startTimes.add(startTime);
					endTimes.add(endTime);
					instructors.add(instructor);
					genEds.add(classGenEds);
				} catch (NoSuchElementException ignored) {
					// Best effort: the record is missing a token or the file
					// ended mid-record. Drop it and keep scanning; lines that
					// were not consumed yet are examined by the outer loop.
				}
			}
		} finally {
			allCourses.close();
		}

		// Bounded by the number of parsed records, whatever that turns out to be.
		for (int i = 0; i < genEds.size(); i++) {
			if (!genEds.get(i).toString().contains(GEN_ED_FILTER)) {
				continue;
			}
			StringBuilder row = new StringBuilder();
			row.append(courseSubjects.get(i)).append("   ");
			row.append(courseNumbers.get(i)).append("   ");
			row.append(courseSections.get(i)).append("   ");
			row.append(courseBuildings.get(i)).append("   ");
			// The name already ends with a space, so no separator follows it.
			row.append(courseNames.get(i));
			row.append(allDays.get(i)).append(" ");
			row.append(startTimes.get(i)).append("   ");
			row.append(endTimes.get(i)).append("   ");
			row.append(instructors.get(i)).append("   ");
			row.append(genEds.get(i));
			System.out.println(row);
		}
	}

	/** Returns a scanner that splits {@code line} on single spaces. */
	private static Scanner tokens(String line) {
		return new Scanner(line).useDelimiter(" ");
	}

	/** Consumes {@code count} tokens; throws NoSuchElementException if too few remain. */
	private static void skip(Scanner source, int count) {
		for (int i = 0; i < count; i++) {
			source.next();
		}
	}

	/** Joins all remaining tokens, appending one space after each (including the last). */
	private static String joinRemaining(Scanner source) {
		StringBuilder joined = new StringBuilder();
		while (source.hasNext()) {
			joined.append(source.next()).append(" ");
		}
		return joined.toString();
	}

	/** Splits the second space-separated token of {@code attributeLine} on commas. */
	private static ArrayList<String> parseGenEds(String attributeLine) {
		Scanner fields = tokens(attributeLine);
		fields.next(); // leading label token
		Scanner codes = new Scanner(fields.next()).useDelimiter(",");
		ArrayList<String> parsed = new ArrayList<String>();
		while (codes.hasNext()) {
			parsed.add(codes.next());
		}
		return parsed;
	}

}