// HTMLParser.java 3.17 KB
package com.upc.pbe.upcnews;

import java.util.ArrayList;

import android.util.Log;

/**
 * Parses HTML listings looking for linked directories and {@code .m3u8} playlists.
 *
 * <p>The parser works line by line: at most one {@code href} (the first quoted one
 * found on a line that contains an {@code <a } tag) is taken from each line, and
 * lines belonging to an HTML comment that starts at the beginning of a line are
 * ignored.
 */
public class HTMLParser {

	private static final String TAG = "HTMLParser";
	private static final String COMMENT_OPEN = "<!--";
	private static final String COMMENT_CLOSE = "-->";
	private static final String ANCHOR_OPEN = "<a ";
	private static final String HREF = "href=";
	private static final String PLAYLIST_EXTENSION = ".m3u8";

	/** Resources found by the last call to {@link #parse(String)}. */
	private final ArrayList<String> resources;
	/** Scheme and host of the page URL (no trailing slash); prefixed to root-relative directories. */
	private final String server;

	/**
	 * Creates a parser for pages served from the given URL.
	 *
	 * @param u URL of the page being parsed, e.g. {@code http://host/some/path};
	 *          everything from the first {@code /} after the scheme separator is
	 *          discarded to obtain the server prefix
	 */
	public HTMLParser(String u){
		server = extractServer(u);
		resources = new ArrayList<String>();
	}

	/**
	 * Parses the given HTML, collecting directories first and playlists second.
	 *
	 * @param code HTML source; {@code null} is treated as empty
	 * @return the parser's internal list, emptied and refilled on every call
	 */
	public ArrayList<String> parse(String code){
		resources.clear();
		parseDirectories(resources, code);
		parseFiles(resources, code);
		return resources;
	}

	/**
	 * Adds to {@code resources} the directory part of every link found in {@code code}.
	 *
	 * <p>The directory part is the href up to and including its last {@code /}; hrefs
	 * without any {@code /} are ignored. Directories starting with {@code /} are
	 * prefixed with the server. Entries already in the list are not added again.
	 *
	 * @param resources list that receives the directories found
	 * @param code      HTML source; {@code null} is treated as empty
	 */
	public void parseDirectories(ArrayList<String> resources, String code){
		for (String line : linesOutsideComments(code)) {
			String href = extractHref(line);
			if (href == null) {
				continue;
			}
			int lastSlash = href.lastIndexOf('/');
			if (lastSlash == -1) {
				// A bare file name: there is no directory component to report.
				continue;
			}
			String dirpath = href.substring(0, lastSlash + 1);
			if (dirpath.startsWith("/")) {
				dirpath = server + dirpath;
			}
			if (!resources.contains(dirpath)) {
				resources.add(dirpath);
			}
			Log.d(TAG, "DIRECTORI TROBAT: " + dirpath);
		}
	}

	/**
	 * Adds to {@code resources} every linked {@code .m3u8} playlist found in {@code code}.
	 *
	 * <p>Playlists whose file name (the part after the last {@code /}) starts with a
	 * dot are considered hidden and skipped. The href is stored exactly as written
	 * in the page. Entries already in the list are not added again.
	 *
	 * @param resources list that receives the playlists found
	 * @param code      HTML source; {@code null} is treated as empty
	 */
	public void parseFiles(ArrayList<String> resources, String code){
		for (String line : linesOutsideComments(code)) {
			String filepath = extractHref(line);
			if (filepath == null || !filepath.endsWith(PLAYLIST_EXTENSION)) {
				continue;
			}
			// lastIndexOf yields -1 when there is no slash, so +1 gives the whole path.
			String filename = filepath.substring(filepath.lastIndexOf('/') + 1);
			if (filename.startsWith(".")) {
				continue;
			}
			if (!resources.contains(filepath)) {
				resources.add(filepath);
			}
			Log.d(TAG, "PLAYLIST TROBADA: " + filepath);
		}
	}

	/** Returns the scheme-and-host prefix of {@code url}, or the whole URL when it has no path. */
	private static String extractServer(String url) {
		if (url == null) {
			return "";
		}
		int schemeEnd = url.indexOf("://");
		int hostStart = (schemeEnd == -1) ? 0 : schemeEnd + 3;
		int pathStart = url.indexOf('/', hostStart);
		if (pathStart == -1) {
			return url;
		}
		return url.substring(0, pathStart);
	}

	/** Splits {@code code} on newlines and drops the lines that belong to an HTML comment. */
	private static ArrayList<String> linesOutsideComments(String code) {
		ArrayList<String> kept = new ArrayList<String>();
		if (code == null) {
			return kept;
		}
		boolean inComment = false;
		String[] lines = code.split("\n");
		for (int i = 0; i < lines.length; i++) {
			// Trim so indentation and a trailing '\r' do not hide the comment markers.
			String trimmed = lines[i].trim();
			int searchFrom = 0;
			if (!inComment && trimmed.startsWith(COMMENT_OPEN)) {
				inComment = true;
				// Look for the terminator after the opener so they cannot overlap.
				searchFrom = COMMENT_OPEN.length();
			}
			if (inComment) {
				if (trimmed.indexOf(COMMENT_CLOSE, searchFrom) != -1) {
					inComment = false;
				}
				continue;
			}
			kept.add(lines[i]);
		}
		return kept;
	}

	/**
	 * Returns the value of the first quoted {@code href} attribute on a line that
	 * contains an anchor tag, or {@code null} when there is none or it is malformed.
	 */
	private static String extractHref(String line) {
		if (line.indexOf(ANCHOR_OPEN) == -1) {
			return null;
		}
		int attr = line.indexOf(HREF);
		while (attr != -1) {
			int quotePos = attr + HREF.length();
			if (quotePos < line.length()) {
				char quote = line.charAt(quotePos);
				if (quote == '"' || quote == '\'') {
					int end = line.indexOf(quote, quotePos + 1);
					if (end == -1) {
						// Opening quote without a closing one on this line.
						return null;
					}
					return line.substring(quotePos + 1, end);
				}
			}
			// Unquoted occurrence (or "href=" inside another attribute name): keep looking.
			attr = line.indexOf(HREF, quotePos);
		}
		return null;
	}
}