## Index gene_info by (tax_id, GeneID) so it can be joined with gene2refseq.
## table_cont maps each key to [symbol, chromosome, map_location];
## infoGenes holds the same keys, so genes never matched later can be found.
table_cont = {}
infoGenes = set()

## gene_info: 14,023,138 lines -- the first two lines are skipped.
## columns used:
##   0: tax_id
##   1: GeneID
##   2: symbol
##   6: chromosome
##   7: map_location
with open("gene_info", "r") as infoFile:
    for _ in range(2):
        next(infoFile)
    for line in infoFile:
        cols = line.split("\t")
        gene_key = (cols[0], cols[1])
        ## Stored as a list (not a tuple): the final pass extends it in place.
        table_cont[gene_key] = [cols[2], cols[6], cols[7]]
        infoGenes.add(gene_key)
        
## Join every gene2refseq row with its gene_info entry and write the result.
## gene2refseq: 22,162,843 lines -- the first (header) line is skipped.
## columns used:
##   0: tax_id
##   1: GeneID
##   2: status
##   5: protein_accession_version
##   6: protein_gi
##   9: start_pos
##   10: end_pos
## Output row layout (tab-separated):
##   tax_id, GeneID, symbol, chromosome, map_location,
##   status, protein_accession_version, protein_gi, start_pos, end_pos
with open("gene2refseq", "r") as infoFile:
    with open("gene_comb_table.csv", "w") as out:
        next(infoFile)
        for row in infoFile:
            rSplit = row.split("\t")
            ## Build the join key once; it is reused for lookup and bookkeeping.
            gene_key = (rSplit[0], rSplit[1])
            refseq_fields = [
                rSplit[2],   # status
                rSplit[5],   # protein_accession_version
                rSplit[6],   # protein_gi
                rSplit[9],   # start_pos
                rSplit[10],  # end_pos
            ]

            info_fields = table_cont.get(gene_key)
            if info_fields is None:
                ## gene was not in first table: fill its three columns with "-"
                info_fields = ["-", "-", "-"]
            else:
                ## gene was already in first table: mark it as matched so the
                ## final pass does not write it a second time. discard() is a
                ## no-op when an earlier row already removed the key.
                infoGenes.discard(gene_key)

            ## Concatenation creates a fresh list, so the entry kept in
            ## table_cont is never modified here.
            out_fields = [gene_key[0], gene_key[1]] + info_fields + refseq_fields
            out.write("\t".join(out_fields) + "\n")

## Append the genes that were only in the first table (gene_info) and were
## never matched by a gene2refseq row. Their five gene2refseq columns are
## filled with "-" so every output row has the same number of fields.
with open("gene_comb_table.csv", "a") as out:
    for gene_key in infoGenes:
        fields = table_cont[gene_key]
        if len(fields) < 8:
            ## Entries created by the gene_info pass hold three values, so
            ## this padding is expected to be needed for every remaining gene.
            ## Function-call form of print works on both Python 2 and 3; the
            ## former print statement was a SyntaxError on Python 3.
            print("yes, needed")
            ## gene was only in first table, not in second
            fields.extend(["-"] * 5)
        ## Prepend tax_ID and geneID (in that order) to the stored entry.
        fields[:0] = [gene_key[0], gene_key[1]]
        out.write("\t".join(fields) + "\n")
